In [16]:
import pandas as pd

doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# pulls text IDs, country codes, document types, and word counts from the excel sheet,
# using it to divide the documents into a dictionary by ID
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    with open(f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt", 'r',
              encoding="utf-8") as file:
        # add each text_id to id_dict
        if f"{country_code}_{doc_type}" not in id_dict:
            id_dict[f"{country_code}_{doc_type}"] = [text_id]
        else:
            id_dict[f"{country_code}_{doc_type}"].append(text_id)
        # makes country code set
        country_set.add(country_code)
        # finds correct text_id and adds every line in the document to the dictionary
        IS_DOC = False
        lines = file.readlines()
        for i, line in enumerate(lines):
            if line.strip().startswith(f"##{text_id}"):
                IS_DOC = True
            elif line.strip().startswith("##"):
                IS_DOC = False
            if IS_DOC:
                if text_id not in doc_dict:
                    doc_dict[text_id] = [w.lower() for w in line.split()]
                else:
                    doc_dict[text_id] += [w.lower() for w in line.split()]
        # adds word count to dictionary
        word_count_dict[text_id] = word_count

In [17]:
import copy
from collections import Counter

# make a counter for every word in the corpus
vocab = Counter({})
vocab['<UNK>'] = 0
for doc in doc_dict.values():
    for word in doc:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

# make a dictionary of sets for every country and record every word used by each country
# also make a word count for every country
vocab_sets = {country_code:set() for country_code in country_set}
country_word_counts = Counter({country_code:0 for country_code in country_set})
for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        for word in doc_dict[text_id]:
            vocab_sets[country_code].add(word)
        country_word_counts[country_code] += word_count_dict[text_id]

# make new vocabulary sets, removing words that appear in less than
# 12.5%, 25%, 37.5%, and 50% of the countries datasets respectively
vocab_25 = vocab.copy()
vocab_50 = vocab.copy()
for word in vocab:
    COUNTRY_COUNT = 0
    for country_code in country_set:
        if word in vocab_sets[country_code]:
            COUNTRY_COUNT+=1
    if COUNTRY_COUNT / len(country_set) < 0.25:
        del vocab_25[word]
    if COUNTRY_COUNT / len(country_set) < 0.5:
        del vocab_50[word]

# Replace any words that appear in less than 25% of the countries’ datasets with the <UNK> token
doc_dict_25 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_25.items():
    for i, word in enumerate(doc):
        if word not in vocab_25:
            doc_dict_25[text_id][i] = '<UNK>'
            vocab_25['<UNK>'] += 1

# Replace any words that appear in less than 50% of the countries’ datasets with the <UNK> token
doc_dict_50 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_50.items():
    for i, word in enumerate(doc):
        if word not in vocab_50:
            doc_dict_50[text_id][i] = '<UNK>'
            vocab_50['<UNK>'] += 1

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# make a dataframe of one-hot representations of each text in each version of the dataset with their country labels
text_ids = []
texts = []
texts_25 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        text_ids.append(text_ids)
        texts.append(" ".join(doc_dict[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

data = {
    'text_id': text_ids,
    'texts': texts,
    'texts_25': texts_25,
    'texts_50': texts_50,
    'country_labels': country_labels
}

df = pd.DataFrame(data)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texts'])
X_25 = vectorizer.fit_transform(df['texts_25'])
X_50 = vectorizer.fit_transform(df['texts_50'])
y = df['country_labels']

In [19]:
def average_tries(model, X_test, y_test, print_output=False):
    """
    Takes the model and testing data.
    Returns average tries needed to return the right label,
    average tries per country, and number of tests per country.
    """
    
    # Matches each country label to its probability in a dataframe
    df = pd.DataFrame(model.predict_proba(X_test).tolist(), columns=model.classes_.tolist())

    # Sorts each probability distribution for highest probability
    # and records the number of iterations needed to get to the right label
    country_try_count = {c:0 for c in country_set}
    country_test_count = Counter(y_test)
    for i in range(len(df)):
        for try_count, (country_code, _) in enumerate(sorted(df.iloc[i].to_dict().items(), key=lambda item: item[1], reverse=True)):
            if country_code == y_test.tolist()[i]:
                country_try_count[country_code] += try_count+1
                break
    
    # Averages the country try count by number of tests
    country_try_count = {country:country_try_count[country]/country_test_count[country] for country in country_try_count}
    # Computes overal average try count
    avg_tries = sum(country_try_count.values())/len(country_try_count)
    
    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
    else:
        return avg_tries, country_try_count, country_test_count

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

def average_performance(model, X_dataset, random_states=5):
    """
    Takes the model, dataset, and number of random test splits to test.
    Prints average accuracy and tries between all test splits of the model.
    """
    accuracies = []
    tries = []
    avg_country_try_count = {c:0 for c in country_set}
    total_country_test_count = {c:0 for c in country_set}

    # trains models on the specified number of random splits
    for r in range(random_states):
        X_train, X_test, y_train, y_test = train_test_split(X_dataset, y, test_size=0.2, random_state=r)
        model.fit(X_train, y_train)
        # computes accuracies
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        # computes overall and country try counts
        avg_tries, country_try_count, country_test_count = average_tries(model, X_test, y_test)
        tries.append(avg_tries)
        for country in country_try_count:
            avg_country_try_count[country] += country_try_count[country]
            total_country_test_count[country] += country_test_count[country]
        print(".",end=" ")
    print()
    
    print("Average Accuracy:", np.mean(accuracies))
    print("Averaged Average tries:", np.mean(tries))
    # Sorts country try counts and prints them in order
    avg_country_try_count = {country:avg_country_try_count[country]/(random_states) for country in avg_country_try_count}
    for country, try_count in sorted(avg_country_try_count.items(), key=lambda item: item[1]):
        print(country, f"{try_count:.1f} tries", f"({total_country_test_count[country]} tests)")

In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model with class weights balanced
model = LogisticRegression(max_iter=2000, class_weight='balanced')
average_performance(model, X)
average_performance(model, X_25)
average_performance(model, X_50)

. . . . . 
Average Accuracy: 0.28727272727272724
Averaged Average tries: 7.286789866952094
GB 2.8 tries (326 tests)
US 3.1 tries (332 tests)
IE 4.5 tries (82 tests)
AU 5.0 tries (140 tests)
CA 5.5 tries (120 tests)
IN 5.6 tries (75 tests)
BD 6.2 tries (38 tests)
KE 6.4 tries (36 tests)
NZ 6.4 tries (67 tests)
PK 7.1 tries (55 tests)
ZA 7.5 tries (35 tests)
GH 8.0 tries (33 tests)
JM 8.1 tries (33 tests)
TZ 8.6 tries (41 tests)
LK 9.2 tries (48 tests)
NG 9.5 tries (37 tests)
HK 9.7 tries (40 tests)
SG 10.1 tries (37 tests)
MY 10.9 tries (34 tests)
PH 11.5 tries (41 tests)
. . . . . 
Average Accuracy: 0.26181818181818184
Averaged Average tries: 7.461764976630898
GB 2.9 tries (326 tests)
US 3.3 tries (332 tests)
IE 4.9 tries (82 tests)
AU 5.0 tries (140 tests)
IN 5.5 tries (75 tests)
BD 5.9 tries (38 tests)
CA 6.0 tries (120 tests)
NZ 6.4 tries (67 tests)
KE 6.5 tries (36 tests)
PK 7.3 tries (55 tests)
GH 8.2 tries (33 tests)
ZA 8.2 tries (35 tests)
TZ 8.4 tries (41 tests)
JM 9.5 tries (3

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with class weights balanced
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
average_performance(rf_model, X)
average_performance(rf_model, X_25)
average_performance(rf_model, X_50)

. . . . . 
Average Accuracy: 0.333939393939394
Averaged Average tries: 6.543373392699858
GB 1.4 tries (326 tests)
US 1.5 tries (332 tests)
IE 2.9 tries (82 tests)
AU 3.4 tries (140 tests)
CA 3.7 tries (120 tests)
IN 4.3 tries (75 tests)
JM 5.3 tries (33 tests)
BD 5.5 tries (38 tests)
NZ 6.5 tries (67 tests)
LK 6.6 tries (48 tests)
PK 7.4 tries (55 tests)
GH 8.1 tries (33 tests)
NG 8.2 tries (37 tests)
HK 8.5 tries (40 tests)
KE 8.5 tries (36 tests)
SG 8.9 tries (37 tests)
ZA 9.1 tries (35 tests)
MY 9.5 tries (34 tests)
TZ 10.4 tries (41 tests)
PH 11.2 tries (41 tests)
. . . . . 
Average Accuracy: 0.3381818181818182
Averaged Average tries: 6.8401225166525705
GB 1.5 tries (326 tests)
US 1.5 tries (332 tests)
IE 3.3 tries (82 tests)
AU 3.4 tries (140 tests)
CA 4.3 tries (120 tests)
IN 4.4 tries (75 tests)
BD 5.1 tries (38 tests)
NZ 6.4 tries (67 tests)
SG 7.1 tries (37 tests)
GH 7.3 tries (33 tests)
LK 7.6 tries (48 tests)
PK 7.9 tries (55 tests)
NG 8.0 tries (37 tests)
KE 8.8 tries (36 t

In [23]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model
mlp_model = MLPClassifier(random_state=3)
average_performance(mlp_model,X)
average_performance(mlp_model,X)
average_performance(mlp_model,X)

. . . . . 
Average Accuracy: 0.3115151515151515
Averaged Average tries: 7.331611124639485
GB 2.0 tries (326 tests)
US 2.5 tries (332 tests)
CA 3.5 tries (120 tests)
IN 4.2 tries (75 tests)
IE 4.3 tries (82 tests)
AU 4.3 tries (140 tests)
JM 5.2 tries (33 tests)
NZ 7.3 tries (67 tests)
HK 7.9 tries (40 tests)
SG 8.1 tries (37 tests)
PH 8.2 tries (41 tests)
PK 8.6 tries (55 tests)
LK 8.7 tries (48 tests)
TZ 8.9 tries (41 tests)
BD 9.0 tries (38 tests)
KE 9.3 tries (36 tests)
ZA 10.4 tries (35 tests)
MY 11.2 tries (34 tests)
NG 11.3 tries (37 tests)
GH 11.9 tries (33 tests)
. . . . . 
Average Accuracy: 0.3115151515151515
Averaged Average tries: 7.331611124639485
GB 2.0 tries (326 tests)
US 2.5 tries (332 tests)
CA 3.5 tries (120 tests)
IN 4.2 tries (75 tests)
IE 4.3 tries (82 tests)
AU 4.3 tries (140 tests)
JM 5.2 tries (33 tests)
NZ 7.3 tries (67 tests)
HK 7.9 tries (40 tests)
SG 8.1 tries (37 tests)
PH 8.2 tries (41 tests)
PK 8.6 tries (55 tests)
LK 8.7 tries (48 tests)
TZ 8.9 tries (41