In [20]:
import pandas as pd

doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# pulls text IDs, country codes, document types, and word counts from the excel sheet,
# using it to divide the documents into a dictionary by ID
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    with open(f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt", 'r',
              encoding="utf-8") as file:
        # add each text_id to id_dict
        if f"{country_code}_{doc_type}" not in id_dict:
            id_dict[f"{country_code}_{doc_type}"] = [text_id]
        else:
            id_dict[f"{country_code}_{doc_type}"].append(text_id)
        # makes country code set
        country_set.add(country_code)
        # finds correct text_id and adds every line in the document to the dictionary
        IS_DOC = False
        lines = file.readlines()
        for i, line in enumerate(lines):
            if line.strip().startswith(f"##{text_id}"):
                IS_DOC = True
            elif line.strip().startswith("##"):
                IS_DOC = False
            if IS_DOC:
                if text_id not in doc_dict:
                    doc_dict[text_id] = [w.lower() for w in line.split()]
                else:
                    doc_dict[text_id] += [w.lower() for w in line.split()]
        # adds word count to dictionary
        word_count_dict[text_id] = word_count

In [21]:
import copy
from collections import Counter

# make a counter for every word in the corpus
vocab = Counter({})
vocab['<UNK>'] = 0
for doc in doc_dict.values():
    for word in doc:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

# make a dictionary of sets for every country and record every word used by each country
# also make a word count for every country
vocab_sets = {country_code:set() for country_code in country_set}
country_word_counts = Counter({country_code:0 for country_code in country_set})
for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        for word in doc_dict[text_id]:
            vocab_sets[country_code].add(word)
        country_word_counts[country_code] += word_count_dict[text_id]

# make new vocabulary sets, removing words that appear in less than
# 12.5%, 25%, 37.5%, and 50% of the countries datasets respectively
vocab_25 = vocab.copy()
vocab_50 = vocab.copy()
for word in vocab:
    COUNTRY_COUNT = 0
    for country_code in country_set:
        if word in vocab_sets[country_code]:
            COUNTRY_COUNT+=1
    if COUNTRY_COUNT / len(country_set) < 0.25:
        del vocab_25[word]
    if COUNTRY_COUNT / len(country_set) < 0.5:
        del vocab_50[word]

# Replace any words that appear in less than 25% of the countries’ datasets with the <UNK> token
doc_dict_25 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_25.items():
    for i, word in enumerate(doc):
        if word not in vocab_25:
            doc_dict_25[text_id][i] = '<UNK>'
            vocab_25['<UNK>'] += 1

# Replace any words that appear in less than 50% of the countries’ datasets with the <UNK> token
doc_dict_50 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_50.items():
    for i, word in enumerate(doc):
        if word not in vocab_50:
            doc_dict_50[text_id][i] = '<UNK>'
            vocab_50['<UNK>'] += 1

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# make a dataframe of one-hot representations of each text in each version of the dataset with their country labels
text_ids = []
texts = []
texts_25 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        text_ids.append(text_ids)
        texts.append(" ".join(doc_dict[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

data = {
    'text_id': text_ids,
    'texts': texts,
    'texts_25': texts_25,
    'texts_50': texts_50,
    'country_labels': country_labels
}

df = pd.DataFrame(data)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texts'])
X_25 = vectorizer.fit_transform(df['texts_25'])
X_50 = vectorizer.fit_transform(df['texts_50'])
y = df['country_labels']

In [23]:
def probability_breakdown(model, X_test, y_test):
    # makes a dictionary of probability Counters for every country label for each country's documents
    most_probable_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
    for test_num, probabilities in enumerate(model.predict_proba(X_test)):
        for i, probability in enumerate(probabilities):
            most_probable_country[y_test.tolist()[test_num]][model.classes_[i]] += probability  

    return most_probable_country

In [24]:
def average_tries(model, X_test, y_test, print_output=False):
    """
    Takes the model and testing data.
    Returns average tries needed to return the right label,
    average tries per country, and number of tests per country.
    """
    
    # Matches each country label to its probability in a dataframe
    df = pd.DataFrame(model.predict_proba(X_test).tolist(), columns=model.classes_.tolist())

    # Sorts each probability distribution for highest probability
    # and records the number of iterations needed to get to the right label
    country_try_count = {c:0 for c in country_set}
    country_test_count = Counter(y_test)
    for i in range(len(df)):
        for try_count, (country_code, _) in enumerate(sorted(df.iloc[i].to_dict().items(), key=lambda item: item[1], reverse=True)):
            if country_code == y_test.tolist()[i]:
                country_try_count[country_code] += try_count+1
                break
    
    # Averages the country try count by number of tests
    country_try_count = {country:country_try_count[country]/country_test_count[country] for country in country_try_count}
    # Computes overal average try count
    avg_tries = sum(country_try_count.values())/len(country_try_count)
    
    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
    else:
        return avg_tries, country_try_count, country_test_count

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

def average_performance(model, X_dataset, random_states=5):
    """
    Takes the model, dataset, and number of random test splits to test.
    Prints average accuracy and tries between all test splits of the model.
    """
    accuracies = []
    tries = []
    avg_country_try_count = {c:0 for c in country_set}
    total_country_test_count = {c:0 for c in country_set}
    avg_most_probable_country = {c:Counter({c:0 for c in country_set}) for c in country_set}

    # trains models on the specified number of random splits
    for r in range(random_states):
        X_train, X_test, y_train, y_test = train_test_split(X_dataset, y, test_size=0.2, random_state=r)
        model.fit(X_train, y_train)
        # computes accuracies
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        # computes overall and country try counts
        avg_tries, country_try_count, country_test_count = average_tries(model, X_test, y_test)
        tries.append(avg_tries)
        # computes similar countries
        most_probable_country = probability_breakdown(model, X_test, y_test)
        for country in country_try_count:
            avg_country_try_count[country] += country_try_count[country]
            total_country_test_count[country] += country_test_count[country]
            avg_most_probable_country[country] += most_probable_country[country]
        print(".",end=" ")
    print()
    
    print("Average Accuracy:", np.mean(accuracies))
    print("Averaged Average tries:", np.mean(tries))
    # Sorts country try counts and prints them in order
    avg_country_try_count = {country:avg_country_try_count[country]/(random_states) for country in avg_country_try_count}
    for country, try_count in sorted(avg_country_try_count.items(), key=lambda item: item[1]):
        most_similar_five = " ".join([country for country, _ in avg_most_probable_country[country].most_common(5)])
        print(f"{country}: {try_count:4.1f} tries", f"({total_country_test_count[country]:3d} tests) | {most_similar_five}")

In [26]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model with class weights balanced
model = LogisticRegression(max_iter=2000, class_weight='balanced')
average_performance(model, X)
average_performance(model, X_25)
average_performance(model, X_50)

. . . . . 
Average Accuracy: 0.31878787878787884
Averaged Average tries: 7.1662871208676835
GB:  2.5 tries (328 tests) | GB US AU CA IN
US:  2.7 tries (342 tests) | US GB CA AU NZ
AU:  4.7 tries (129 tests) | AU US GB CA NZ
IE:  5.0 tries (101 tests) | IE GB US PH AU
CA:  5.3 tries (119 tests) | GB US CA AU IE
GH:  5.5 tries ( 28 tests) | GH AU US GB CA
PK:  5.5 tries ( 45 tests) | PK US GB IN AU
NZ:  5.7 tries ( 63 tests) | NZ GB US PH AU
LK:  6.0 tries ( 37 tests) | LK GB IN US IE
IN:  6.7 tries ( 96 tests) | GB IN US CA AU
BD:  7.6 tries ( 34 tests) | BD GB US IN IE
KE:  8.0 tries ( 43 tests) | KE US GB JM CA
NG:  8.6 tries ( 37 tests) | GB US NG AU HK
HK:  8.7 tries ( 32 tests) | US GB HK AU CA
TZ:  8.9 tries ( 27 tests) | GB TZ US IN JM
MY:  9.8 tries ( 39 tests) | GB US MY SG CA
SG:  9.9 tries ( 36 tests) | US SG GB MY AU
ZA: 10.3 tries ( 43 tests) | US AU CA GB ZA
JM: 10.8 tries ( 34 tests) | GB US IE JM CA
PH: 11.1 tries ( 37 tests) | GB US AU CA KE
. . . . . 
Average Accuracy:

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with class weights balanced
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
average_performance(rf_model, X)
average_performance(rf_model, X_25)
average_performance(rf_model, X_50)

. . . . . 
Average Accuracy: 0.32727272727272727
Averaged Average tries: 6.291609257495978
GB:  1.4 tries (328 tests) | GB US AU CA IE
US:  1.6 tries (342 tests) | US GB CA AU IN
AU:  3.2 tries (129 tests) | GB US AU CA NZ
CA:  3.7 tries (119 tests) | US GB CA AU IN
IE:  3.7 tries (101 tests) | GB US IE AU CA
LK:  3.9 tries ( 37 tests) | LK GB US IN AU
NZ:  4.2 tries ( 63 tests) | GB US NZ AU CA
IN:  5.3 tries ( 96 tests) | GB US IN AU CA
NG:  5.3 tries ( 37 tests) | GB US NG AU CA
BD:  6.7 tries ( 34 tests) | GB US BD IN CA
GH:  6.8 tries ( 28 tests) | GB US GH AU ZA
PK:  7.1 tries ( 45 tests) | GB US PK AU CA
SG:  7.5 tries ( 36 tests) | US GB SG AU CA
KE:  8.1 tries ( 43 tests) | GB US AU CA KE
JM:  8.3 tries ( 34 tests) | GB US JM CA AU
HK:  9.1 tries ( 32 tests) | GB US AU HK CA
TZ:  9.2 tries ( 27 tests) | GB US AU TZ CA
MY:  9.5 tries ( 39 tests) | GB US CA MY AU
PH: 10.2 tries ( 37 tests) | US GB AU CA PH
ZA: 11.1 tries ( 43 tests) | GB US AU CA ZA
. . . . . 
Average Accuracy: 

In [28]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model
mlp_model = MLPClassifier(random_state=3)
average_performance(mlp_model,X)
average_performance(mlp_model,X_25)
average_performance(mlp_model,X_50)

. . . . . 
Average Accuracy: 0.33272727272727276
Averaged Average tries: 7.3105587667934255
GB:  2.0 tries (328 tests) | GB US CA AU IN
US:  2.2 tries (342 tests) | US GB CA AU IE
CA:  3.4 tries (119 tests) | US GB CA AU PH
AU:  3.7 tries (129 tests) | GB US AU CA PH
IE:  5.2 tries (101 tests) | GB US IE AU CA
NZ:  5.7 tries ( 63 tests) | GB US NZ AU CA
PK:  6.3 tries ( 45 tests) | GB US PK CA PH
IN:  6.5 tries ( 96 tests) | GB US IN AU CA
LK:  6.8 tries ( 37 tests) | GB LK US AU CA
PH:  6.8 tries ( 37 tests) | US GB CA AU PH
TZ:  6.9 tries ( 27 tests) | GB US CA AU IE
KE:  7.3 tries ( 43 tests) | US GB CA AU PH
BD:  7.4 tries ( 34 tests) | GB BD US AU CA
SG:  8.4 tries ( 36 tests) | GB US AU SG CA
HK:  8.9 tries ( 32 tests) | US HK CA GB AU
JM:  9.2 tries ( 34 tests) | GB US JM CA AU
NG: 11.3 tries ( 37 tests) | GB US CA AU NG
GH: 11.9 tries ( 28 tests) | GB AU US CA GH
MY: 12.9 tries ( 39 tests) | GB US CA AU IN
ZA: 13.5 tries ( 43 tests) | US CA GB AU JM
. . . . . 
Average Accuracy: