In [3]:
import pandas as pd

doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# pulls text IDs, country codes, document types, and word counts from the excel sheet,
# using it to divide the documents into a dictionary by ID
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    with open(f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt", 'r',
              encoding="utf-8") as file:
        # add each text_id to id_dict
        if f"{country_code}_{doc_type}" not in id_dict:
            id_dict[f"{country_code}_{doc_type}"] = [text_id]
        else:
            id_dict[f"{country_code}_{doc_type}"].append(text_id)
        # makes country code set
        country_set.add(country_code)
        # finds correct text_id and adds every line in the document to the dictionary
        IS_DOC = False
        lines = file.readlines()
        for i, line in enumerate(lines):
            if line.strip().startswith(f"##{text_id}"):
                IS_DOC = True
            elif line.strip().startswith("##"):
                IS_DOC = False
            if IS_DOC:
                if text_id not in doc_dict:
                    doc_dict[text_id] = [w.lower() for w in line.split()]
                else:
                    doc_dict[text_id] += [w.lower() for w in line.split()]
        # adds word count to dictionary
        word_count_dict[text_id] = word_count

In [4]:
import copy
from collections import Counter

# make a counter for every word in the corpus
vocab = Counter({})
vocab['<UNK>'] = 0
for doc in doc_dict.values():
    for word in doc:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

# make a dictionary of sets for every country and record every word used by each country
# also make a word count for every country
vocab_sets = {country_code:set() for country_code in country_set}
country_word_counts = Counter({country_code:0 for country_code in country_set})
for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        for word in doc_dict[text_id]:
            vocab_sets[country_code].add(word)
        country_word_counts[country_code] += word_count_dict[text_id]

# make new vocabulary sets, removing words that appear in less than
# 25% and 50% of the countries datasets
vocab_25 = vocab.copy()
vocab_50 = vocab.copy()
for word in vocab:
    COUNTRY_COUNT = 0
    for country_code in country_set:
        if word in vocab_sets[country_code]:
            COUNTRY_COUNT+=1
    if COUNTRY_COUNT / len(country_set) < 0.25:
        del vocab_25[word]
    if COUNTRY_COUNT / len(country_set) < 0.5:
        del vocab_50[word]

# Replace any words that appear in less than 25% of the countries’ datasets with the <UNK> token
doc_dict_25 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_25.items():
    for i, word in enumerate(doc):
        if word not in vocab_25:
            doc_dict_25[text_id][i] = '<UNK>'
            vocab_25['<UNK>'] += 1

# Replace any words that appear in less than 50% of the countries’ datasets with the <UNK> token
doc_dict_50 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_50.items():
    for i, word in enumerate(doc):
        if word not in vocab_50:
            doc_dict_50[text_id][i] = '<UNK>'
            vocab_50['<UNK>'] += 1

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# make a dataframe with each version of the dataset with their country labels
texts = []
texts_25 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        texts.append(" ".join(doc_dict[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

data = {
    'texts': texts,
    'texts_25': texts_25,
    'texts_50': texts_50,
    'country_labels': country_labels
}

df = pd.DataFrame(data)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texts'])
X_25 = vectorizer.fit_transform(df['texts_25'])
X_50 = vectorizer.fit_transform(df['texts_50'])
y = df['country_labels']

In [6]:
def probability_breakdown(model, X_test, y_test):
    # makes a dictionary of probability Counters for every country label for each country's documents
    most_probable_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
    for test_num, probabilities in enumerate(model.predict_proba(X_test)):
        for i, probability in enumerate(probabilities):
            most_probable_country[y_test.tolist()[test_num]][model.classes_[i]] += probability  

    return most_probable_country

In [7]:
def average_tries(model, X_test, y_test, print_output=False):
    """
    Takes the model and testing data.
    Returns average tries needed to return the right label,
    average tries per country, and number of tests per country.
    """
    
    # Matches each country label to its probability in a dataframe
    df = pd.DataFrame(model.predict_proba(X_test).tolist(), columns=model.classes_.tolist())

    # Sorts each probability distribution for highest probability
    # and records the number of iterations needed to get to the right label
    country_try_count = {c:0 for c in country_set}
    country_test_count = Counter(y_test)
    for i in range(len(df)):
        for try_count, (country_code, _) in enumerate(sorted(df.iloc[i].to_dict().items(), key=lambda item: item[1], reverse=True)):
            if country_code == y_test.tolist()[i]:
                country_try_count[country_code] += try_count+1
                break
    
    # Averages the country try count by number of tests
    country_try_count = {country:country_try_count[country]/country_test_count[country] for country in country_try_count}
    # Computes overal average try count
    avg_tries = sum(country_try_count.values())/len(country_try_count)
    
    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
    else:
        return avg_tries, country_try_count, country_test_count

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import random

def average_performance(model, X_dataset, y_dataset = y, random_states=10, undersampling=False, scrambling=False):
    """
    Takes the model, dataset, and number of random test splits to test.
    Prints average accuracy and tries between all test splits of the model.
    """
    accuracies = []
    tries = []
    avg_country_try_count = {c:0 for c in country_set}
    total_country_test_count = {c:0 for c in country_set}
    avg_most_probable_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
    
    if scrambling:
        random.seed = 3
        y_scrambled = y_dataset.tolist()
        random.shuffle(y_scrambled)
        y_dataset = pd.Series(y_scrambled)

    # trains models on the specified number of random splits
    for r in range(random_states):
        X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.2, random_state=r)
        # undersampling
        if undersampling:
            X_train, y_train = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
        model.fit(X_train, y_train)
        # computes accuracies
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        # computes overall and country try counts
        avg_tries, country_try_count, country_test_count = average_tries(model, X_test, y_test)
        tries.append(avg_tries)
        # computes similar countries
        most_probable_country = probability_breakdown(model, X_test, y_test)
        for country in country_try_count:
            avg_country_try_count[country] += country_try_count[country]
            total_country_test_count[country] += country_test_count[country]
            avg_most_probable_country[country] += most_probable_country[country]
        print(".",end=" ")
    print()
    
    print("Average Accuracy:", np.mean(accuracies))
    print("Averaged Average tries:", np.mean(tries))
    # Sorts country try counts and prints them in order
    avg_country_try_count = {country:avg_country_try_count[country]/(random_states) for country in avg_country_try_count}
    for country, try_count in sorted(avg_country_try_count.items(), key=lambda item: item[1]):
        most_similar_five = " ".join([country for country, _ in avg_most_probable_country[country].most_common(5)])
        print(f"{country}: {try_count:4.1f} tries", f"({total_country_test_count[country]:3d} tests) | {most_similar_five}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model with class weights balanced
model = LogisticRegression(max_iter=2000, class_weight='balanced')
average_performance(model, X)
average_performance(model, X_25)
average_performance(model, X_50)

. . . . . . . . . . 
Average Accuracy: 0.3115151515151516
Averaged Average tries: 7.4150612910582625
GB:  2.5 tries (663 tests) | GB US AU CA IN
US:  2.7 tries (685 tests) | US GB AU CA NZ
AU:  4.4 tries (272 tests) | AU GB US CA NZ
IE:  5.1 tries (185 tests) | IE GB US PH CA
CA:  5.2 tries (226 tests) | US GB CA AU IE
NZ:  6.0 tries (121 tests) | NZ GB US AU PH
GH:  6.3 tries ( 64 tests) | GH GB US AU CA
IN:  6.3 tries (186 tests) | GB US IN CA NZ
PK:  6.4 tries ( 92 tests) | PK US GB AU IN
LK:  6.9 tries ( 82 tests) | LK GB US IN AU
KE:  7.4 tries ( 81 tests) | KE US GB JM CA
BD:  8.2 tries ( 64 tests) | BD GB US IE IN
ZA:  9.4 tries ( 81 tests) | US GB CA AU ZA
HK:  9.6 tries ( 74 tests) | US HK GB MY IN
TZ:  9.7 tries ( 54 tests) | GB US TZ IN IE
NG:  9.8 tries ( 77 tests) | GB US NG AU IN
SG:  9.9 tries ( 75 tests) | US GB SG IN MY
MY: 10.0 tries ( 76 tests) | GB US CA MY JM
JM: 11.3 tries ( 67 tests) | GB US IE JM AU
PH: 11.3 tries ( 75 tests) | GB US AU CA KE
. . . . . . . . . .

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with class weights balanced
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
average_performance(rf_model, X)
average_performance(rf_model, X_25)
average_performance(rf_model, X_50)

. . . . . . . . . . 
Average Accuracy: 0.2793939393939394
Averaged Average tries: 8.269544545510092
US:  1.5 tries (654 tests) | US GB CA AU IN
GB:  1.6 tries (696 tests) | GB US CA AU IN
AU:  3.9 tries (270 tests) | US GB AU CA NZ
CA:  4.3 tries (219 tests) | US GB CA AU IN
IE:  4.6 tries (185 tests) | GB US IE AU CA
IN:  5.0 tries (172 tests) | US GB IN CA AU
NZ:  6.3 tries (142 tests) | US GB NZ AU CA
PK:  7.5 tries ( 85 tests) | GB US PK CA AU
SG:  8.4 tries ( 74 tests) | US GB AU SG CA
HK:  8.6 tries ( 79 tests) | US GB AU CA HK
BD:  9.4 tries ( 63 tests) | GB US IN AU CA
LK:  9.7 tries ( 67 tests) | GB US IN CA AU
KE:  9.8 tries ( 66 tests) | GB US AU CA KE
GH: 10.7 tries ( 66 tests) | US GB AU CA ZA
JM: 11.2 tries ( 75 tests) | GB US CA AU IE
ZA: 11.6 tries ( 84 tests) | GB US AU CA IN
NG: 11.6 tries ( 75 tests) | US GB CA AU IN
PH: 12.8 tries ( 79 tests) | US GB CA AU IN
MY: 13.3 tries ( 75 tests) | US GB CA AU IN
TZ: 13.7 tries ( 74 tests) | US GB IE AU CA


In [None]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          AU       0.40      0.07      0.12        28
          BD       1.00      0.17      0.29         6
          CA       1.00      0.04      0.07        27
          GB       0.27      0.56      0.37        62
          GH       1.00      0.40      0.57        10
          HK       1.00      0.11      0.20         9
          IE       0.86      0.33      0.48        18
          IN       0.50      0.05      0.09        20
          JM       0.00      0.00      0.00         8
          KE       1.00      0.12      0.22         8
          LK       1.00      0.25      0.40         8
          MY       0.00      0.00      0.00         9
          NG       1.00      0.36      0.53        11
          NZ       1.00      0.29      0.45        17
          PH       0.00      0.00      0.00         6
          PK       1.00      0.27      0.43        11
          SG       0.00      0.00      0.00         6
          TZ       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model
mlp_model = MLPClassifier(random_state=3)
average_performance(mlp_model,X)
average_performance(mlp_model,X_25)
average_performance(mlp_model,X_50)

. . . . . . . . . . 
Average Accuracy: 0.3251515151515152
Averaged Average tries: 7.455500516530901
GB:  2.0 tries (663 tests) | GB US CA AU IN
US:  2.2 tries (685 tests) | US GB CA AU IN
CA:  3.7 tries (226 tests) | US GB CA AU IN
AU:  4.0 tries (272 tests) | GB US AU CA IE
IE:  5.4 tries (185 tests) | GB US IE CA AU
IN:  5.5 tries (186 tests) | GB US IN CA AU
NZ:  6.2 tries (121 tests) | GB US NZ CA AU
TZ:  6.6 tries ( 54 tests) | GB US AU CA TZ
PK:  7.1 tries ( 92 tests) | US GB PK CA AU
KE:  7.3 tries ( 81 tests) | US GB CA AU KE
LK:  7.7 tries ( 82 tests) | GB US LK AU CA
HK:  8.0 tries ( 74 tests) | US GB HK CA AU
PH:  8.4 tries ( 75 tests) | GB US CA PH AU
JM:  8.6 tries ( 67 tests) | GB US CA JM NZ
SG:  9.2 tries ( 75 tests) | GB US AU SG CA
BD:  9.2 tries ( 64 tests) | GB US BD AU CA
NG: 11.4 tries ( 77 tests) | GB US AU CA NG
GH: 11.6 tries ( 64 tests) | GB US AU GH CA
MY: 12.3 tries ( 76 tests) | US GB CA AU IN
ZA: 12.9 tries ( 81 tests) | US CA GB AU JM
. . . . . . . . . . 

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model with undersampling
model = LogisticRegression(max_iter=2000)
average_performance(model, X, undersampling=True)
average_performance(model, X_25, undersampling=True)
average_performance(model, X_50, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.14575757575757578
Averaged Average tries: 7.680798657555729
BD:  5.1 tries ( 80 tests) | BD KE IN PK GB
GH:  5.4 tries ( 67 tests) | GH KE PH TZ NG
KE:  6.0 tries ( 88 tests) | KE TZ PH JM US
NG:  7.0 tries ( 81 tests) | NG TZ ZA GH KE
PK:  7.3 tries ( 97 tests) | PK NG MY JM PH
TZ:  7.6 tries ( 60 tests) | TZ PH GH IN HK
ZA:  7.7 tries ( 80 tests) | ZA JM IN TZ CA
SG:  7.8 tries ( 75 tests) | SG PH IN HK MY
LK:  7.8 tries ( 96 tests) | LK KE TZ PH JM
JM:  8.0 tries ( 65 tests) | ZA JM AU US SG
NZ:  8.0 tries (128 tests) | NZ PH GB ZA IN
HK:  8.0 tries ( 69 tests) | HK MY IN JM AU
IN:  8.1 tries (177 tests) | IN GB TZ CA IE
AU:  8.1 tries (261 tests) | AU NG PH GB US
US:  8.2 tries (646 tests) | GB US CA PH NG
IE:  8.3 tries (179 tests) | IE US PH GB TZ
GB:  8.3 tries (652 tests) | GB US PH NZ CA
PH:  8.8 tries ( 77 tests) | PH SG GB CA IE
CA:  8.9 tries (246 tests) | US CA GB AU PH
MY:  9.2 tries ( 76 tests) | PH BD SG MY GB
. . . . . . . . . .

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with undersampling
rf_model = RandomForestClassifier(random_state=3)
average_performance(rf_model, X, undersampling=True)
average_performance(rf_model, X_25, undersampling=True)
average_performance(rf_model, X_50, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.21545454545454543
Averaged Average tries: 6.8500851223667265
BD:  4.2 tries ( 80 tests) | BD KE IN NZ IE
GH:  5.4 tries ( 67 tests) | GH TZ ZA KE NG
NG:  5.6 tries ( 81 tests) | NG ZA GB JM CA
JM:  5.7 tries ( 65 tests) | JM GB CA IN SG
IE:  5.9 tries (179 tests) | IE GB PH AU US
SG:  6.3 tries ( 75 tests) | SG GB PH US CA
LK:  6.4 tries ( 96 tests) | LK IN GB CA US
KE:  6.6 tries ( 88 tests) | KE TZ ZA AU CA
CA:  6.6 tries (246 tests) | CA GB PH US JM
NZ:  6.7 tries (128 tests) | NZ GB ZA AU US
IN:  6.7 tries (177 tests) | IN CA GB US LK
PH:  6.8 tries ( 77 tests) | PH GB US CA AU
HK:  6.9 tries ( 69 tests) | HK PH CA AU GB
AU:  7.0 tries (261 tests) | AU GB US NZ PH
GB:  7.2 tries (652 tests) | GB US PH CA IN
TZ:  8.0 tries ( 60 tests) | TZ ZA GH PH IE
PK:  8.5 tries ( 97 tests) | PK GB CA BD IN
US:  8.6 tries (646 tests) | GB US CA IN PH
ZA:  8.7 tries ( 80 tests) | ZA GB US CA TZ
MY:  9.1 tries ( 76 tests) | MY GB CA PH US
. . . . . . . . . 

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model with undersampling
mlp_model = MLPClassifier(random_state=3)
average_performance(mlp_model,X, undersampling=True)
average_performance(mlp_model,X_25, undersampling=True)
average_performance(mlp_model,X_50, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.15696969696969698
Averaged Average tries: 7.61506610507797
TZ:  3.4 tries ( 60 tests) | TZ PH US IN JM
IN:  4.0 tries (177 tests) | IN US PH JM TZ
JM:  4.6 tries ( 65 tests) | JM IN PH US TZ
PH:  4.6 tries ( 77 tests) | PH TZ US IN JM
HK:  5.4 tries ( 69 tests) | HK US JM TZ IN
SG:  5.5 tries ( 75 tests) | SG PH HK IN US
KE:  6.1 tries ( 88 tests) | TZ US KE PH IN
LK:  6.7 tries ( 96 tests) | LK IN TZ US PH
US:  7.3 tries (646 tests) | US PH IN TZ JM
BD:  7.5 tries ( 80 tests) | IN BD TZ PH US
NZ:  7.6 tries (128 tests) | PH US NZ IN JM
CA:  7.9 tries (246 tests) | PH US JM TZ IN
GB:  8.1 tries (652 tests) | PH US IN TZ JM
NG:  8.6 tries ( 81 tests) | US TZ NG IN JM
AU:  8.9 tries (261 tests) | PH US TZ IN AU
IE:  9.2 tries (179 tests) | PH TZ US IN IE
GH:  9.6 tries ( 67 tests) | GH TZ JM IN US
MY: 11.5 tries ( 76 tests) | PH IN US JM HK
PK: 12.7 tries ( 97 tests) | PH US TZ IN PK
ZA: 13.0 tries ( 80 tests) | TZ JM IN PH US
. . . . . . . . . . 

In [15]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model with scrambling
model = LogisticRegression(max_iter=2000)
average_performance(model, X, scrambling=True)
average_performance(model, X_25, scrambling=True)
average_performance(model, X_50, scrambling=True)

. 

KeyboardInterrupt: 

In [2]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with scrambled labels
rf_model = RandomForestClassifier(random_state=3)
average_performance(rf_model, X, scrambling=True)
average_performance(rf_model, X_25, scrambling=True)
average_performance(rf_model, X_50, scrambling=True)

NameError: name 'average_performance' is not defined

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model with scrambling
mlp_model = MLPClassifier(random_state=3)
average_performance(mlp_model,X, scrambling=True)
average_performance(mlp_model,X_25, scrambling=True)
average_performance(mlp_model,X_50, scrambling=True)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y, test_size=0.2, random_state=3)
rf_model.predict_proba(X_test).tolist()

[[0.7,
  0.02,
  0.03,
  0.07,
  0.0,
  0.0,
  0.03,
  0.01,
  0.02,
  0.01,
  0.01,
  0.0,
  0.01,
  0.0,
  0.01,
  0.01,
  0.0,
  0.0,
  0.07,
  0.0],
 [0.04,
  0.0,
  0.02,
  0.66,
  0.02,
  0.03,
  0.02,
  0.02,
  0.0,
  0.0,
  0.01,
  0.0,
  0.02,
  0.0,
  0.0,
  0.02,
  0.02,
  0.01,
  0.1,
  0.01],
 [0.04,
  0.01,
  0.04,
  0.63,
  0.0,
  0.0,
  0.03,
  0.03,
  0.0,
  0.03,
  0.01,
  0.01,
  0.0,
  0.02,
  0.0,
  0.02,
  0.02,
  0.01,
  0.07,
  0.03],
 [0.03,
  0.01,
  0.01,
  0.69,
  0.0,
  0.0,
  0.0,
  0.0,
  0.02,
  0.01,
  0.01,
  0.0,
  0.07,
  0.0,
  0.0,
  0.02,
  0.0,
  0.01,
  0.09,
  0.03],
 [0.04,
  0.0,
  0.06,
  0.04,
  0.01,
  0.02,
  0.01,
  0.04,
  0.0,
  0.01,
  0.0,
  0.01,
  0.61,
  0.01,
  0.01,
  0.02,
  0.03,
  0.01,
  0.05,
  0.02],
 [0.01,
  0.02,
  0.0,
  0.09,
  0.01,
  0.01,
  0.01,
  0.04,
  0.61,
  0.02,
  0.03,
  0.0,
  0.0,
  0.01,
  0.0,
  0.02,
  0.0,
  0.04,
  0.06,
  0.02],
 [0.03,
  0.04,
  0.02,
  0.26,
  0.02,
  0.01,
  0.04,
  0.04,
  0.08