In [259]:
import copy
from collections import Counter
import pandas as pd

In [260]:
doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# pulls text IDs, country codes, and document types from the excel sheet,
# using it to divide the documents into a dictionary by ID
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    with open(f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt", 'r',
              encoding="utf-8") as file:
        # add each text_id to id_dict
        if f"{country_code}_{doc_type}" not in id_dict:
            id_dict[f"{country_code}_{doc_type}"] = [text_id]
        else:
            id_dict[f"{country_code}_{doc_type}"].append(text_id)
        # makes country code set
        country_set.add(country_code)
        # finds correct text_id and adds every line in the document to the dictionary
        IS_DOC = False
        lines = file.readlines()
        for i, line in enumerate(lines):
            if line.strip().startswith(f"##{text_id}"):
                IS_DOC = True
            elif line.strip().startswith("##"):
                IS_DOC = False
            if IS_DOC:
                if text_id not in doc_dict:
                    doc_dict[text_id] = [w.lower() for w in line.split()]
                else:
                    doc_dict[text_id] += [w.lower() for w in line.split()]
        # adds word count to dictionary
        word_count_dict[text_id] = word_count

In [261]:
# make a counter for every word in the corpus
vocab = Counter({})
vocab['<UNK>'] = 0
for doc in doc_dict.values():
    for word in doc:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

# make a dictionary of sets for every country and record every word used by each country
# also make a word count for every country
vocab_sets = {country_code:set() for country_code in country_set}
country_word_counts = Counter({country_code:0 for country_code in country_set})
for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        for word in doc_dict[text_id]:
            vocab_sets[country_code].add(word)
        country_word_counts[country_code] += word_count_dict[text_id]

# make new vocabulary sets, removing words that appear in less than
# 12.5%, 25%, 37.5%, and 50% of the countries datasets respectively
vocab_125 = vocab.copy()
vocab_25 = vocab.copy()
vocab_375 = vocab.copy()
vocab_50 = vocab.copy()
for word in vocab:
    COUNTRY_COUNT = 0
    for country_code in country_set:
        if word in vocab_sets[country_code]:
            COUNTRY_COUNT+=1
    if COUNTRY_COUNT / len(country_set) < 0.125:
        del vocab_125[word]
    if COUNTRY_COUNT / len(country_set) < 0.25:
        del vocab_25[word]
    if COUNTRY_COUNT / len(country_set) < 0.375:
        del vocab_375[word]
    if COUNTRY_COUNT / len(country_set) < 0.5:
        del vocab_50[word]

# Replace any words that appear in less than 12.5% of the countries’ datasets with the <UNK> token
doc_dict_125 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_125.items():
    for i, word in enumerate(doc):
        if word not in vocab_125:
            doc_dict_125[text_id][i] = '<UNK>'
            vocab_125['<UNK>'] += 1

# Replace any words that appear in less than 25% of the countries’ datasets with the <UNK> token
doc_dict_25 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_25.items():
    for i, word in enumerate(doc):
        if word not in vocab_25:
            doc_dict_25[text_id][i] = '<UNK>'
            vocab_25['<UNK>'] += 1

# Replace any words that appear in less than 37.5% of the countries’ datasets with the <UNK> token
doc_dict_375 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_375.items():
    for i, word in enumerate(doc):
        if word not in vocab_375:
            doc_dict_375[text_id][i] = '<UNK>'
            vocab_375['<UNK>'] += 1

# Replace any words that appear in less than 50% of the countries’ datasets with the <UNK> token
doc_dict_50 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_50.items():
    for i, word in enumerate(doc):
        if word not in vocab_50:
            doc_dict_50[text_id][i] = '<UNK>'
            vocab_50['<UNK>'] += 1

In [262]:
from sklearn.feature_extraction.text import CountVectorizer

# make a dataframe of one-hot representations of each text in each version of the dataset with their country labels
text_ids = []
texts = []
texts_125 = []
texts_25 = []
texts_375 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        text_ids.append(text_ids)
        texts.append(" ".join(doc_dict[text_id]))
        texts_125.append(" ".join(doc_dict_125[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_375.append(" ".join(doc_dict_375[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

data = {
    'text_id': text_ids,
    'texts': texts,
    'texts_125': texts_125,
    'texts_25': texts_25,
    'texts_375': texts_375,
    'texts_50': texts_50,
    'country_labels': country_labels
}

df = pd.DataFrame(data)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texts'])
X_125 = vectorizer.fit_transform(df['texts_125'])
X_25 = vectorizer.fit_transform(df['texts_25'])
X_375 = vectorizer.fit_transform(df['texts_375'])
X_50 = vectorizer.fit_transform(df['texts_50'])
y = df['country_labels']

In [263]:
from sklearn.model_selection import train_test_split

# split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)
print("Original class distribution:", Counter(y_train))

Original class distribution: Counter({'GB': 310, 'US': 309, 'AU': 113, 'CA': 107, 'IE': 82, 'IN': 75, 'NZ': 65, 'PK': 40, 'LK': 36, 'MY': 34, 'PH': 34, 'KE': 33, 'NG': 33, 'ZA': 33, 'JM': 31, 'SG': 31, 'GH': 31, 'HK': 31, 'BD': 31, 'TZ': 26})


In [None]:
most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(rf_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][rf_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][rf_model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

In [264]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic Regression model with class weights balanced
model = LogisticRegression(solver='lbfgs', max_iter=500, class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.28
LK (4): PK 0.210, GB 0.169, CA 0.134, LK 0.098, HK 0.072, NZ 0.068, US 0.038, GH 0.032, NG 0.030, IE 0.030, AU 0.025, JM 0.025, IN 0.015, TZ 0.010, ZA 0.009, MY 0.008, KE 0.008, BD 0.007, PH 0.007, SG 0.006, 
ZA (6): AU 0.214, CA 0.144, NG 0.090, GB 0.086, US 0.075, ZA 0.071, GH 0.069, IE 0.045, NZ 0.037, TZ 0.036, HK 0.032, IN 0.020, KE 0.018, JM 0.012, PH 0.011, LK 0.011, MY 0.009, PK 0.008, SG 0.007, BD 0.004, 
AU (1): AU 0.219, GB 0.168, US 0.146, IE 0.091, IN 0.081, CA 0.057, ZA 0.039, JM 0.037, PH 0.032, NZ 0.023, NG 0.019, HK 0.014, LK 0.013, MY 0.013, TZ 0.011, SG 0.009, BD 0.008, GH 0.008, PK 0.006, KE 0.005, 
PH (10): AU 0.227, US 0.213, IE 0.141, GB 0.131, CA 0.107, LK 0.036, NG 0.026, IN 0.019, NZ 0.016, PH 0.016, SG 0.010, GH 0.010, HK 0.009, KE 0.008, TZ 0.006, BD 0.006, JM 0.006, MY 0.006, ZA 0.004, PK 0.004, 
NZ (8): LK 0.179, AU 0.146, US 0.120, GB 0.116, ZA 0.088, CA 0.083, IN 0.053, NZ 0.046, IE 0.038, BD 0.037, JM 0.018, NG 0.012, GH 0.011, SG 0.010, 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [265]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with class weights balanced
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(rf_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][rf_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][rf_model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.29
LK (3): US 0.232, GB 0.182, LK 0.136, AU 0.070, PK 0.052, CA 0.046, IN 0.040, HK 0.034, IE 0.030, BD 0.022, TZ 0.022, NZ 0.020, MY 0.020, JM 0.020, PH 0.016, NG 0.016, SG 0.016, ZA 0.010, KE 0.008, GH 0.008, 
ZA (5): US 0.213, GB 0.209, AU 0.084, CA 0.066, ZA 0.057, NZ 0.057, IE 0.041, KE 0.031, NG 0.031, IN 0.027, TZ 0.026, GH 0.024, HK 0.020, PK 0.020, SG 0.020, LK 0.019, JM 0.019, PH 0.013, BD 0.013, MY 0.010, 
AU (3): US 0.243, GB 0.241, AU 0.146, CA 0.057, IE 0.054, IN 0.045, NZ 0.035, LK 0.020, HK 0.018, ZA 0.017, MY 0.017, SG 0.016, GH 0.016, KE 0.015, PK 0.014, JM 0.012, PH 0.011, BD 0.009, NG 0.008, TZ 0.005, 
PH (8): GB 0.270, US 0.268, AU 0.095, NZ 0.053, CA 0.045, IN 0.043, IE 0.037, PH 0.035, HK 0.027, TZ 0.015, PK 0.015, NG 0.015, LK 0.013, ZA 0.013, KE 0.013, JM 0.013, SG 0.013, GH 0.010, MY 0.005, BD 0.005, 
NZ (4): US 0.203, GB 0.183, AU 0.167, NZ 0.148, CA 0.050, HK 0.047, IN 0.040, IE 0.027, TZ 0.023, MY 0.015, PK 0.012, SG 0.012, PH 0.010, GH 0.010, B

In [266]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(mlp_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][mlp_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][mlp_model.classes_[i]] += 1  
   
for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.30
LK (5): CA 0.473, GB 0.218, US 0.109, AU 0.036, LK 0.023, IE 0.021, HK 0.014, KE 0.014, IN 0.013, MY 0.013, NZ 0.011, NG 0.010, TZ 0.009, ZA 0.008, SG 0.006, PH 0.005, JM 0.005, PK 0.005, BD 0.005, GH 0.004, 
ZA (10): CA 0.306, AU 0.180, US 0.179, GB 0.154, IE 0.025, KE 0.019, NZ 0.019, IN 0.014, PH 0.014, ZA 0.013, TZ 0.013, HK 0.011, MY 0.010, BD 0.010, JM 0.010, GH 0.006, NG 0.005, SG 0.004, PK 0.004, LK 0.003, 
AU (1): AU 0.338, US 0.182, GB 0.168, CA 0.111, IE 0.023, NZ 0.018, HK 0.017, KE 0.016, TZ 0.015, JM 0.013, BD 0.013, MY 0.012, PH 0.012, IN 0.012, ZA 0.011, NG 0.009, LK 0.008, SG 0.008, PK 0.008, GH 0.006, 
PH (11): GB 0.333, CA 0.302, US 0.145, AU 0.075, KE 0.017, IN 0.015, TZ 0.013, NZ 0.012, JM 0.011, NG 0.009, PH 0.009, HK 0.009, BD 0.008, ZA 0.008, IE 0.007, SG 0.007, MY 0.006, PK 0.005, LK 0.004, GH 0.003, 
NZ (5): GB 0.350, CA 0.239, AU 0.182, US 0.071, NZ 0.061, JM 0.023, IE 0.016, ZA 0.010, HK 0.010, IN 0.006, TZ 0.006, PH 0.005, BD 0.005, LK 0.004,

In [267]:
from imblearn.under_sampling import RandomUnderSampler

# Even when class weights are balanced, the bigger countries are heavily favored
# so we can try undersampling the dataset so everything is equal
rus = RandomUnderSampler(random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_resampled))

Resampled class distribution: Counter({'AU': 26, 'BD': 26, 'CA': 26, 'GB': 26, 'GH': 26, 'HK': 26, 'IE': 26, 'IN': 26, 'JM': 26, 'KE': 26, 'LK': 26, 'MY': 26, 'NG': 26, 'NZ': 26, 'PH': 26, 'PK': 26, 'SG': 26, 'TZ': 26, 'US': 26, 'ZA': 26})


In [268]:
# Trains a logistic regression model with undersampling
model = LogisticRegression(solver='lbfgs', max_iter=500)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][model.classes_[i]] += 1  

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.09
LK (5): PK 0.212, NZ 0.125, JM 0.085, HK 0.085, LK 0.061, US 0.055, GH 0.050, NG 0.048, SG 0.044, ZA 0.040, MY 0.037, AU 0.035, IN 0.024, TZ 0.022, PH 0.018, CA 0.017, IE 0.015, BD 0.011, KE 0.009, GB 0.009, 
ZA (4): GB 0.232, IN 0.154, AU 0.154, ZA 0.066, GH 0.052, US 0.041, TZ 0.041, HK 0.039, NZ 0.039, IE 0.036, JM 0.023, NG 0.020, CA 0.018, MY 0.017, SG 0.017, PH 0.016, KE 0.014, LK 0.012, BD 0.004, PK 0.004, 
AU (8): US 0.117, IE 0.095, JM 0.095, ZA 0.063, NG 0.061, IN 0.053, HK 0.051, AU 0.050, GB 0.043, KE 0.042, TZ 0.041, LK 0.040, NZ 0.039, BD 0.039, GH 0.033, SG 0.032, PH 0.031, MY 0.030, CA 0.022, PK 0.022, 
PH (4): LK 0.136, IE 0.121, US 0.104, PH 0.100, NG 0.092, CA 0.080, NZ 0.053, AU 0.040, HK 0.033, GH 0.032, GB 0.030, SG 0.030, IN 0.028, JM 0.027, TZ 0.025, MY 0.019, ZA 0.019, KE 0.016, BD 0.011, PK 0.005, 
NZ (6): LK 0.190, ZA 0.171, AU 0.108, IE 0.074, US 0.072, NZ 0.053, NG 0.046, JM 0.043, SG 0.042, BD 0.035, TZ 0.025, PH 0.022, CA 0.022, MY 0.019, I

In [269]:
# Random Forest Classifier with undersampling
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_resampled, y_resampled)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(rf_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][rf_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][rf_model.classes_[i]] += 1   
  
for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.16
LK (1): LK 0.108, US 0.080, PK 0.068, IE 0.064, JM 0.056, CA 0.054, NG 0.052, ZA 0.050, MY 0.050, NZ 0.048, HK 0.044, IN 0.042, TZ 0.042, GB 0.042, SG 0.040, AU 0.036, KE 0.036, PH 0.034, GH 0.028, BD 0.026, 
ZA (7): IE 0.071, KE 0.069, GH 0.067, TZ 0.061, PH 0.060, NG 0.056, ZA 0.053, NZ 0.053, BD 0.050, JM 0.050, US 0.049, HK 0.047, CA 0.046, IN 0.044, SG 0.041, LK 0.040, GB 0.040, AU 0.037, PK 0.036, MY 0.030, 
AU (7): US 0.095, NZ 0.062, JM 0.062, ZA 0.057, PH 0.055, SG 0.054, AU 0.054, IE 0.053, GB 0.052, MY 0.048, CA 0.046, HK 0.046, NG 0.045, KE 0.045, LK 0.044, IN 0.041, BD 0.041, GH 0.036, TZ 0.033, PK 0.029, 
PH (1): PH 0.085, AU 0.080, JM 0.080, NZ 0.075, IE 0.072, CA 0.070, HK 0.065, NG 0.062, SG 0.055, IN 0.052, LK 0.048, US 0.043, GH 0.037, GB 0.037, ZA 0.035, MY 0.035, TZ 0.025, BD 0.018, KE 0.015, PK 0.010, 
NZ (1): NZ 0.093, AU 0.085, HK 0.067, US 0.063, JM 0.063, ZA 0.057, GB 0.057, SG 0.057, CA 0.055, IN 0.047, LK 0.045, IE 0.045, PH 0.043, NG 0.043, G

In [270]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron Model with undersampling
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_resampled, y_resampled)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(mlp_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][mlp_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][mlp_model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.15
LK (3): HK 0.402, JM 0.217, LK 0.056, NG 0.054, US 0.053, TZ 0.037, IN 0.026, MY 0.025, SG 0.023, KE 0.015, PH 0.015, GB 0.014, CA 0.012, IE 0.011, NZ 0.008, AU 0.008, GH 0.007, ZA 0.007, BD 0.006, PK 0.004, 
ZA (16): HK 0.210, JM 0.185, TZ 0.137, KE 0.067, NZ 0.052, IN 0.043, PH 0.038, GB 0.032, SG 0.027, AU 0.027, US 0.026, NG 0.024, BD 0.023, IE 0.022, CA 0.022, ZA 0.019, GH 0.016, MY 0.015, LK 0.009, PK 0.006, 
AU (13): US 0.198, TZ 0.179, JM 0.141, HK 0.074, PH 0.050, IE 0.042, NG 0.039, SG 0.036, IN 0.029, MY 0.029, KE 0.026, CA 0.026, AU 0.023, NZ 0.022, GB 0.020, BD 0.017, LK 0.016, ZA 0.014, GH 0.011, PK 0.008, 
PH (1): PH 0.189, US 0.115, JM 0.115, IN 0.099, NG 0.086, TZ 0.085, HK 0.046, SG 0.043, KE 0.034, MY 0.031, CA 0.030, GB 0.030, LK 0.018, NZ 0.017, IE 0.014, BD 0.012, AU 0.012, ZA 0.011, GH 0.007, PK 0.006, 
NZ (2): HK 0.187, NZ 0.131, TZ 0.092, US 0.072, JM 0.067, NG 0.061, ZA 0.053, AU 0.052, PH 0.040, KE 0.030, LK 0.030, IE 0.029, SG 0.027, CA 0.025,

In [271]:
# Calculate weights based on document number (default "balanced" weights)
total_docs = sum(Counter(y_train).values())
weights_dict_doc = Counter({country_code:0 for country_code in country_set})

for country_code in weights_dict_doc:
    weights_dict_doc[country_code] = total_docs/(len(country_set) * Counter(y_train)[country_code])

print(weights_dict_doc)

Counter({'TZ': 2.855769230769231, 'GH': 2.3951612903225805, 'BD': 2.3951612903225805, 'JM': 2.3951612903225805, 'HK': 2.3951612903225805, 'SG': 2.3951612903225805, 'ZA': 2.25, 'KE': 2.25, 'NG': 2.25, 'PH': 2.1838235294117645, 'MY': 2.1838235294117645, 'LK': 2.0625, 'PK': 1.85625, 'NZ': 1.1423076923076922, 'IN': 0.99, 'IE': 0.9054878048780488, 'CA': 0.6939252336448598, 'AU': 0.6570796460176991, 'US': 0.24029126213592233, 'GB': 0.23951612903225805})


In [272]:
# Calculate weights based on word count
total_words = sum(country_word_counts.values())
weights_dict_word = Counter({country_code:0 for country_code in country_set})

for country_code in weights_dict_word:
    weights_dict_word[country_code] = total_words/(len(country_set) * country_word_counts[country_code])

print(weights_dict_word)


Counter({'PH': 3.910410016719866, 'JM': 3.8743401980347127, 'GH': 3.3747532220509626, 'ZA': 2.622619654416637, 'BD': 2.469495572460464, 'KE': 2.4500928549320253, 'SG': 2.444912093133761, 'NG': 2.262075749582344, 'MY': 2.1723032593092966, 'HK': 2.0897236211518155, 'TZ': 2.0851083014203797, 'NZ': 1.4886779214767236, 'LK': 1.3360817179730204, 'IN': 0.9784297599239363, 'IE': 0.9313634718074034, 'PK': 0.7469376719338613, 'CA': 0.6877617376775271, 'AU': 0.4746887497866569, 'GB': 0.3223559981455493, 'US': 0.23762114120382663})


In [273]:
# Random Forest Classifier with class weights balanced by word count
rf_model = RandomForestClassifier(random_state=3, class_weight=weights_dict_word)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(rf_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][rf_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][rf_model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.28
LK (3): GB 0.232, US 0.198, LK 0.118, AU 0.056, IN 0.050, CA 0.050, NZ 0.044, PK 0.038, HK 0.036, BD 0.024, IE 0.024, PH 0.024, JM 0.022, SG 0.016, GH 0.014, KE 0.012, TZ 0.012, NG 0.012, MY 0.010, ZA 0.008, 
ZA (7): US 0.234, GB 0.229, AU 0.070, NZ 0.059, CA 0.056, IE 0.053, ZA 0.043, IN 0.033, KE 0.030, TZ 0.026, GH 0.023, NG 0.023, HK 0.020, PH 0.017, PK 0.017, LK 0.016, BD 0.016, SG 0.014, MY 0.013, JM 0.010, 
AU (3): GB 0.268, US 0.248, AU 0.113, IN 0.058, CA 0.053, IE 0.044, NZ 0.039, LK 0.021, ZA 0.021, MY 0.016, KE 0.016, PH 0.015, PK 0.014, HK 0.013, BD 0.012, SG 0.012, GH 0.011, NG 0.010, TZ 0.009, JM 0.006, 
PH (8): US 0.280, GB 0.270, AU 0.075, CA 0.065, NZ 0.048, IE 0.030, IN 0.028, PH 0.025, MY 0.025, ZA 0.022, BD 0.022, HK 0.020, KE 0.018, JM 0.015, NG 0.015, LK 0.013, GH 0.010, SG 0.010, TZ 0.005, PK 0.005, 
NZ (3): US 0.222, GB 0.207, NZ 0.133, AU 0.122, CA 0.053, HK 0.033, IE 0.030, IN 0.025, PH 0.020, SG 0.018, LK 0.018, GH 0.017, BD 0.015, PK 0.015, M

In [274]:
# It looks like the weight balanced RF and MLP model without undersampling had the best performance so we will use them for the 25%...
X_train, X_test, y_train, y_test = train_test_split(X_25, y, test_size=0.1, random_state=3)

In [275]:
# Random Forest Classifier with class weights balanced
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(rf_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][rf_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][rf_model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.27
LK (3): GB 0.188, US 0.176, LK 0.164, AU 0.076, CA 0.052, IE 0.048, NZ 0.044, PK 0.034, IN 0.032, HK 0.032, TZ 0.024, SG 0.022, NG 0.018, JM 0.016, BD 0.014, ZA 0.014, KE 0.014, PH 0.012, GH 0.012, MY 0.008, 
ZA (6): US 0.224, GB 0.169, AU 0.086, NZ 0.074, CA 0.064, ZA 0.054, IN 0.041, IE 0.036, JM 0.030, NG 0.030, KE 0.030, GH 0.029, BD 0.023, PH 0.019, LK 0.017, PK 0.017, TZ 0.016, HK 0.016, MY 0.013, SG 0.013, 
AU (3): US 0.244, GB 0.225, AU 0.157, IE 0.058, IN 0.052, CA 0.046, NZ 0.033, LK 0.024, MY 0.024, KE 0.018, ZA 0.016, SG 0.016, HK 0.014, BD 0.013, PH 0.012, GH 0.011, NG 0.011, PK 0.010, JM 0.008, TZ 0.007, 
PH (12): US 0.268, GB 0.253, AU 0.125, CA 0.062, NZ 0.052, IN 0.045, IE 0.028, GH 0.023, JM 0.022, HK 0.022, SG 0.022, PH 0.015, KE 0.015, PK 0.015, NG 0.010, ZA 0.007, MY 0.005, TZ 0.005, BD 0.005, LK 0.000, 
NZ (4): AU 0.188, US 0.168, GB 0.152, NZ 0.145, HK 0.053, CA 0.047, IN 0.038, IE 0.033, GH 0.025, LK 0.022, ZA 0.017, TZ 0.017, BD 0.017, PH 0.013, 

In [276]:
# Multi-layer Perceptron Model removing words that appear in less than 25% of countries
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(mlp_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][mlp_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][mlp_model.classes_[i]] += 1  
   
for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.25
LK (8): GB 0.459, CA 0.158, IE 0.115, US 0.052, NG 0.042, AU 0.035, JM 0.027, LK 0.024, MY 0.017, IN 0.014, HK 0.012, SG 0.010, BD 0.008, TZ 0.008, ZA 0.005, GH 0.004, NZ 0.003, PH 0.003, KE 0.003, PK 0.002, 
ZA (4): US 0.233, GB 0.219, IN 0.129, ZA 0.059, IE 0.057, AU 0.055, CA 0.039, NZ 0.039, PH 0.033, JM 0.026, HK 0.021, MY 0.017, NG 0.014, GH 0.011, SG 0.009, BD 0.009, LK 0.008, TZ 0.008, PK 0.006, KE 0.006, 
AU (1): AU 0.272, GB 0.261, US 0.215, IE 0.054, CA 0.040, NG 0.020, MY 0.014, ZA 0.013, IN 0.013, TZ 0.011, HK 0.011, SG 0.010, PH 0.010, KE 0.010, JM 0.010, BD 0.009, LK 0.007, NZ 0.007, PK 0.006, GH 0.006, 
PH (13): GB 0.277, AU 0.253, US 0.213, CA 0.127, IE 0.030, NG 0.019, HK 0.011, IN 0.010, JM 0.008, LK 0.008, BD 0.007, SG 0.007, PH 0.005, ZA 0.005, KE 0.005, MY 0.004, NZ 0.004, GH 0.004, TZ 0.002, PK 0.001, 
NZ (13): GB 0.331, US 0.263, AU 0.223, CA 0.097, JM 0.015, LK 0.011, IE 0.011, TZ 0.006, HK 0.006, SG 0.006, NG 0.005, PH 0.005, NZ 0.005, MY 0.003,

In [277]:
# RF and MLP model without undersampling for the 50% dataset
X_train, X_test, y_train, y_test = train_test_split(X_50, y, test_size=0.1, random_state=3)

In [278]:
# Random Forest Classifier with class weights balanced
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(rf_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][rf_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][rf_model.classes_[i]] += 1     

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.22
LK (11): US 0.230, GB 0.180, AU 0.088, CA 0.060, PK 0.056, HK 0.052, IE 0.044, IN 0.042, NZ 0.038, SG 0.032, LK 0.028, MY 0.020, TZ 0.020, BD 0.020, PH 0.020, JM 0.018, KE 0.016, NG 0.016, ZA 0.012, GH 0.008, 
ZA (6): US 0.177, GB 0.170, AU 0.097, NZ 0.080, GH 0.056, ZA 0.050, CA 0.046, KE 0.036, IN 0.034, IE 0.034, LK 0.029, JM 0.029, NG 0.027, PK 0.024, BD 0.024, TZ 0.023, PH 0.021, MY 0.017, HK 0.014, SG 0.011, 
AU (3): US 0.225, GB 0.223, AU 0.144, IE 0.058, IN 0.054, CA 0.054, NZ 0.035, LK 0.023, SG 0.021, ZA 0.021, JM 0.019, BD 0.018, NG 0.016, KE 0.016, PH 0.015, GH 0.014, MY 0.013, PK 0.012, HK 0.011, TZ 0.010, 
PH (18): US 0.258, GB 0.222, AU 0.092, CA 0.085, NZ 0.055, IE 0.042, IN 0.038, HK 0.037, MY 0.025, NG 0.018, SG 0.017, GH 0.015, TZ 0.015, LK 0.013, ZA 0.013, KE 0.013, PK 0.013, PH 0.010, BD 0.010, JM 0.010, 
NZ (4): US 0.170, GB 0.153, AU 0.148, NZ 0.128, IE 0.048, CA 0.043, HK 0.042, IN 0.032, PK 0.028, MY 0.027, ZA 0.025, JM 0.023, TZ 0.023, LK 0.022,

In [279]:
# Multi-layer Perceptron Model removing words that appear in less than 50% of countries
X_train, X_test, y_train, y_test = train_test_split(X_50, y, test_size=0.1, random_state=3)
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

most_similar_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
times_added = {c:Counter({c:0 for c in country_set}) for c in country_set}
for test_num, probabilities in enumerate(mlp_model.predict_proba(X_test)):
    for i, probability in enumerate(probabilities):
        most_similar_country[y_test.tolist()[test_num]][mlp_model.classes_[i]] += probability
        times_added[y_test.tolist()[test_num]][mlp_model.classes_[i]] += 1  

for country in most_similar_country:
    print(f"{country}",end=" ")
    for cur_country in most_similar_country[country]:
        if times_added[country][cur_country] != 0:
            most_similar_country[country][cur_country] = most_similar_country[country][cur_country] / times_added[country][cur_country]
        else: 
            most_similar_country[country][cur_country] = 0
    for i, (country_code,prob) in enumerate(most_similar_country[country].most_common(20)):
        if country_code==country:
            print(f"({i+1}):", end=" ")
    for cur_country, prob in most_similar_country[country].most_common(20):
        print(cur_country, f"{prob:.3f}", end=", ")
    print()

Accuracy: 0.23
LK (7): CA 0.259, GB 0.247, PK 0.208, IN 0.116, AU 0.035, US 0.030, LK 0.027, HK 0.013, NG 0.010, NZ 0.009, IE 0.009, TZ 0.008, ZA 0.008, SG 0.005, JM 0.005, KE 0.003, BD 0.003, MY 0.002, GH 0.002, PH 0.001, 
ZA (6): GB 0.307, IE 0.136, US 0.107, CA 0.090, GH 0.075, ZA 0.064, AU 0.037, IN 0.032, KE 0.023, PH 0.022, TZ 0.022, NZ 0.017, HK 0.016, PK 0.011, NG 0.010, JM 0.009, MY 0.008, BD 0.005, SG 0.004, LK 0.003, 
AU (3): US 0.316, GB 0.277, AU 0.177, IE 0.038, CA 0.035, NG 0.022, IN 0.022, ZA 0.013, HK 0.012, NZ 0.010, LK 0.010, PH 0.009, SG 0.009, MY 0.008, BD 0.008, GH 0.008, JM 0.007, PK 0.007, TZ 0.006, KE 0.005, 
PH (7): US 0.363, GB 0.235, CA 0.124, AU 0.106, IE 0.049, IN 0.028, PH 0.014, PK 0.012, NZ 0.009, NG 0.009, GH 0.008, KE 0.008, SG 0.007, BD 0.007, HK 0.007, ZA 0.003, TZ 0.003, JM 0.003, MY 0.003, LK 0.002, 
NZ (5): GB 0.392, US 0.222, AU 0.097, IE 0.051, NZ 0.049, CA 0.045, BD 0.043, IN 0.023, ZA 0.018, LK 0.015, PK 0.013, HK 0.005, JM 0.004, NG 0.004, G