In [150]:
import copy
from collections import Counter
import pandas as pd

In [151]:
doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# Pull text IDs, "<country> <genre>" labels and word counts from the spreadsheet and use
# them to fill:
#   doc_dict        text_id -> lower-cased whitespace tokens of that document
#   id_dict         "<country>_<genre>" -> list of text IDs
#   word_count_dict text_id -> word count as listed in the spreadsheet
#   country_set     every country code seen
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
source_rows = sources_df[["textID", "country|genre", "# words"]].values.tolist()

# Lines of each corpus file, read once and shared by all documents stored in that file
# (previously the file was re-opened and fully re-read for every spreadsheet row).
_corpus_lines = {}

for text_id, country_genre, word_count in source_rows:
    country_code, doc_type = country_genre.split()
    corpus_path = f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt"
    if corpus_path not in _corpus_lines:
        with open(corpus_path, 'r', encoding="utf-8") as file:
            _corpus_lines[corpus_path] = file.readlines()

    # add each text_id to id_dict and record its country
    id_dict.setdefault(f"{country_code}_{doc_type}", []).append(text_id)
    country_set.add(country_code)

    # A document starts at the line beginning with "##<text_id>" and runs until the next
    # line beginning with "##". The header line itself is part of the document.
    header = f"##{text_id}"
    in_doc = False
    for line in _corpus_lines[corpus_path]:
        stripped = line.strip()
        if stripped.startswith("##"):
            # Require the ID to end right after the marker: a plain prefix test would
            # also accept "##1234..." when looking for text_id 123.
            next_char = stripped[len(header):len(header) + 1]
            in_doc = stripped.startswith(header) and not next_char.isdigit()
        if in_doc:
            doc_dict.setdefault(text_id, []).extend(w.lower() for w in line.split())

    # adds word count to dictionary
    word_count_dict[text_id] = word_count

# The cached file contents are only needed while building doc_dict.
del _corpus_lines

In [152]:
# Corpus-wide token frequencies. '<UNK>' is seeded with a count of zero so that the
# placeholder token is part of the vocabulary before any replacement happens.
vocab = Counter({'<UNK>': 0})
for doc_tokens in doc_dict.values():
    vocab.update(doc_tokens)

# For every country, gather the set of distinct words used in its documents (both the
# "_B" and "_G" groups of id_dict) and the sum of the spreadsheet word counts.
vocab_sets = {}
country_word_counts = Counter()
for country_code in country_set:
    country_text_ids = id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]
    vocab_sets[country_code] = set().union(
        *(doc_dict[doc_id] for doc_id in country_text_ids)
    )
    country_word_counts[country_code] = sum(
        word_count_dict[doc_id] for doc_id in country_text_ids
    )

# Build reduced vocabularies that drop every word used by fewer than
# 12.5%, 25%, 37.5% and 50% of the countries respectively.

# Share of countries whose documents contain each word.
num_countries = len(country_set)
country_share = {
    token: sum(token in vocab_sets[code] for code in country_set) / num_countries
    for token in vocab
}


def _vocab_with_min_share(min_share):
    """Return a Counter copy of vocab without words whose country share is below min_share."""
    return Counter({
        token: count
        for token, count in vocab.items()
        if not country_share[token] < min_share
    })


vocab_125 = _vocab_with_min_share(0.125)
vocab_25 = _vocab_with_min_share(0.25)
vocab_375 = _vocab_with_min_share(0.375)
vocab_50 = _vocab_with_min_share(0.5)

def _mask_rare_words(docs, kept_vocab):
    """
    Return a new {text_id: tokens} dict in which every token missing from kept_vocab
    is replaced by '<UNK>'.

    docs: mapping of text_id to a list of tokens; it is not modified.
    kept_vocab: Counter of retained words; kept_vocab['<UNK>'] is incremented once per
                replaced token (side effect).
    """
    masked_docs = {}
    for doc_id, tokens in docs.items():
        masked_tokens = []
        for token in tokens:
            if token in kept_vocab:
                masked_tokens.append(token)
            else:
                masked_tokens.append('<UNK>')
                kept_vocab['<UNK>'] += 1
        masked_docs[doc_id] = masked_tokens
    return masked_docs


# One masked copy of the corpus per threshold: words used by fewer than 12.5%, 25%,
# 37.5% and 50% of the countries respectively are replaced with the <UNK> token.
# Building fresh token lists replaces the four full deep copies of doc_dict.
doc_dict_125 = _mask_rare_words(doc_dict, vocab_125)
doc_dict_25 = _mask_rare_words(doc_dict, vocab_25)
doc_dict_375 = _mask_rare_words(doc_dict, vocab_375)
doc_dict_50 = _mask_rare_words(doc_dict, vocab_50)

# Vocabulary size before filtering and after the 50% filter.
print(len(vocab))
print(len(vocab_50))

73745
5412


In [153]:
from sklearn.feature_extraction.text import CountVectorizer

# Collect, for every document, its ID, its country label and the space-joined text of
# each version of the dataset (full vocabulary and the four <UNK>-masked variants).
# All lists are filled in the same order, so index i refers to the same document in each.
text_ids = []
texts = []
texts_125 = []
texts_25 = []
texts_375 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        # Fixed: this used to append the text_ids list to itself instead of the ID.
        text_ids.append(text_id)
        texts.append(" ".join(doc_dict[text_id]))
        texts_125.append(" ".join(doc_dict_125[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_375.append(" ".join(doc_dict_375[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

# One row per document: ID, the five text variants and the country label.
data = dict(
    text_id=text_ids,
    texts=texts,
    texts_125=texts_125,
    texts_25=texts_25,
    texts_375=texts_375,
    texts_50=texts_50,
    country_labels=country_labels,
)

df = pd.DataFrame(data)

# Turn each text variant into a sparse matrix of token counts. The single vectorizer is
# re-fitted for every variant, so after this cell it holds the vocabulary of 'texts_50'.
# NOTE(review): with CountVectorizer's default settings (lowercase=True and a
# word-character token pattern) the '<UNK>' placeholder is presumably counted as the
# token 'unk' - confirm this is intended.
vectorizer = CountVectorizer()
X, X_125, X_25, X_375, X_50 = (
    vectorizer.fit_transform(df[column])
    for column in ('texts', 'texts_125', 'texts_25', 'texts_375', 'texts_50')
)
y = df['country_labels']

In [154]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the full-vocabulary documents for testing (fixed seed for
# reproducibility) and show how many training documents each country has.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, test_size=0.1
)
print("Original class distribution:", Counter(y_train))

Original class distribution: Counter({'US': 310, 'GB': 309, 'AU': 117, 'CA': 103, 'IE': 77, 'IN': 76, 'NZ': 64, 'LK': 39, 'PK': 37, 'PH': 36, 'MY': 35, 'SG': 35, 'ZA': 34, 'BD': 33, 'KE': 32, 'GH': 31, 'HK': 31, 'NG': 30, 'JM': 30, 'TZ': 26})


In [155]:
def probability_breakdown(model, X=None, y=None):
    """
    Prints, for each true country, the average predicted probability of every country
    label over that country's test documents, sorted from most to least probable.
    The rank of the true country within that ordering is shown in parentheses.

    model: fitted classifier exposing predict_proba() and classes_.
    X, y:  optional test features and true labels. When omitted, the notebook-level
           X_test and y_test are used, which is what a call with only `model` always did.

    The countries listed come from the notebook-level country_set. A country without
    any test document is printed with every average equal to 0.
    """
    features = X_test if X is None else X
    # Materialise the true labels once instead of once per probability value.
    true_labels = list(y_test if y is None else y)
    label_order = list(model.classes_)

    # probability_sums[true_country][predicted_label] -> summed predicted probability
    probability_sums = {c: Counter({other: 0 for other in country_set}) for c in country_set}
    # number of test documents per true country (the divisor for the averages)
    docs_per_country = Counter()
    for true_label, probabilities in zip(true_labels, model.predict_proba(features)):
        docs_per_country[true_label] += 1
        row_sums = probability_sums[true_label]
        for label, probability in zip(label_order, probabilities):
            row_sums[label] += probability

    for country, sums in probability_sums.items():
        n_docs = docs_per_country[country]
        averages = Counter({
            label: (total / n_docs if n_docs else 0)
            for label, total in sums.items()
        })
        # Rank every label; no fixed cut-off, so the true country is always found
        # regardless of how many countries there are.
        ranking = averages.most_common()
        print(f"{country}", end=" ")
        for rank, (label, _) in enumerate(ranking, start=1):
            if label == country:
                print(f"({rank}):", end=" ")
                break
        for label, prob in ranking:
            print(label, f"{prob:.3f}", end=", ")
        print()
    print()
    

In [156]:
def average_tries(model, X_test, y_test):
    """
    Prints, per country, the average number of guesses needed to reach the true label
    when the model's labels are tried from most to least probable.

    model:  fitted classifier exposing predict_proba() and classes_.
    X_test: test features accepted by model.predict_proba().
    y_test: true labels, one per row of X_test.

    For every test document the class probabilities are sorted in descending order
    (ties keep the order of model.classes_) and the 1-based position of the true label
    is recorded. Countries are printed from fewest to most average tries, followed by
    the unweighted mean over countries. Only countries that have at least one test
    document contribute, so a country missing from the test split no longer causes a
    division by zero.
    """
    class_labels = list(model.classes_)
    true_labels = list(y_test)  # materialised once, not once per row

    tries_total = Counter()   # summed rank of the true label, per country
    tests_total = Counter()   # number of test documents, per country
    for probabilities, true_label in zip(model.predict_proba(X_test).tolist(), true_labels):
        ranked = sorted(zip(class_labels, probabilities), key=lambda item: item[1], reverse=True)
        for try_count, (country_code, _) in enumerate(ranked, start=1):
            if country_code == true_label:
                tries_total[country_code] += try_count
                tests_total[country_code] += 1
                break

    mean_tries = {country: tries_total[country] / tests_total[country] for country in tests_total}

    for country, try_count in sorted(mean_tries.items(), key=lambda item: item[1]):
        print(country, f"{try_count:.1f} tries", f"({tests_total[country]} tests)")

    if mean_tries:
        print("Average number of tries:", sum(mean_tries.values()) / len(mean_tries))
    else:
        print("Average number of tries: n/a (no test documents)")

In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression on the current split, using the built-in 'balanced' class weights.
# NOTE(review): the saved output of this cell shows lbfgs stopping at the iteration
# limit, so the reported numbers come from a model that has not converged.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    solver='lbfgs',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(model)
average_tries(model, X_test, y_test)

Accuracy: 0.28
KE (10): US 0.435, GB 0.230, CA 0.116, GH 0.070, JM 0.054, AU 0.027, NG 0.025, NZ 0.008, IE 0.008, KE 0.005, LK 0.005, PK 0.003, ZA 0.003, BD 0.003, MY 0.002, TZ 0.002, PH 0.002, HK 0.001, IN 0.001, SG 0.001, 
NG (2): GB 0.200, NG 0.157, CA 0.115, KE 0.100, NZ 0.097, TZ 0.078, GH 0.061, US 0.043, PK 0.041, IE 0.019, AU 0.017, IN 0.015, ZA 0.013, HK 0.012, PH 0.009, JM 0.008, SG 0.007, LK 0.004, MY 0.003, BD 0.002, 
ZA (5): GB 0.175, CA 0.163, NG 0.146, AU 0.124, ZA 0.122, IE 0.067, US 0.060, MY 0.041, NZ 0.033, PK 0.012, BD 0.010, TZ 0.009, SG 0.008, LK 0.008, IN 0.006, GH 0.006, HK 0.004, JM 0.003, KE 0.003, PH 0.001, 
IE (1): IE 0.322, GB 0.163, US 0.142, PH 0.098, LK 0.072, IN 0.039, TZ 0.033, CA 0.033, ZA 0.028, AU 0.018, NZ 0.009, JM 0.008, PK 0.006, KE 0.005, HK 0.005, BD 0.004, NG 0.004, GH 0.004, SG 0.004, MY 0.003, 
HK (1): HK 0.304, GB 0.161, JM 0.148, MY 0.147, AU 0.105, CA 0.048, US 0.033, SG 0.013, NZ 0.007, PH 0.007, IE 0.007, LK 0.006, IN 0.005, NG 0.004, 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [158]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the current split, using the built-in 'balanced' class weights.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=3)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(rf_model)
average_tries(rf_model, X_test, y_test)

Accuracy: 0.35
KE (3): GB 0.255, US 0.215, KE 0.080, CA 0.062, AU 0.052, IE 0.050, BD 0.028, TZ 0.028, IN 0.025, ZA 0.022, GH 0.022, PK 0.022, JM 0.020, LK 0.020, HK 0.018, NZ 0.018, MY 0.018, NG 0.017, SG 0.015, PH 0.013, 
NG (3): GB 0.207, US 0.183, NG 0.161, CA 0.077, AU 0.049, IN 0.049, HK 0.039, IE 0.036, NZ 0.029, KE 0.027, PK 0.023, ZA 0.021, TZ 0.021, LK 0.017, MY 0.017, SG 0.016, GH 0.011, PH 0.009, BD 0.006, JM 0.003, 
ZA (8): US 0.272, GB 0.213, CA 0.078, AU 0.062, IN 0.057, IE 0.038, NZ 0.037, ZA 0.032, PH 0.027, HK 0.025, SG 0.023, LK 0.020, KE 0.018, GH 0.018, NG 0.017, MY 0.015, TZ 0.013, PK 0.013, BD 0.012, JM 0.010, 
IE (2): GB 0.233, IE 0.207, US 0.205, AU 0.065, CA 0.045, IN 0.039, ZA 0.027, NZ 0.025, SG 0.024, HK 0.016, BD 0.015, TZ 0.015, MY 0.015, LK 0.014, PK 0.011, KE 0.010, PH 0.009, JM 0.009, GH 0.009, NG 0.007, 
HK (4): US 0.227, GB 0.220, CA 0.083, HK 0.068, AU 0.068, IE 0.048, NZ 0.033, PK 0.033, SG 0.028, JM 0.028, BD 0.025, IN 0.025, TZ 0.020, PH 0.020, M

In [159]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron on the current split (constructed without class weights).
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(mlp_model)
average_tries(mlp_model, X_test, y_test)

Accuracy: 0.32
KE (8): US 0.514, GB 0.437, CA 0.031, AU 0.008, IN 0.003, NZ 0.002, IE 0.002, KE 0.001, BD 0.001, TZ 0.000, JM 0.000, PK 0.000, PH 0.000, MY 0.000, ZA 0.000, HK 0.000, NG 0.000, GH 0.000, SG 0.000, LK 0.000, 
NG (17): GB 0.485, CA 0.202, US 0.173, IN 0.038, NZ 0.014, AU 0.014, JM 0.012, PH 0.007, TZ 0.007, BD 0.006, MY 0.006, HK 0.006, KE 0.005, LK 0.005, ZA 0.005, IE 0.004, NG 0.004, SG 0.003, PK 0.003, GH 0.001, 
ZA (5): US 0.334, CA 0.232, AU 0.157, GB 0.132, ZA 0.040, IE 0.022, NZ 0.014, BD 0.013, IN 0.012, MY 0.009, PH 0.008, HK 0.007, TZ 0.007, JM 0.005, KE 0.004, PK 0.002, LK 0.001, NG 0.000, SG 0.000, GH 0.000, 
IE (1): IE 0.345, GB 0.249, US 0.195, CA 0.054, BD 0.019, AU 0.018, PH 0.018, TZ 0.017, IN 0.015, NZ 0.015, LK 0.010, JM 0.009, HK 0.007, PK 0.006, KE 0.006, MY 0.004, ZA 0.003, GH 0.003, SG 0.003, NG 0.002, 
HK (2): US 0.343, HK 0.279, IN 0.132, AU 0.068, GB 0.053, CA 0.035, PH 0.022, SG 0.013, JM 0.010, MY 0.009, KE 0.008, NZ 0.006, BD 0.005, TZ 0.005, 

In [160]:
from imblearn.under_sampling import RandomUnderSampler

# Balanced class weights still leave the larger countries heavily favored, so try
# random undersampling: every country is cut down to the size of the smallest one
# in the training split. The test split is left untouched.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, test_size=0.1
)
rus = RandomUnderSampler(random_state=3)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_resampled))

Resampled class distribution: Counter({'AU': 26, 'BD': 26, 'CA': 26, 'GB': 26, 'GH': 26, 'HK': 26, 'IE': 26, 'IN': 26, 'JM': 26, 'KE': 26, 'LK': 26, 'MY': 26, 'NG': 26, 'NZ': 26, 'PH': 26, 'PK': 26, 'SG': 26, 'TZ': 26, 'US': 26, 'ZA': 26})


In [161]:
# Logistic regression trained on the undersampled (equal-sized) training set and
# evaluated on the untouched test split.
model = LogisticRegression(max_iter=500, solver='lbfgs')
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(model)
average_tries(model, X_test, y_test)

Accuracy: 0.15
KE (8): CA 0.264, IE 0.108, GH 0.096, NG 0.081, US 0.072, JM 0.068, AU 0.066, KE 0.040, ZA 0.039, PK 0.031, NZ 0.022, BD 0.021, TZ 0.020, MY 0.018, LK 0.017, PH 0.013, GB 0.010, IN 0.006, SG 0.005, HK 0.005, 
NG (1): NG 0.171, NZ 0.139, KE 0.137, CA 0.115, PK 0.095, GB 0.067, TZ 0.065, AU 0.050, HK 0.023, IN 0.022, ZA 0.019, PH 0.019, JM 0.017, GH 0.017, US 0.014, SG 0.009, IE 0.008, MY 0.006, BD 0.006, LK 0.003, 
ZA (4): NG 0.195, MY 0.147, TZ 0.083, ZA 0.080, IE 0.061, SG 0.059, US 0.056, GB 0.042, LK 0.034, KE 0.034, HK 0.031, NZ 0.030, GH 0.026, CA 0.026, JM 0.022, AU 0.022, IN 0.016, PK 0.014, BD 0.013, PH 0.010, 
IE (2): ZA 0.159, IE 0.147, LK 0.120, PH 0.117, TZ 0.079, GB 0.061, BD 0.048, CA 0.047, NZ 0.029, NG 0.028, JM 0.027, PK 0.023, US 0.022, IN 0.021, GH 0.015, AU 0.014, HK 0.013, SG 0.011, KE 0.011, MY 0.010, 
HK (1): HK 0.305, MY 0.208, JM 0.171, GB 0.094, US 0.063, NZ 0.050, SG 0.028, AU 0.025, PH 0.015, CA 0.014, NG 0.007, KE 0.004, IN 0.004, TZ 0.004, P

In [162]:
# Random forest trained on the undersampled (equal-sized) training set and evaluated
# on the untouched test split.
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_resampled, y_resampled)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(rf_model)
average_tries(rf_model, X_test, y_test)

Accuracy: 0.22
KE (1): KE 0.103, US 0.073, NG 0.068, TZ 0.060, JM 0.060, PH 0.058, CA 0.055, ZA 0.053, SG 0.053, IE 0.052, GH 0.050, BD 0.048, NZ 0.040, IN 0.040, AU 0.038, GB 0.035, LK 0.035, HK 0.033, MY 0.028, PK 0.022, 
NG (1): NG 0.131, GH 0.060, US 0.056, JM 0.053, HK 0.053, MY 0.053, ZA 0.050, IE 0.049, SG 0.049, CA 0.049, NZ 0.047, BD 0.047, KE 0.043, GB 0.043, PK 0.041, TZ 0.040, IN 0.037, AU 0.036, PH 0.036, LK 0.029, 
ZA (8): NZ 0.070, SG 0.063, PH 0.063, AU 0.058, HK 0.057, IE 0.055, KE 0.053, ZA 0.053, US 0.053, IN 0.052, GH 0.048, CA 0.048, TZ 0.047, LK 0.047, GB 0.045, BD 0.042, PK 0.040, MY 0.038, JM 0.035, NG 0.032, 
IE (1): IE 0.108, AU 0.068, ZA 0.068, JM 0.061, CA 0.057, NG 0.054, GB 0.054, IN 0.047, KE 0.046, PH 0.046, LK 0.046, US 0.045, TZ 0.044, PK 0.042, NZ 0.038, BD 0.038, GH 0.037, HK 0.035, MY 0.035, SG 0.031, 
HK (1): HK 0.100, PH 0.085, PK 0.062, AU 0.060, TZ 0.060, IN 0.060, NZ 0.055, SG 0.055, BD 0.053, CA 0.052, GH 0.050, IE 0.045, KE 0.040, US 0.040, G

In [163]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron trained on the undersampled (equal-sized) training set and
# evaluated on the untouched test split.
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_resampled, y_resampled)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(mlp_model)
average_tries(mlp_model, X_test, y_test)

Accuracy: 0.21
KE (9): PH 0.277, TZ 0.222, BD 0.061, CA 0.050, AU 0.046, GH 0.039, US 0.038, NZ 0.037, KE 0.037, IN 0.035, JM 0.034, GB 0.031, HK 0.024, NG 0.020, LK 0.013, MY 0.013, ZA 0.008, SG 0.008, IE 0.006, PK 0.002, 
NG (1): NG 0.232, TZ 0.193, GH 0.135, PH 0.108, JM 0.060, IN 0.047, HK 0.032, NZ 0.032, KE 0.024, CA 0.019, GB 0.019, SG 0.016, US 0.015, MY 0.012, AU 0.012, IE 0.011, BD 0.011, LK 0.009, ZA 0.009, PK 0.003, 
ZA (13): TZ 0.421, IN 0.120, HK 0.071, PH 0.064, US 0.047, MY 0.043, CA 0.042, NZ 0.036, SG 0.032, GB 0.023, JM 0.022, LK 0.017, ZA 0.016, KE 0.015, AU 0.011, BD 0.008, GH 0.005, IE 0.003, NG 0.003, PK 0.001, 
IE (5): PH 0.169, HK 0.124, TZ 0.121, LK 0.119, IE 0.087, IN 0.055, ZA 0.052, JM 0.039, GB 0.038, CA 0.033, BD 0.031, GH 0.023, US 0.021, NZ 0.018, KE 0.015, SG 0.013, AU 0.013, NG 0.012, MY 0.011, PK 0.007, 
HK (1): HK 0.505, US 0.161, PH 0.068, TZ 0.055, MY 0.027, SG 0.024, JM 0.023, GB 0.020, CA 0.019, KE 0.016, NZ 0.014, IE 0.013, IN 0.012, AU 0.011, 

In [164]:
# Class weights based on the number of training documents per country:
#     weight = total_docs / (number_of_countries * docs_for_country)
# which is the formula behind the default "balanced" weights.
# The per-country document counts are tallied once instead of re-counting y_train
# for every country.
train_doc_counts = Counter(y_train)
total_docs = sum(train_doc_counts.values())
weights_dict_doc = Counter({
    country_code: total_docs / (len(country_set) * train_doc_counts[country_code])
    for country_code in country_set
})

print(weights_dict_doc)

Counter({'TZ': 2.855769230769231, 'NG': 2.475, 'JM': 2.475, 'HK': 2.3951612903225805, 'GH': 2.3951612903225805, 'KE': 2.3203125, 'BD': 2.25, 'ZA': 2.1838235294117645, 'SG': 2.1214285714285714, 'MY': 2.1214285714285714, 'PH': 2.0625, 'PK': 2.0067567567567566, 'LK': 1.9038461538461537, 'NZ': 1.16015625, 'IN': 0.9769736842105263, 'IE': 0.9642857142857143, 'CA': 0.720873786407767, 'AU': 0.6346153846153846, 'GB': 0.24029126213592233, 'US': 0.23951612903225805})


In [165]:
# Class weights based on the number of words per country instead of documents:
#     weight = total_words / (number_of_countries * words_for_country)
# country_word_counts is summed over every listed document, not only the training split.
total_words = sum(country_word_counts.values())
weights_dict_word = Counter({
    country_code: total_words / (len(country_set) * country_word_counts[country_code])
    for country_code in country_set
})

print(weights_dict_word)
print(total_words)


Counter({'PH': 3.910410016719866, 'JM': 3.8743401980347127, 'GH': 3.3747532220509626, 'ZA': 2.622619654416637, 'BD': 2.469495572460464, 'KE': 2.4500928549320253, 'SG': 2.444912093133761, 'NG': 2.262075749582344, 'MY': 2.1723032593092966, 'HK': 2.0897236211518155, 'TZ': 2.0851083014203797, 'NZ': 1.4886779214767236, 'LK': 1.3360817179730204, 'IN': 0.9784297599239363, 'IE': 0.9313634718074034, 'PK': 0.7469376719338613, 'CA': 0.6877617376775271, 'AU': 0.4746887497866569, 'GB': 0.3223559981455493, 'US': 0.23762114120382663})
2058127


In [166]:
# Logistic regression with the word-count-based class weights (weights_dict_word).
# NOTE(review): the saved output of this cell shows lbfgs stopping at the iteration
# limit, so the reported numbers come from a model that has not converged.
model = LogisticRegression(
    class_weight=weights_dict_word,
    max_iter=500,
    solver='lbfgs',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(model)
average_tries(model, X_test, y_test)

Accuracy: 0.28
KE (10): US 0.424, GB 0.249, CA 0.110, GH 0.075, JM 0.056, AU 0.024, NG 0.021, IE 0.008, NZ 0.007, KE 0.005, LK 0.004, ZA 0.003, PK 0.003, BD 0.003, MY 0.002, TZ 0.002, PH 0.002, IN 0.001, HK 0.001, SG 0.001, 
NG (2): GB 0.209, NG 0.156, CA 0.114, NZ 0.101, KE 0.099, GH 0.078, TZ 0.067, US 0.042, PK 0.030, IE 0.018, AU 0.016, IN 0.014, ZA 0.013, HK 0.011, PH 0.009, JM 0.008, SG 0.007, LK 0.003, MY 0.003, BD 0.002, 
ZA (5): GB 0.179, CA 0.160, NG 0.146, AU 0.117, ZA 0.103, IE 0.086, US 0.059, MY 0.043, NZ 0.037, BD 0.011, PK 0.010, SG 0.009, TZ 0.009, LK 0.008, IN 0.007, GH 0.006, HK 0.004, JM 0.003, KE 0.003, PH 0.001, 
IE (1): IE 0.320, GB 0.175, US 0.142, PH 0.098, LK 0.070, IN 0.038, CA 0.032, TZ 0.029, ZA 0.028, AU 0.016, NZ 0.009, JM 0.009, PK 0.005, KE 0.005, HK 0.004, BD 0.004, GH 0.004, NG 0.004, SG 0.004, MY 0.003, 
HK (1): HK 0.298, GB 0.170, JM 0.158, MY 0.153, AU 0.097, CA 0.042, US 0.030, SG 0.012, PH 0.007, NZ 0.007, IE 0.007, LK 0.005, IN 0.005, NG 0.004, 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [167]:
# Random forest with the word-count-based class weights (weights_dict_word).
rf_model = RandomForestClassifier(class_weight=weights_dict_word, random_state=3)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(rf_model)
average_tries(rf_model, X_test, y_test)

Accuracy: 0.31
KE (3): GB 0.255, US 0.185, KE 0.077, CA 0.057, AU 0.048, NZ 0.045, NG 0.035, TZ 0.035, IE 0.033, IN 0.032, GH 0.030, ZA 0.030, BD 0.028, SG 0.025, PK 0.022, LK 0.018, JM 0.015, HK 0.013, MY 0.013, PH 0.005, 
NG (3): GB 0.186, US 0.183, NG 0.161, CA 0.063, AU 0.054, IN 0.043, IE 0.040, NZ 0.033, GH 0.031, ZA 0.029, KE 0.021, MY 0.021, LK 0.021, HK 0.020, JM 0.019, PK 0.017, BD 0.016, TZ 0.016, SG 0.014, PH 0.011, 
ZA (11): GB 0.258, US 0.230, AU 0.098, CA 0.070, IN 0.037, SG 0.035, IE 0.028, PH 0.027, KE 0.025, MY 0.025, ZA 0.023, PK 0.023, TZ 0.023, NG 0.020, GH 0.020, NZ 0.015, HK 0.013, BD 0.012, LK 0.010, JM 0.007, 
IE (3): GB 0.241, US 0.202, IE 0.156, AU 0.085, CA 0.060, NZ 0.033, IN 0.032, PK 0.021, MY 0.020, HK 0.017, SG 0.017, ZA 0.016, TZ 0.015, GH 0.015, LK 0.015, PH 0.015, KE 0.012, BD 0.012, JM 0.010, NG 0.007, 
HK (3): GB 0.215, US 0.212, HK 0.073, AU 0.070, CA 0.068, NZ 0.048, IE 0.043, SG 0.043, JM 0.040, IN 0.038, MY 0.028, BD 0.020, PK 0.020, ZA 0.018, 

In [168]:
# Switch the train/test split to the 25%-threshold features (X_25) for the models
# below; the seed and test fraction are the same as for the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_25, y, random_state=3, test_size=0.1
)

In [169]:
# Logistic regression ('balanced' class weights) on the features where words used by
# fewer than 25% of the countries were replaced with <UNK>.
# NOTE(review): the saved output of this cell shows lbfgs stopping at the iteration
# limit, so the reported numbers come from a model that has not converged.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    solver='lbfgs',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(model)
average_tries(model, X_test, y_test)

Accuracy: 0.25
KE (13): US 0.480, GB 0.211, CA 0.135, JM 0.077, AU 0.036, GH 0.019, NG 0.015, IE 0.006, NZ 0.004, PK 0.004, MY 0.003, IN 0.002, KE 0.002, LK 0.002, BD 0.001, SG 0.001, ZA 0.001, PH 0.001, TZ 0.000, HK 0.000, 
NG (2): GB 0.248, NG 0.148, US 0.139, CA 0.121, NZ 0.085, KE 0.077, GH 0.027, TZ 0.026, PK 0.024, IE 0.019, AU 0.016, JM 0.016, IN 0.013, ZA 0.010, HK 0.009, SG 0.006, PH 0.006, MY 0.004, LK 0.003, BD 0.002, 
ZA (8): GB 0.191, NG 0.158, CA 0.155, IE 0.138, AU 0.099, US 0.082, NZ 0.043, ZA 0.040, MY 0.038, TZ 0.011, SG 0.009, BD 0.008, LK 0.006, PK 0.005, IN 0.004, KE 0.004, HK 0.003, GH 0.003, JM 0.002, PH 0.001, 
IE (1): IE 0.237, GB 0.232, US 0.110, PH 0.097, TZ 0.075, ZA 0.049, LK 0.045, IN 0.042, CA 0.025, AU 0.021, JM 0.010, NZ 0.009, KE 0.008, PK 0.007, SG 0.006, BD 0.006, GH 0.006, NG 0.005, MY 0.005, HK 0.004, 
HK (1): HK 0.239, AU 0.219, GB 0.149, MY 0.101, JM 0.100, CA 0.072, US 0.021, SG 0.021, IE 0.020, NZ 0.020, IN 0.015, PH 0.007, LK 0.006, NG 0.004, 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [170]:
# Random forest ('balanced' class weights) on the features where words used by fewer
# than 25% of the countries were replaced with <UNK>.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=3)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(rf_model)
average_tries(rf_model, X_test, y_test)

Accuracy: 0.34
KE (7): GB 0.237, US 0.195, CA 0.085, AU 0.062, GH 0.048, ZA 0.048, KE 0.045, NG 0.030, IE 0.030, NZ 0.028, TZ 0.028, LK 0.025, IN 0.022, PH 0.022, MY 0.020, PK 0.020, BD 0.017, SG 0.015, JM 0.013, HK 0.010, 
NG (3): GB 0.213, US 0.176, NG 0.100, CA 0.066, AU 0.059, IN 0.046, HK 0.041, NZ 0.034, GH 0.029, KE 0.027, ZA 0.027, LK 0.027, IE 0.027, MY 0.026, TZ 0.026, SG 0.019, JM 0.019, BD 0.017, PH 0.013, PK 0.010, 
ZA (9): GB 0.232, US 0.208, AU 0.092, CA 0.082, IN 0.048, SG 0.040, NZ 0.035, KE 0.033, ZA 0.033, IE 0.032, MY 0.030, NG 0.028, JM 0.018, PH 0.018, HK 0.013, GH 0.013, BD 0.012, PK 0.012, TZ 0.010, LK 0.010, 
IE (2): GB 0.218, IE 0.197, US 0.175, AU 0.099, CA 0.050, NZ 0.030, ZA 0.028, SG 0.023, JM 0.023, IN 0.021, MY 0.016, TZ 0.016, NG 0.015, KE 0.015, BD 0.014, LK 0.014, GH 0.013, HK 0.012, PH 0.012, PK 0.009, 
HK (5): GB 0.240, US 0.195, AU 0.068, IE 0.058, HK 0.058, IN 0.055, CA 0.052, NZ 0.043, SG 0.033, MY 0.028, TZ 0.022, KE 0.020, JM 0.020, GH 0.020, P

In [171]:
# Multi-layer perceptron (constructed without class weights) on the features where
# words used by fewer than 25% of the countries were replaced with <UNK>.
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(mlp_model)
average_tries(mlp_model, X_test, y_test)

Accuracy: 0.30
KE (9): US 0.557, GB 0.389, CA 0.034, AU 0.017, IE 0.002, IN 0.000, HK 0.000, BD 0.000, KE 0.000, NZ 0.000, NG 0.000, MY 0.000, LK 0.000, GH 0.000, ZA 0.000, SG 0.000, JM 0.000, PK 0.000, PH 0.000, TZ 0.000, 
NG (10): US 0.291, GB 0.270, CA 0.251, NZ 0.080, IN 0.037, AU 0.011, JM 0.010, MY 0.008, HK 0.008, NG 0.006, PH 0.006, IE 0.005, SG 0.004, BD 0.003, ZA 0.003, GH 0.002, KE 0.001, PK 0.001, LK 0.001, TZ 0.001, 
ZA (7): CA 0.279, US 0.258, AU 0.241, GB 0.127, NG 0.028, MY 0.020, ZA 0.012, NZ 0.012, JM 0.007, KE 0.005, LK 0.002, BD 0.002, HK 0.002, PH 0.001, TZ 0.001, IN 0.001, IE 0.001, SG 0.000, GH 0.000, PK 0.000, 
IE (3): GB 0.299, US 0.284, IE 0.203, AU 0.055, CA 0.053, TZ 0.019, IN 0.012, ZA 0.010, LK 0.008, PH 0.007, BD 0.007, JM 0.007, MY 0.006, HK 0.006, PK 0.006, KE 0.005, NG 0.005, NZ 0.003, GH 0.003, SG 0.003, 
HK (4): US 0.258, AU 0.254, GB 0.171, HK 0.154, MY 0.070, CA 0.039, ZA 0.036, NZ 0.006, PH 0.003, IE 0.002, NG 0.002, JM 0.001, IN 0.001, SG 0.001, 

In [172]:
# Switch the train/test split to the 50%-threshold features (X_50) for the models
# below; the seed and test fraction are the same as for the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_50, y, random_state=3, test_size=0.1
)

In [173]:
# Logistic regression ('balanced' class weights) on the features where words used by
# fewer than 50% of the countries were replaced with <UNK>.
# NOTE(review): the saved output of this cell shows lbfgs stopping at the iteration
# limit, so the reported numbers come from a model that has not converged.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    solver='lbfgs',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(model)
average_tries(model, X_test, y_test)

Accuracy: 0.26
KE (9): US 0.461, CA 0.229, JM 0.141, GB 0.074, GH 0.049, LK 0.015, NG 0.010, AU 0.006, KE 0.004, NZ 0.003, MY 0.002, IE 0.002, BD 0.001, PK 0.001, IN 0.000, SG 0.000, PH 0.000, ZA 0.000, TZ 0.000, HK 0.000, 
NG (3): GB 0.175, CA 0.164, NG 0.143, NZ 0.124, KE 0.107, US 0.092, GH 0.076, TZ 0.060, IE 0.012, LK 0.009, AU 0.007, JM 0.006, IN 0.006, HK 0.004, PH 0.004, ZA 0.004, SG 0.003, MY 0.002, BD 0.001, PK 0.001, 
ZA (8): NG 0.202, CA 0.178, GB 0.169, IE 0.146, NZ 0.122, US 0.076, AU 0.029, ZA 0.023, LK 0.012, MY 0.012, BD 0.011, TZ 0.006, IN 0.004, SG 0.004, GH 0.003, HK 0.002, KE 0.001, PK 0.001, PH 0.000, JM 0.000, 
IE (2): GB 0.236, IE 0.171, TZ 0.098, PH 0.094, LK 0.094, US 0.085, ZA 0.079, IN 0.067, CA 0.021, AU 0.021, JM 0.006, BD 0.004, NZ 0.004, NG 0.004, PK 0.003, GH 0.003, KE 0.003, MY 0.002, SG 0.002, HK 0.002, 
HK (1): HK 0.375, AU 0.194, GB 0.160, JM 0.076, MY 0.065, SG 0.030, CA 0.027, US 0.022, IE 0.021, NZ 0.020, PH 0.004, LK 0.002, KE 0.002, NG 0.002, I

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [174]:
# Random forest ('balanced' class weights) on the features where words used by fewer
# than 50% of the countries were replaced with <UNK>.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=3)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(rf_model)
average_tries(rf_model, X_test, y_test)

Accuracy: 0.31
KE (6): GB 0.230, US 0.180, AU 0.077, CA 0.065, NZ 0.048, KE 0.040, ZA 0.040, LK 0.040, GH 0.038, IE 0.037, PK 0.035, BD 0.033, IN 0.033, JM 0.033, NG 0.022, MY 0.018, PH 0.013, TZ 0.010, SG 0.007, HK 0.003, 
NG (14): US 0.219, GB 0.193, IN 0.069, CA 0.064, AU 0.056, NZ 0.046, KE 0.040, HK 0.034, IE 0.033, MY 0.031, JM 0.027, GH 0.027, LK 0.027, NG 0.026, PK 0.026, ZA 0.024, PH 0.020, SG 0.014, BD 0.014, TZ 0.010, 
ZA (13): US 0.220, GB 0.208, AU 0.088, CA 0.060, IN 0.047, IE 0.042, NG 0.038, NZ 0.032, TZ 0.032, PH 0.030, KE 0.025, GH 0.025, ZA 0.023, HK 0.022, MY 0.022, SG 0.020, BD 0.020, PK 0.020, LK 0.015, JM 0.012, 
IE (1): IE 0.195, GB 0.195, US 0.168, AU 0.083, CA 0.043, IN 0.040, NZ 0.035, JM 0.025, LK 0.025, SG 0.024, TZ 0.023, BD 0.020, GH 0.019, NG 0.018, ZA 0.016, PH 0.016, PK 0.015, MY 0.014, KE 0.013, HK 0.013, 
HK (3): US 0.178, GB 0.170, HK 0.088, AU 0.068, CA 0.065, IN 0.053, IE 0.050, SG 0.045, NZ 0.040, LK 0.035, KE 0.028, BD 0.025, JM 0.023, TZ 0.022,

In [175]:
# Multi-layer perceptron (constructed without class weights) on the features where
# words used by fewer than 50% of the countries were replaced with <UNK>.
# The split on X_50 is redone here with the usual seed and test fraction, so this
# cell does not depend on which split was made last.
X_train, X_test, y_train, y_test = train_test_split(
    X_50, y, random_state=3, test_size=0.1
)
mlp_model = MLPClassifier(random_state=3)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-country probability table, then the average rank of the true country.
probability_breakdown(mlp_model)
average_tries(mlp_model, X_test, y_test)

Accuracy: 0.28
KE (17): US 0.682, GB 0.217, NZ 0.063, CA 0.019, IE 0.005, PH 0.004, ZA 0.003, LK 0.003, PK 0.001, MY 0.001, JM 0.000, NG 0.000, BD 0.000, AU 0.000, IN 0.000, GH 0.000, KE 0.000, HK 0.000, TZ 0.000, SG 0.000, 
NG (19): CA 0.339, GB 0.247, US 0.208, AU 0.099, IN 0.025, NZ 0.019, IE 0.013, JM 0.013, TZ 0.005, MY 0.005, GH 0.005, HK 0.005, SG 0.004, BD 0.003, PH 0.002, ZA 0.002, PK 0.002, KE 0.002, NG 0.001, LK 0.001, 
ZA (14): CA 0.278, US 0.220, GB 0.141, AU 0.123, IN 0.046, NG 0.044, MY 0.038, TZ 0.038, IE 0.018, SG 0.014, PK 0.010, KE 0.009, BD 0.005, ZA 0.004, NZ 0.004, HK 0.002, JM 0.002, PH 0.002, GH 0.001, LK 0.001, 
IE (1): IE 0.339, US 0.155, GB 0.144, CA 0.130, LK 0.060, ZA 0.043, NZ 0.032, AU 0.022, BD 0.021, MY 0.009, TZ 0.007, JM 0.006, NG 0.005, SG 0.005, IN 0.005, KE 0.005, PK 0.004, HK 0.003, GH 0.002, PH 0.002, 
HK (12): AU 0.454, NZ 0.257, US 0.204, GB 0.032, CA 0.009, LK 0.008, SG 0.007, BD 0.006, IE 0.006, PH 0.003, IN 0.002, HK 0.002, PK 0.002, MY 0.00