In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')

# English stop words held in a set; the data-loading cell uses it for membership
# tests to keep only stop-word tokens from each document.
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Inspection only: dump the stop-word set. A set is unordered, so the printed
# order is not meaningful.
print(STOPWORDS)

{"you'll", 'ma', 'how', 'can', 'you', 'won', 'am', 'whom', 'were', 'most', "hasn't", 'up', 'been', 'yourself', 'hers', 'under', 'o', 'didn', 'off', 'with', 'before', 'to', 'had', 'or', 'as', 's', 'about', 'its', 'me', 'and', 'yours', 'through', 'will', 'against', 'because', 'own', "mustn't", "isn't", 'he', "that'll", 'have', 'itself', 'at', 'd', 'few', "weren't", 'hadn', 'from', "wasn't", 'm', 'no', 'for', 'nor', 'shouldn', 'doesn', "you're", 'we', 'an', 'both', 'that', 'further', 'be', 'mightn', "she's", 'by', 'over', 'more', 'her', 'down', 'll', "won't", 'has', 'it', 'there', 'ourselves', 'ain', "you'd", 'do', 'wouldn', 'not', "hadn't", "haven't", 'himself', 'did', 'their', 'should', 'now', 'all', 'y', 'until', 'your', 'where', "it's", 'are', 'ours', 'him', 'into', 'here', 're', 'does', 'them', "couldn't", 'while', 'then', 'below', 'shan', 't', 'what', 'isn', 'herself', 'above', 'only', "wouldn't", "shan't", 'aren', 'theirs', 'too', 'yourselves', 'our', 'having', 'in', 'a', 'she', 'b

In [None]:
import pandas as pd

# Lookup tables filled by the loop below.
doc_dict = {}         # text ID -> list of lower-cased stop-word tokens, in document order
id_dict = {}          # "<country>_<genre>" -> list of text IDs from the spreadsheet
word_count_dict = {}  # text ID -> "# words" value from the spreadsheet
country_set = set()   # every country code seen in the spreadsheet

# Lines of each corpus file, cached by path so a file shared by many texts is read
# from disk once instead of once per spreadsheet row. Released at the end of the cell.
corpus_lines_cache = {}

# Pull text IDs, "<country> <genre>" codes and word counts from the spreadsheet and use
# them to slice each corpus file into per-document token lists.
# NOTE: everything here is read from /content/drive, so the Drive-mounting cell (which
# appears further down in this notebook) has to be run before this one.
sources_df = pd.read_excel("/content/drive/MyDrive/text/sampleSources.xlsx", sheet_name="texts")
for text_id, country_genre, word_count in sources_df[["textID", "country|genre", "# words"]].values.tolist():
    # "country|genre" must hold exactly two whitespace-separated tokens.
    country_code, doc_type = country_genre.split()

    path = f"/content/drive/MyDrive/text/w_{country_code.lower()}_{doc_type.lower()}.txt"
    if path not in corpus_lines_cache:
        with open(path, 'r', encoding="utf-8") as file:
            corpus_lines_cache[path] = file.readlines()

    id_dict.setdefault(f"{country_code}_{doc_type}", []).append(text_id)
    country_set.add(country_code)

    # A document starts at the line beginning with "##<text_id>" and runs until the
    # next line beginning with "##".
    marker = f"##{text_id}"
    in_doc = False
    for line in corpus_lines_cache[path]:
        stripped = line.strip()
        if stripped.startswith("##"):
            # Match the ID in full: a bare startswith(marker) would also accept
            # "##123..." when looking for text 12 and merge foreign documents in.
            # Assumes the ID in a header is followed by a non-alphanumeric character
            # (e.g. whitespace) or end of line -- TODO confirm against the corpus files.
            next_char = stripped[len(marker):len(marker) + 1]
            in_doc = stripped.startswith(marker) and not next_char.isalnum()
        if in_doc:
            # Keep only stop words; the header token and any markup are dropped
            # because they are not in STOPWORDS.
            doc_dict.setdefault(text_id, []).extend(
                token for token in (raw.lower() for raw in line.split()) if token in STOPWORDS
            )

    word_count_dict[text_id] = word_count

del corpus_lines_cache

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The data-loading cell reads its spreadsheet and
# corpus files from /content/drive/MyDrive/text, so this cell must be run before it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Debug check: `word_count` is the loop variable left over from the data-loading loop,
# i.e. the "# words" value of the last spreadsheet row that loop processed.
print(word_count)

683


In [None]:
import copy
from collections import Counter

# Corpus-wide frequency of every token. The <UNK> placeholder is registered first
# with a count of zero.
vocab = Counter({'<UNK>': 0})
vocab.update(word for doc in doc_dict.values() for word in doc)

# Per country: the set of distinct tokens found in its documents (both the "_B" and
# "_G" ID lists) and its total word count, summed from the spreadsheet figures.
vocab_sets = {country_code: set() for country_code in country_set}
country_word_counts = Counter(dict.fromkeys(country_set, 0))
for country_code in country_set:
    country_text_ids = id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]
    for text_id in country_text_ids:
        vocab_sets[country_code].update(doc_dict[text_id])
        country_word_counts[country_code] += word_count_dict[text_id]


def _country_coverage(word):
    """Fraction of the countries in country_set whose token set contains `word`."""
    return sum(word in vocab_sets[code] for code in country_set) / len(country_set)


def _replace_rare_words(documents, kept_vocab):
    """
    Return a copy of `documents` in which every token missing from `kept_vocab`
    is replaced by '<UNK>'; each replacement is also tallied in kept_vocab['<UNK>'].
    """
    masked = {}
    for doc_id, tokens in documents.items():
        masked_tokens = []
        for token in tokens:
            if token not in kept_vocab:
                token = '<UNK>'
                kept_vocab['<UNK>'] += 1
            masked_tokens.append(token)
        masked[doc_id] = masked_tokens
    return masked


# Reduced vocabularies: keep only the words that occur in at least 25% / 50% of the
# countries' datasets. (<UNK> itself occurs in no country, so it is dropped here and
# re-created by the replacement step below.)
coverage = {word: _country_coverage(word) for word in vocab}
vocab_25 = Counter({word: count for word, count in vocab.items() if not coverage[word] < 0.25})
vocab_50 = Counter({word: count for word, count in vocab.items() if not coverage[word] < 0.5})

# Document variants in which words below the 25% / 50% coverage threshold are
# replaced with the <UNK> token; doc_dict itself is left untouched.
doc_dict_25 = _replace_rare_words(doc_dict, vocab_25)
doc_dict_50 = _replace_rare_words(doc_dict, vocab_50)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build one row per document: its text ID, the space-joined token text for each
# vocabulary variant (full, 25% and 50% country-coverage thresholds) and its country label.
text_ids = []
texts = []
texts_25 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        # Fixed: this used to be text_ids.append(text_ids), which appended the list to
        # itself and left the 'text_id' column full of self-references.
        text_ids.append(text_id)
        texts.append(" ".join(doc_dict[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

data = {
    'text_id': text_ids,
    'texts': texts,
    'texts_25': texts_25,
    'texts_50': texts_50,
    'country_labels': country_labels
}

df = pd.DataFrame(data)

# Bag-of-words *count* matrices (CountVectorizer counts occurrences; it is not one-hot).
# The same vectorizer object is refit for each variant, so after this cell `vectorizer`
# only holds the vocabulary learned from df['texts_50'].
# NOTE(review): with CountVectorizer's defaults, tokens are lower-cased and must be at
# least two word characters long, with punctuation acting as a separator. That would
# drop one-letter stop words ('a', 'i', 's', 't', ...), split contractions such as
# "you'll", and turn '<UNK>' into 'unk' -- confirm this is intended.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texts'])
X_25 = vectorizer.fit_transform(df['texts_25'])
X_50 = vectorizer.fit_transform(df['texts_50'])
y = df['country_labels']

In [None]:
def probability_breakdown(model, X_test, y_test):
    """
    Sum the predicted class probabilities of the test documents, grouped by true country.

    Parameters:
        model: fitted classifier exposing predict_proba() and classes_.
        X_test: feature matrix passed straight to model.predict_proba().
        y_test: true country labels, one per row of X_test; must provide tolist().

    Returns:
        dict mapping each country in country_set to a Counter
        {predicted label: summed probability over that country's test documents}.
        Labels from country_set start at 0, so they are present even if never predicted.
    """
    most_probable_country = {c: Counter({c: 0 for c in country_set}) for c in country_set}

    # Materialise the label list once; rebuilding it inside the inner loop made the
    # function quadratic in the number of test documents.
    true_labels = y_test.tolist()
    class_labels = model.classes_

    for true_label, probabilities in zip(true_labels, model.predict_proba(X_test)):
        totals = most_probable_country[true_label]
        for class_label, probability in zip(class_labels, probabilities):
            totals[class_label] += probability

    return most_probable_country

In [None]:
def average_tries(model, X_test, y_test, print_output=False):
    """
    Measure how many guesses the model needs to reach the right country.

    For every test document the class labels are ranked by predicted probability
    (highest first, ties kept in model.classes_ order); the 1-based rank of the true
    label is the number of "tries" for that document.

    Parameters:
        model: fitted classifier exposing predict_proba() and classes_.
        X_test: feature matrix passed straight to model.predict_proba().
        y_test: true country labels, one per row of X_test; must provide tolist().
        print_output: if True, print the results and return None instead of returning them.

    Returns (when print_output is False):
        avg_tries: unweighted mean of the per-country averages (NaN if there are none).
        country_try_count: dict country -> average tries over that country's test documents.
            Countries of country_set without any test document are left out, because
            no average can be computed for them.
        country_test_count: Counter country -> number of test documents.

    NOTE(review): a test document whose true label is missing from model.classes_ adds
    no tries but is still counted in country_test_count, which lowers that country's average.
    """
    class_labels = model.classes_.tolist()
    # Built once; indexing y_test.tolist() per row made the loop quadratic.
    true_labels = y_test.tolist()

    country_try_count = {c: 0 for c in country_set}
    country_test_count = Counter(true_labels)

    for true_label, probabilities in zip(true_labels, model.predict_proba(X_test)):
        ranked = sorted(zip(class_labels, probabilities), key=lambda item: item[1], reverse=True)
        for try_count, (country_code, _) in enumerate(ranked, start=1):
            if country_code == true_label:
                country_try_count[country_code] += try_count
                break

    # Average per country, skipping countries absent from this test split
    # (dividing by their zero test count used to raise ZeroDivisionError).
    country_try_count = {
        country: total / country_test_count[country]
        for country, total in country_try_count.items()
        if country_test_count[country] > 0
    }
    # Overall figure: mean of the per-country averages.
    if country_try_count:
        avg_tries = sum(country_try_count.values()) / len(country_try_count)
    else:
        avg_tries = float("nan")

    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
    else:
        return avg_tries, country_try_count, country_test_count

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import random

def average_performance(model, X_dataset, y_dataset = y, random_states=10, undersampling=False, scrambling=False):
    """
    Train and evaluate `model` on several random 80/20 splits and print averaged results.

    Parameters:
        model: scikit-learn style classifier; it is refit in place on every split.
        X_dataset: feature matrix.
        y_dataset: country labels aligned with X_dataset (defaults to the global `y`
            as it was when this function was defined).
        random_states: number of splits; split r uses train_test_split(random_state=r).
        undersampling: if True, balance each training split with
            RandomUnderSampler(random_state=3); the test split is left as is.
        scrambling: if True, shuffle the labels (seed 3) before splitting
            -- presumably as a chance-level baseline; confirm intent.

    Prints the mean accuracy, the mean of the per-split average tries and, per country
    (sorted by average tries), its average tries, total number of test documents and
    the five labels with the highest summed predicted probability. Returns None.
    """
    accuracies = []
    tries = []
    total_country_tries = {c: 0 for c in country_set}
    splits_per_country = Counter()  # number of splits in which a country was evaluated
    total_country_test_count = {c: 0 for c in country_set}
    avg_most_probable_country = {c: Counter({c: 0 for c in country_set}) for c in country_set}

    if scrambling:
        # Fixed: the original `random.seed = 3` overwrote the seed function instead of
        # calling it, so the shuffle was unseeded and random.seed was lost for the rest
        # of the session. A dedicated generator gives the same permutation as
        # random.seed(3) + random.shuffle() without touching global state.
        y_scrambled = y_dataset.tolist()
        random.Random(3).shuffle(y_scrambled)
        y_dataset = pd.Series(y_scrambled)

    # trains models on the specified number of random splits
    for r in range(random_states):
        X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.2, random_state=r)
        if undersampling:
            X_train, y_train = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
        model.fit(X_train, y_train)

        # accuracy on the held-out split
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

        # overall and per-country try counts
        avg_tries, country_try_count, country_test_count = average_tries(model, X_test, y_test)
        tries.append(avg_tries)

        # summed predicted probabilities per true country ("similar countries")
        most_probable_country = probability_breakdown(model, X_test, y_test)
        for country in country_try_count:
            total_country_tries[country] += country_try_count[country]
            splits_per_country[country] += 1
            total_country_test_count[country] += country_test_count[country]
            avg_most_probable_country[country] += most_probable_country[country]
        print(".",end=" ")
    print()

    print("Average Accuracy:", np.mean(accuracies))
    print("Averaged Average tries:", np.mean(tries))

    # Average each country over the splits in which it was actually evaluated; this
    # equals random_states whenever every split reports every country.
    avg_country_try_count = {
        country: total_country_tries[country] / splits_per_country[country]
        for country in total_country_tries
        if splits_per_country[country]
    }
    # Sorts country try counts and prints them in order
    for country, try_count in sorted(avg_country_try_count.items(), key=lambda item: item[1]):
        most_similar_five = " ".join([label for label, _ in avg_most_probable_country[country].most_common(5)])
        print(f"{country}: {try_count:4.1f} tries", f"({total_country_test_count[country]:3d} tests) | {most_similar_five}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class_weight='balanced', evaluated in turn on the full,
# 25%-threshold and 50%-threshold feature matrices.
# NOTE: the recorded output shows the solver hitting the iteration limit on every fit
# even with max_iter=2000, so these models had not converged.
model = LogisticRegression(max_iter=2000, class_weight='balanced')
for feature_matrix in (X, X_25, X_50):
    average_performance(model, feature_matrix)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 
Average Accuracy: 0.08424242424242424
Averaged Average tries: 9.408149052768083
GB:  4.5 tries (683 tests) | GB US CA AU NZ
US:  4.6 tries (656 tests) | US CA GB AU MY
AU:  5.5 tries (257 tests) | AU US NZ GB IE
CA:  6.0 tries (242 tests) | GB CA NZ US AU
IE:  7.0 tries (170 tests) | US CA IE NZ TZ
IN:  7.2 tries (159 tests) | PK IN SG GB US
NZ:  7.7 tries (149 tests) | IN LK GB SG ZA
PK:  8.6 tries ( 97 tests) | PK NG US CA PH
GH:  9.7 tries ( 78 tests) | GH NZ ZA TZ MY
MY: 10.2 tries ( 69 tests) | US SG MY GB GH
SG: 10.3 tries ( 72 tests) | GH SG BD NZ IN
LK: 10.6 tries ( 61 tests) | JM GB IN IE ZA
NG: 10.9 tries ( 71 tests) | HK BD GH AU LK
PH: 11.3 tries ( 75 tests) | TZ IE CA HK KE
TZ: 11.3 tries ( 59 tests) | GH NG NZ GB AU
KE: 11.5 tries ( 77 tests) | KE MY US ZA LK
HK: 12.2 tries ( 89 tests) | IE IN KE SG NZ
ZA: 12.2 tries ( 79 tests) | NG MY US CA IN
BD: 12.4 tries ( 70 tests) | NG IN MY IE LK
JM: 14.4 tries ( 87 tests) | NZ CA PH GB LK


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 
Average Accuracy: 0.08393939393939395
Averaged Average tries: 9.401179116513251
GB:  4.5 tries (683 tests) | GB US CA AU NZ
US:  4.6 tries (656 tests) | US CA GB AU MY
AU:  5.5 tries (257 tests) | AU US NZ GB IE
CA:  6.0 tries (242 tests) | GB CA NZ US AU
IE:  7.0 tries (170 tests) | US CA IE NZ TZ
IN:  7.2 tries (159 tests) | PK IN SG GB US
NZ:  7.7 tries (149 tests) | IN LK US SG ZA
PK:  8.6 tries ( 97 tests) | PK NG US CA SG
GH:  9.7 tries ( 78 tests) | GH NZ ZA TZ MY
MY: 10.2 tries ( 69 tests) | US SG MY GB GH
SG: 10.3 tries ( 72 tests) | GH SG BD NZ AU
LK: 10.5 tries ( 61 tests) | JM GB IE IN ZA
NG: 10.9 tries ( 71 tests) | HK BD GH AU LK
PH: 11.2 tries ( 75 tests) | TZ IE CA HK KE
TZ: 11.3 tries ( 59 tests) | GH NG GB NZ LK
KE: 11.5 tries ( 77 tests) | KE MY US ZA LK
HK: 12.2 tries ( 89 tests) | IE IN KE SG PK
ZA: 12.2 tries ( 79 tests) | NG MY US CA IN
BD: 12.4 tries ( 70 tests) | NG IN MY IE LK
JM: 14.4 tries ( 87 tests) | NZ CA PH LK GB


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. . 
Average Accuracy: 0.08666666666666667
Averaged Average tries: 9.400026712136068
GB:  4.5 tries (683 tests) | GB US CA AU NZ
US:  4.6 tries (656 tests) | US CA GB AU MY
AU:  5.5 tries (257 tests) | AU US NZ GB IE
CA:  6.0 tries (242 tests) | GB CA NZ US IN
IE:  7.0 tries (170 tests) | US CA IE TZ NZ
IN:  7.2 tries (159 tests) | PK IN SG GB JM
NZ:  7.9 tries (149 tests) | LK IN US SG ZA
PK:  8.5 tries ( 97 tests) | PK NG US CA PH
GH:  9.7 tries ( 78 tests) | GH TZ NZ ZA MY
MY: 10.3 tries ( 69 tests) | US SG MY GB GH
SG: 10.3 tries ( 72 tests) | GH SG BD NZ IN
LK: 10.4 tries ( 61 tests) | JM GB IE IN MY
NG: 10.9 tries ( 71 tests) | HK BD GH AU LK
PH: 11.2 tries ( 75 tests) | TZ IE CA HK KE
TZ: 11.3 tries ( 59 tests) | GH NG NZ GB KE
KE: 11.4 tries ( 77 tests) | KE MY US ZA LK
HK: 12.2 tries ( 89 tests) | IE IN KE SG NZ
ZA: 12.3 tries ( 79 tests) | NG MY US CA IN
BD: 12.3 tries ( 70 tests) | NG IN MY IE LK
JM: 14.3 tries ( 87 tests) | NZ CA PH LK GB


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class_weight='balanced' and a fixed seed, evaluated in turn on
# the full, 25%-threshold and 50%-threshold feature matrices.
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
for feature_matrix in (X, X_25, X_50):
    average_performance(rf_model, feature_matrix)

. . . . . . . . . . 
Average Accuracy: 0.21606060606060606
Averaged Average tries: 9.855607542926899
GB:  1.5 tries (683 tests) | GB US AU CA IN
US:  1.6 tries (656 tests) | US GB AU CA IE
AU:  4.4 tries (257 tests) | US GB AU CA IE
CA:  5.6 tries (242 tests) | US GB AU CA IE
IN:  6.8 tries (159 tests) | GB US AU IN CA
IE:  6.9 tries (170 tests) | GB US AU CA IE
NZ:  8.9 tries (149 tests) | GB US AU CA IE
PK: 10.7 tries ( 97 tests) | GB US AU CA PK
GH: 10.9 tries ( 78 tests) | GB US AU CA NZ
LK: 11.4 tries ( 61 tests) | GB US AU CA IE
ZA: 12.1 tries ( 79 tests) | GB US AU CA IE
HK: 12.4 tries ( 89 tests) | US GB AU CA IE
BD: 12.4 tries ( 70 tests) | GB US AU CA IN
KE: 12.4 tries ( 77 tests) | US GB AU CA IE
SG: 12.7 tries ( 72 tests) | GB US AU CA IE
JM: 12.8 tries ( 87 tests) | GB US AU CA IE
PH: 12.8 tries ( 75 tests) | US GB AU CA IE
NG: 13.0 tries ( 71 tests) | GB US AU CA IN
MY: 13.6 tries ( 69 tests) | GB US CA AU IE
TZ: 14.0 tries ( 59 tests) | GB US AU CA IE
. . . . . . . . . .

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 of the balanced random forest on a single 80/20 split.
# Note that this rebinds rf_model and the X_train/X_test/y_train/y_test globals.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Many classes are never predicted, which leaves their precision undefined. Report it
# as 0 explicitly (the same value the default prints) instead of emitting a warning
# per metric as seen in the recorded output.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

          AU       0.29      0.10      0.15        20
          BD       0.00      0.00      0.00        12
          CA       0.00      0.00      0.00        24
          GB       0.20      0.68      0.31        68
          GH       0.00      0.00      0.00         8
          HK       0.00      0.00      0.00         9
          IE       0.00      0.00      0.00        13
          IN       0.00      0.00      0.00        14
          JM       0.00      0.00      0.00         9
          KE       0.00      0.00      0.00        11
          LK       0.00      0.00      0.00         4
          MY       0.00      0.00      0.00         7
          NG       0.00      0.00      0.00         6
          NZ       0.00      0.00      0.00        14
          PH       0.00      0.00      0.00         1
          PK       0.67      0.18      0.29        11
          SG       0.00      0.00      0.00        12
          TZ       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with a fixed seed and otherwise default settings, evaluated
# in turn on the full, 25%-threshold and 50%-threshold feature matrices.
mlp_model = MLPClassifier(random_state=3)
for feature_matrix in (X, X_25, X_50):
    average_performance(mlp_model, feature_matrix)



. 



. 



. . 



. 



. 



. 



. 



. . 
Average Accuracy: 0.17393939393939395
Averaged Average tries: 9.266203398011273
US:  3.8 tries (656 tests) | US GB CA AU IN
GB:  4.4 tries (683 tests) | GB US AU CA NZ
AU:  6.2 tries (257 tests) | US GB AU CA IE
CA:  6.6 tries (242 tests) | GB US CA IE NZ
IN:  7.0 tries (159 tests) | US GB IN CA AU
IE:  8.0 tries (170 tests) | US GB AU CA IE
NZ:  8.6 tries (149 tests) | GB US AU CA IE
PK:  8.9 tries ( 97 tests) | US GB PK CA AU
SG:  9.2 tries ( 72 tests) | US GB SG AU GH
MY: 10.2 tries ( 69 tests) | US GB CA AU MY
KE: 10.6 tries ( 77 tests) | US GB CA KE IN
BD: 10.8 tries ( 70 tests) | GB IN US CA IE
GH: 10.9 tries ( 78 tests) | US AU GB IN NZ
LK: 10.9 tries ( 61 tests) | GB US JM IN CA
ZA: 11.1 tries ( 79 tests) | US GB CA IE AU
PH: 11.2 tries ( 75 tests) | US GB CA AU IN
NG: 11.3 tries ( 71 tests) | US GB AU IN IE
HK: 11.7 tries ( 89 tests) | US GB IE AU NG
TZ: 11.8 tries ( 59 tests) | GB US AU CA IE
JM: 12.2 tries ( 87 tests) | GB US IN AU IE
. 



. 



. . 



. 



. 



. 



. 



. 



. 
Average Accuracy: 0.17181818181818181
Averaged Average tries: 9.421397695002684
GB:  3.5 tries (683 tests) | GB US CA AU IE
US:  3.7 tries (656 tests) | US GB AU CA NZ
CA:  6.2 tries (242 tests) | GB US CA AU IE
AU:  6.3 tries (257 tests) | US GB AU CA IE
IN:  8.0 tries (159 tests) | GB US IN AU CA
IE:  8.6 tries (170 tests) | US GB CA AU IE
SG:  8.7 tries ( 72 tests) | US PH GB GH SG
NZ:  8.7 tries (149 tests) | GB US AU CA IE
PK:  9.1 tries ( 97 tests) | US AU GB PK IN
LK:  9.8 tries ( 61 tests) | GB US AU LK IE
BD:  9.9 tries ( 70 tests) | GB US IN IE NZ
GH: 10.6 tries ( 78 tests) | US AU GB IN SG
ZA: 11.4 tries ( 79 tests) | US GB AU CA IE
HK: 11.5 tries ( 89 tests) | US GB CA AU IE
KE: 11.6 tries ( 77 tests) | US GB CA AU IE
MY: 11.7 tries ( 69 tests) | US GB AU NZ CA
NG: 12.0 tries ( 71 tests) | US GB CA KE AU
TZ: 12.1 tries ( 59 tests) | GB US AU NG IE
JM: 12.5 tries ( 87 tests) | GB US AU NZ CA
PH: 12.7 tries ( 75 tests) | US GB AU CA IN




. 



. 



. 



. 



. 



. 



. 



. 



. . 
Average Accuracy: 0.17454545454545456
Averaged Average tries: 9.26595509064371
US:  3.8 tries (656 tests) | US GB AU CA IN
GB:  4.0 tries (683 tests) | GB US CA AU IE
CA:  6.4 tries (242 tests) | US GB CA AU IN
AU:  6.4 tries (257 tests) | GB US AU CA NZ
IN:  6.6 tries (159 tests) | GB US IN CA IE
IE:  8.1 tries (170 tests) | US GB AU CA IN
NZ:  8.6 tries (149 tests) | GB US AU IE NZ
PK:  8.6 tries ( 97 tests) | US GB PK AU IN
BD:  9.6 tries ( 70 tests) | GB US IN CA IE
LK:  9.6 tries ( 61 tests) | GB US IE IN LK
SG: 10.2 tries ( 72 tests) | GB US SG CA IN
KE: 10.2 tries ( 77 tests) | US GB GH KE CA
HK: 10.3 tries ( 89 tests) | GB US CA IE AU
GH: 10.3 tries ( 78 tests) | US GB AU IN NZ
MY: 11.2 tries ( 69 tests) | US GB NZ AU CA
ZA: 11.2 tries ( 79 tests) | US GB AU CA ZA
PH: 11.8 tries ( 75 tests) | US GB CA AU IE
JM: 12.3 tries ( 87 tests) | GB US CA AU IN
NG: 12.5 tries ( 71 tests) | US GB AU KE IN
TZ: 13.7 tries ( 59 tests) | GB US AU IE NZ




In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with undersampling: one estimator (solver iteration cap
# set to 2000) is evaluated once per feature matrix, in the order X, X_25, X_50.
# NOTE(review): the same estimator instance is handed to every run; presumably
# average_performance refits it from scratch each time -- confirm.
model = LogisticRegression(max_iter=2000)
for lr_feature_matrix in (X, X_25, X_50):
    average_performance(model, lr_feature_matrix, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.07696969696969697
Averaged Average tries: 9.358864171856451
PK:  6.8 tries ( 97 tests) | PK NG SG IN AU
GH:  8.1 tries ( 78 tests) | PH GH TZ NG ZA
TZ:  8.2 tries ( 59 tests) | GH PH NG AU LK
SG:  8.2 tries ( 72 tests) | PH GH BD SG IN
AU:  8.7 tries (257 tests) | AU PH MY US GB
CA:  8.9 tries (242 tests) | CA US NZ HK BD
IN:  9.2 tries (159 tests) | NZ PK IN PH LK
ZA:  9.3 tries ( 79 tests) | MY CA ZA IN NG
HK:  9.5 tries ( 89 tests) | MY NZ KE IN SG
US:  9.5 tries (656 tests) | US GB CA MY AU
NZ:  9.5 tries (149 tests) | JM IE US NZ TZ
PH:  9.6 tries ( 75 tests) | TZ HK CA GB SG
IE:  9.6 tries (170 tests) | BD IN TZ NG US
KE:  9.8 tries ( 77 tests) | KE TZ LK GH ZA
GB:  9.9 tries (683 tests) | AU BD US NZ CA
LK: 10.0 tries ( 61 tests) | JM ZA GH LK IE
MY: 10.1 tries ( 69 tests) | BD US JM NZ CA
NG: 10.1 tries ( 71 tests) | HK BD ZA GH IE
BD: 10.5 tries ( 70 tests) | NG KE JM IN GH
JM: 11.8 tries ( 87 tests) | LK CA PH IE IN
. . . . . . . . . .

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with undersampling: one estimator (seeded with random_state=3)
# is evaluated once per feature matrix, in the order X, X_25, X_50.
# NOTE(review): the same estimator instance is handed to every run; presumably
# average_performance refits it from scratch each time -- confirm.
rf_model = RandomForestClassifier(random_state=3)
for rf_feature_matrix in (X, X_25, X_50):
    average_performance(rf_model, rf_feature_matrix, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.08878787878787879
Averaged Average tries: 9.564896729650242
GH:  7.9 tries ( 78 tests) | GH TZ LK NZ SG
CA:  8.4 tries (242 tests) | CA KE US AU GB
AU:  8.5 tries (257 tests) | AU CA GB MY US
GB:  8.7 tries (683 tests) | GB US CA AU IN
BD:  8.9 tries ( 70 tests) | IN NZ KE BD NG
IN:  8.9 tries (159 tests) | IN AU BD GB US
PK:  9.4 tries ( 97 tests) | PK IN HK ZA AU
KE:  9.5 tries ( 77 tests) | GH KE LK US CA
HK:  9.6 tries ( 89 tests) | US HK NZ NG TZ
NZ:  9.7 tries (149 tests) | BD AU IE NZ SG
IE:  9.7 tries (170 tests) | ZA TZ NZ BD LK
JM:  9.7 tries ( 87 tests) | CA GB IE US JM
PH:  9.7 tries ( 75 tests) | PH US NZ GB IE
US: 10.0 tries (656 tests) | GB CA US AU IN
NG: 10.2 tries ( 71 tests) | BD US KE GH JM
LK: 10.2 tries ( 61 tests) | BD KE IE GH TZ
ZA: 10.3 tries ( 79 tests) | IE ZA HK US CA
SG: 10.4 tries ( 72 tests) | BD SG CA IE AU
TZ: 10.5 tries ( 59 tests) | HK LK ZA MY GH
MY: 11.0 tries ( 69 tests) | AU KE TZ CA JM
. . . . . . . . . .

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron with undersampling: one estimator (seeded with
# random_state=3) is evaluated once per feature matrix, in the order
# X, X_25, X_50.
# NOTE(review): the same estimator instance is handed to every run; presumably
# average_performance refits it from scratch each time -- confirm.
mlp_model = MLPClassifier(random_state=3)
for mlp_feature_matrix in (X, X_25, X_50):
    average_performance(mlp_model, mlp_feature_matrix, undersampling=True)



. 



. 



. 



. 



. 



. 



. 



. 



. 



. 
Average Accuracy: 0.07424242424242425
Averaged Average tries: 9.524997843271251
PK:  8.5 tries ( 97 tests) | PK AU NG SG ZA
BD:  8.5 tries ( 70 tests) | BD NG JM KE GH
IN:  8.6 tries (159 tests) | BD IN NZ SG IE
SG:  8.6 tries ( 72 tests) | GH SG PH NZ IE
KE:  9.0 tries ( 77 tests) | KE GH LK AU TZ
AU:  9.0 tries (257 tests) | CA AU US PH BD
ZA:  9.0 tries ( 79 tests) | US ZA NG JM HK
MY:  9.1 tries ( 69 tests) | BD MY LK JM PH
CA:  9.2 tries (242 tests) | CA US PK TZ HK
GH:  9.3 tries ( 78 tests) | NG AU SG GH NZ
HK:  9.6 tries ( 89 tests) | IE MY HK SG NZ
TZ:  9.8 tries ( 59 tests) | TZ HK JM NZ GB
GB:  9.9 tries (683 tests) | AU BD US TZ MY
NG: 10.0 tries ( 71 tests) | KE GH AU ZA LK
US: 10.0 tries (656 tests) | CA AU US MY IN
IE: 10.1 tries (170 tests) | LK TZ NG AU JM
LK: 10.2 tries ( 61 tests) | JM NG ZA HK GH
NZ: 10.3 tries (149 tests) | BD HK SG TZ JM
JM: 10.7 tries ( 87 tests) | KE PH IE CA HK
PH: 10.9 tries ( 75 tests) | SG NZ ZA KE AU




. 



. 



. 



. 



. 



. 



. 



. 



. 



. 
Average Accuracy: 0.07030303030303031
Averaged Average tries: 9.412643265119119
SG:  6.9 tries ( 72 tests) | PH SG GH NZ IE
GH:  7.2 tries ( 78 tests) | GH IE SG AU CA
PK:  8.3 tries ( 97 tests) | PK NG ZA AU LK
BD:  8.5 tries ( 70 tests) | LK BD KE PH MY
CA:  9.0 tries (242 tests) | CA US PH AU KE
KE:  9.3 tries ( 77 tests) | CA KE PH NZ BD
ZA:  9.3 tries ( 79 tests) | IE ZA MY CA NG
US:  9.4 tries (656 tests) | AU CA US MY NZ
AU:  9.5 tries (257 tests) | CA PH NZ AU GB
NG:  9.6 tries ( 71 tests) | HK BD IN KE TZ
NZ:  9.6 tries (149 tests) | CA PH HK US GH
IN:  9.6 tries (159 tests) | US NG IN PK BD
LK:  9.7 tries ( 61 tests) | JM US BD LK AU
GB:  9.9 tries (683 tests) | US GB BD CA PH
MY:  9.9 tries ( 69 tests) | US LK BD NZ PK
HK: 10.0 tries ( 89 tests) | SG PH NG MY TZ
TZ: 10.3 tries ( 59 tests) | GH NG HK IE PH
IE: 10.4 tries (170 tests) | JM IN LK CA BD
PH: 10.7 tries ( 75 tests) | CA JM NZ KE SG
JM: 11.1 tries ( 87 tests) | PK CA US NZ BD




. 



. 



. 



. 



. 



. 



. 



. 



. . 
Average Accuracy: 0.07909090909090909
Averaged Average tries: 9.436044368122289
PK:  7.8 tries ( 97 tests) | PK ZA NG SG US
CA:  8.4 tries (242 tests) | CA IN US ZA AU
KE:  8.4 tries ( 77 tests) | KE MY GH ZA CA
HK:  8.7 tries ( 89 tests) | MY SG CA AU IE
IN:  8.8 tries (159 tests) | BD IN CA LK NG
GH:  8.9 tries ( 78 tests) | GH KE AU JM TZ
BD:  8.9 tries ( 70 tests) | NZ IN BD PK SG
AU:  9.3 tries (257 tests) | US GB CA AU HK
US:  9.4 tries (656 tests) | MY US CA GB AU
PH:  9.4 tries ( 75 tests) | SG LK CA NZ US
SG:  9.4 tries ( 72 tests) | PH US GH SG NG
LK:  9.4 tries ( 61 tests) | ZA IE GB JM TZ
ZA:  9.5 tries ( 79 tests) | ZA US LK HK JM
TZ:  9.6 tries ( 59 tests) | PH TZ NZ NG HK
IE: 10.0 tries (170 tests) | IN LK GH JM IE
GB: 10.2 tries (683 tests) | GB CA AU US LK
NZ: 10.4 tries (149 tests) | JM SG HK IE PH
MY: 10.4 tries ( 69 tests) | NZ PH LK PK CA
JM: 10.6 tries ( 87 tests) | KE US CA GB ZA
NG: 11.2 tries ( 71 tests) | KE HK BD ZA MY
