In [33]:
import pandas as pd

# Lookup tables filled from the sources spreadsheet and the corpus files:
#   doc_dict        text ID -> lower-cased tokens of that document
#   id_dict         "<country>_<genre>" -> text IDs listed under that key
#   word_count_dict text ID -> word count taken from the spreadsheet
#   country_set     every country code seen in the spreadsheet
doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# Lines of each corpus file, keyed by path, so a file is read from disk once
# instead of once per spreadsheet row.
corpus_lines = {}

# pulls text IDs, country codes, document types, and word counts from the excel sheet,
# using it to divide the documents into a dictionary by ID
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
for text_id, country_genre, word_count in sources_df[["textID", "country|genre", "# words"]].values.tolist():
    # "country|genre" must hold exactly two whitespace-separated parts
    country_code, doc_type = country_genre.split()
    corpus_path = f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt"
    if corpus_path not in corpus_lines:
        with open(corpus_path, 'r', encoding="utf-8") as file:
            corpus_lines[corpus_path] = file.readlines()

    # add each text_id to id_dict
    id_dict.setdefault(f"{country_code}_{doc_type}", []).append(text_id)
    # makes country code set
    country_set.add(country_code)

    # finds correct text_id and adds every line in the document to the dictionary
    header = f"##{text_id}"
    IS_DOC = False
    for line in corpus_lines[corpus_path]:
        stripped = line.strip()
        if stripped.startswith("##"):
            # A "##" line starts a new document. Require the ID to end right
            # after the header: a bare prefix test would let ID 12 also
            # swallow the document headed "##123".
            next_char = stripped[len(header):len(header) + 1]
            IS_DOC = stripped.startswith(header) and not next_char.isdigit()
        if IS_DOC:
            # the header line itself is kept, as before
            doc_dict.setdefault(text_id, []).extend(w.lower() for w in line.split())

    # adds word count to dictionary
    word_count_dict[text_id] = word_count

In [34]:
import copy
from collections import Counter

# Corpus-wide word frequencies; '<UNK>' is registered first with a zero count
vocab = Counter({})
vocab['<UNK>'] = 0
for doc in doc_dict.values():
    vocab.update(doc)

# Per country: the set of distinct words used in its "B" and "G" documents,
# and the total of the spreadsheet word counts of those documents
vocab_sets = {}
country_word_counts = Counter()
for country_code in country_set:
    country_ids = id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]
    vocab_sets[country_code] = {word for text_id in country_ids for word in doc_dict[text_id]}
    country_word_counts[country_code] = sum(word_count_dict[text_id] for text_id in country_ids)

# Filtered vocabularies: drop every word used by fewer than 25% / 50% of the countries
vocab_25 = vocab.copy()
vocab_50 = vocab.copy()
for word in vocab:
    COUNTRY_COUNT = sum(word in vocab_sets[country_code] for country_code in country_set)
    country_share = COUNTRY_COUNT / len(country_set)
    if country_share < 0.25:
        del vocab_25[word]
    if country_share < 0.5:
        del vocab_50[word]

# Copy of the documents where words outside vocab_25 become '<UNK>';
# every replacement is tallied under vocab_25['<UNK>']
doc_dict_25 = copy.deepcopy(doc_dict)
for tokens in doc_dict_25.values():
    for position, token in enumerate(tokens):
        if token in vocab_25:
            continue
        tokens[position] = '<UNK>'
        vocab_25['<UNK>'] += 1

# Same replacement against the stricter 50% vocabulary
doc_dict_50 = copy.deepcopy(doc_dict)
for tokens in doc_dict_50.values():
    for position, token in enumerate(tokens):
        if token in vocab_50:
            continue
        tokens[position] = '<UNK>'
        vocab_50['<UNK>'] += 1

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# List every (country, text ID) pair once so all columns below share one row order
labelled_ids = [
    (country_code, text_id)
    for country_code in country_set
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]
]

# One space-joined string per document for each version of the dataset
texts = [" ".join(doc_dict[text_id]) for _, text_id in labelled_ids]
texts_25 = [" ".join(doc_dict_25[text_id]) for _, text_id in labelled_ids]
texts_50 = [" ".join(doc_dict_50[text_id]) for _, text_id in labelled_ids]
country_labels = [country_code for country_code, _ in labelled_ids]

data = dict(
    texts=texts,
    texts_25=texts_25,
    texts_50=texts_50,
    country_labels=country_labels,
)

df = pd.DataFrame(data)

# The same vectorizer object is refit for each text column, so after this
# cell `vectorizer` holds the fit from df['texts_50'] only.
vectorizer = CountVectorizer()
X, X_25, X_50 = (
    vectorizer.fit_transform(df[column])
    for column in ('texts', 'texts_25', 'texts_50')
)
y = df['country_labels']

In [36]:
def probability_breakdown(model, X_test, y_test):
    """
    Sums the predicted class probabilities of the test documents, grouped by true label.

    Takes a fitted model exposing `predict_proba` and `classes_`, the test
    features, and the true labels (any iterable, e.g. a Series or a list).
    Returns a dictionary mapping every country in `country_set` to a Counter
    of {predicted label: summed probability over that country's test documents}.
    """
    # Materialise the labels once; rebuilding the list for every probability
    # made the loop quadratic in the number of test documents.
    true_labels = list(y_test)
    class_labels = model.classes_

    most_probable_country = {
        country: Counter({other: 0 for other in country_set})
        for country in country_set
    }
    for true_label, probabilities in zip(true_labels, model.predict_proba(X_test)):
        totals = most_probable_country[true_label]
        # probabilities[i] belongs to model.classes_[i]
        for class_label, probability in zip(class_labels, probabilities):
            totals[class_label] += probability

    return most_probable_country

In [37]:
def average_accuracy(y_pred, y_test):
    """
    Computes the fraction of correctly predicted test documents per country.

    Takes the predicted labels and the true labels, in the same order.
    Returns a dictionary mapping every country in `country_set` to
    correct / total for that country's test documents. A country with no
    test documents gets 0.0 instead of raising ZeroDivisionError.
    """
    country_test_count = Counter(y_test)
    correct_count = Counter()
    # Pair labels positionally so the result does not depend on how
    # y_test or y_pred happen to be indexed.
    for actual, predicted in zip(y_test, y_pred):
        if actual == predicted:
            correct_count[actual] += 1

    country_accuracies = {}
    for country in country_set:
        tested = country_test_count[country]
        country_accuracies[country] = correct_count[country] / tested if tested else 0.0

    return country_accuracies

In [38]:
def average_tries(model, X_test, y_test, print_output=False):
    """
    Takes the model and testing data.

    For every test document the class labels are ranked by predicted
    probability (highest first, ties keep the order of `model.classes_`);
    the "tries" for that document is the 1-based rank of its true label.

    With print_output=False, returns a tuple (avg_tries, country_test_count):
    the mean of the per-country average tries, and a Counter of the number
    of tests per country. With print_output=True, prints the per-country
    averages and the overall average instead and returns None.

    Countries in `country_set` without any test document are left out of
    the averages rather than causing a ZeroDivisionError.
    """
    class_labels = model.classes_.tolist()
    probability_rows = model.predict_proba(X_test).tolist()
    # Materialise the labels once instead of rebuilding the list per row
    true_labels = list(y_test)

    # Ranks each probability distribution from highest to lowest probability
    # and records the number of tries needed to reach the right label
    country_try_count = {c: 0 for c in country_set}
    country_test_count = Counter(true_labels)
    for probabilities, true_label in zip(probability_rows, true_labels):
        ranked = sorted(zip(class_labels, probabilities), key=lambda item: item[1], reverse=True)
        for try_count, (country_code, _) in enumerate(ranked, start=1):
            if country_code == true_label:
                country_try_count[country_code] += try_count
                break

    # Averages the country try count by number of tests (tested countries only)
    country_try_count = {
        country: total / country_test_count[country]
        for country, total in country_try_count.items()
        if country_test_count[country]
    }
    # Computes the overall average as the mean of the per-country averages
    if country_try_count:
        avg_tries = sum(country_try_count.values()) / len(country_try_count)
    else:
        avg_tries = float("nan")

    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
    else:
        return avg_tries, country_test_count

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import random

def average_performance(model, X_dataset, y_dataset = y, random_states=10, undersampling=False, scrambling=False):
    """
    Takes the model, dataset, and number of random test splits to test.
    Prints average accuracy and tries between all test splits of the model.

    For each r in range(random_states) the data is split 80/20 with
    random_state=r, stratified on the labels, and `model` is refit on the
    training part. Nothing is returned; results are printed.

    undersampling: if True, the training part of every split is resampled
        with RandomUnderSampler(random_state=3) before fitting.
    scrambling: if True, the labels are shuffled once (fixed seed) before
        splitting, so the features no longer correspond to their labels.
    """
    accuracies = []
    tries = []
    total_country_test_count = {c: 0 for c in country_set}
    avg_country_accuracies = {c: 0 for c in country_set}
    avg_most_probable_country = {c: Counter({other: 0 for other in country_set}) for c in country_set}

    if scrambling:
        # Shuffle with a private, seeded generator. The previous
        # `random.seed = 3` replaced the seed function with an int instead of
        # seeding, so the shuffle differed from run to run.
        y_scrambled = y_dataset.tolist()
        random.Random(3).shuffle(y_scrambled)
        y_dataset = pd.Series(y_scrambled)

    # trains models on the specified number of random splits
    for r in range(random_states):
        X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.2, random_state=r, stratify=y_dataset)
        # undersampling (training data only; the test split is left as is)
        if undersampling:
            X_train, y_train = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
        model.fit(X_train, y_train)
        # computes accuracies
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        country_accuracies = average_accuracy(y_pred, y_test)
        # computes overall and country try counts
        avg_tries, country_test_count = average_tries(model, X_test, y_test)
        tries.append(avg_tries)
        # computes similar countries
        most_probable_country = probability_breakdown(model, X_test, y_test)
        for country in country_set:
            avg_country_accuracies[country] += country_accuracies[country]
            total_country_test_count[country] += country_test_count[country]
            avg_most_probable_country[country] += most_probable_country[country]
        # one dot per finished split as a progress indicator
        print(".",end=" ")
    print()

    print("Average Accuracy:", np.mean(accuracies))
    print("Averaged Average tries:", np.mean(tries))
    # Averages the per-country accuracies over the splits and prints them, best first
    avg_country_accuracies = {country: total / random_states for country, total in avg_country_accuracies.items()}
    for country, accuracy in sorted(avg_country_accuracies.items(), key=lambda item: item[1], reverse=True):
        # the five labels that received the most summed probability for this country's documents
        most_similar_five = " ".join([other for other, _ in avg_most_probable_country[country].most_common(5)])
        print(f"{country}: {accuracy*100:4.1f}% accuracy", f"({total_country_test_count[country]:3d} tests) | {most_similar_five}")

In [40]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with balanced class weights, evaluated on the
# unfiltered, 25%-filtered and 50%-filtered feature matrices in turn
model = LogisticRegression(max_iter=2000, class_weight='balanced')
for feature_matrix in (X, X_25, X_50):
    average_performance(model, feature_matrix)

. . . . . . . . . . 
Average Accuracy: 0.30454545454545456
Averaged Average tries: 7.225151297977385
US: 46.5% accuracy (680 tests) | US GB CA AU IN
GB: 45.6% accuracy (680 tests) | GB US AU CA IN
AU: 26.9% accuracy (260 tests) | AU GB US IE CA
IE: 25.6% accuracy (180 tests) | IE US GB CA AU
PK: 24.4% accuracy ( 90 tests) | PK US IN GB AU
JM: 22.9% accuracy ( 70 tests) | GB JM US CA NZ
LK: 21.2% accuracy ( 80 tests) | LK GB US AU CA
IN: 21.2% accuracy (170 tests) | GB IN US CA NZ
HK: 20.0% accuracy ( 70 tests) | HK US GB CA AU
NZ: 18.6% accuracy (140 tests) | GB NZ US AU IN
GH: 18.6% accuracy ( 70 tests) | GH GB US KE AU
BD: 18.6% accuracy ( 70 tests) | BD GB IE IN US
CA: 18.3% accuracy (230 tests) | US GB CA AU IE
KE: 15.7% accuracy ( 70 tests) | US KE GB CA AU
TZ: 15.0% accuracy ( 60 tests) | GB TZ US KE IN
SG: 13.8% accuracy ( 80 tests) | US SG GB IN PH
NG: 12.9% accuracy ( 70 tests) | GB US NG AU CA
PH: 11.2% accuracy ( 80 tests) | GB US AU CA SG
ZA: 10.0% accuracy ( 80 tests) | US

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with balanced class weights, evaluated on the
# unfiltered, 25%-filtered and 50%-filtered feature matrices in turn
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
for feature_matrix in (X, X_25, X_50):
    average_performance(rf_model, feature_matrix)

. . . . . . . . . . 
Average Accuracy: 0.3418181818181818
Averaged Average tries: 6.370949687567334
US: 59.6% accuracy (680 tests) | US GB CA AU IN
GB: 53.8% accuracy (680 tests) | GB US AU CA IN
NG: 51.4% accuracy ( 70 tests) | US GB NG AU CA
LK: 42.5% accuracy ( 80 tests) | GB US LK IN AU
IE: 41.1% accuracy (180 tests) | GB US IE AU CA
GH: 28.6% accuracy ( 70 tests) | US GB GH AU CA
BD: 28.6% accuracy ( 70 tests) | US GB BD AU IN
PK: 23.3% accuracy ( 90 tests) | GB US PK CA IN
NZ: 22.9% accuracy (140 tests) | US GB NZ AU CA
JM: 20.0% accuracy ( 70 tests) | US GB JM CA AU
IN: 18.8% accuracy (170 tests) | GB US IN AU CA
AU: 16.9% accuracy (260 tests) | US GB AU CA NZ
MY: 10.0% accuracy ( 70 tests) | US GB MY AU CA
SG:  7.5% accuracy ( 80 tests) | US GB SG AU CA
HK:  7.1% accuracy ( 70 tests) | US GB HK AU CA
TZ:  3.3% accuracy ( 60 tests) | US GB AU IE CA
CA:  3.0% accuracy (230 tests) | US GB CA AU IN
ZA:  2.5% accuracy ( 80 tests) | US GB AU CA ZA
KE:  1.4% accuracy ( 70 tests) | US 

In [42]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron, evaluated on the unfiltered, 25%-filtered
# and 50%-filtered feature matrices in turn
mlp_model = MLPClassifier(random_state=3)
for feature_matrix in (X, X_25, X_50):
    average_performance(mlp_model, feature_matrix)

. . . . . . . . . . 
Average Accuracy: 0.32757575757575763
Averaged Average tries: 7.338455831608004
US: 61.2% accuracy (680 tests) | US GB CA AU IN
GB: 52.8% accuracy (680 tests) | GB US CA AU IN
CA: 31.3% accuracy (230 tests) | US CA GB AU IN
AU: 24.6% accuracy (260 tests) | US AU GB CA IE
IN: 18.8% accuracy (170 tests) | GB US IN CA AU
PK: 16.7% accuracy ( 90 tests) | GB US PK CA IN
JM: 15.7% accuracy ( 70 tests) | US CA GB JM AU
NG: 15.7% accuracy ( 70 tests) | GB AU US NG CA
IE: 13.9% accuracy (180 tests) | GB US IE CA AU
NZ: 13.6% accuracy (140 tests) | GB US NZ CA AU
LK: 12.5% accuracy ( 80 tests) | GB US LK CA AU
HK: 11.4% accuracy ( 70 tests) | US GB HK CA AU
SG: 10.0% accuracy ( 80 tests) | US GB CA AU SG
PH: 10.0% accuracy ( 80 tests) | GB US CA PH AU
BD: 10.0% accuracy ( 70 tests) | GB US CA BD AU
MY:  7.1% accuracy ( 70 tests) | US GB CA MY AU
GH:  7.1% accuracy ( 70 tests) | GB US AU CA GH
ZA:  3.8% accuracy ( 80 tests) | US GB CA AU JM
KE:  2.9% accuracy ( 70 tests) | US

In [43]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression without class weights; the training split is
# undersampled instead. Run on each of the three feature matrices.
model = LogisticRegression(max_iter=2000)
for feature_matrix in (X, X_25, X_50):
    average_performance(model, feature_matrix, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.14242424242424243
Averaged Average tries: 7.687717055607081
HK: 25.7% accuracy ( 70 tests) | HK MY JM IE TZ
LK: 25.0% accuracy ( 80 tests) | LK CA TZ PH KE
BD: 24.3% accuracy ( 70 tests) | BD MY KE JM IN
JM: 24.3% accuracy ( 70 tests) | JM ZA NZ PH IN
PK: 23.3% accuracy ( 90 tests) | PK IN HK MY NG
GH: 21.4% accuracy ( 70 tests) | GH KE NG TZ PH
TZ: 18.3% accuracy ( 60 tests) | TZ KE GH AU IE
SG: 17.5% accuracy ( 80 tests) | SG TZ PH IN NZ
KE: 17.1% accuracy ( 70 tests) | KE PH TZ JM ZA
NG: 15.7% accuracy ( 70 tests) | NG TZ GB GH KE
IE: 15.0% accuracy (180 tests) | IE PH GB ZA CA
MY: 14.3% accuracy ( 70 tests) | MY GH SG GB TZ
IN: 14.1% accuracy (170 tests) | IN GB CA BD ZA
ZA: 13.8% accuracy ( 80 tests) | ZA JM US GH IE
GB: 12.9% accuracy (680 tests) | GB NZ CA PH IE
NZ: 12.1% accuracy (140 tests) | PH NZ GB JM IE
US: 11.3% accuracy (680 tests) | CA US GB PH IE
AU: 11.2% accuracy (260 tests) | AU JM PH NG GB
CA: 10.4% accuracy (230 tests) | CA

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest without class weights; the training split is
# undersampled instead. Run on each of the three feature matrices.
rf_model = RandomForestClassifier(random_state=3)
for feature_matrix in (X, X_25, X_50):
    average_performance(rf_model, feature_matrix, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.22787878787878788
Averaged Average tries: 6.6156307166121735
NG: 70.0% accuracy ( 70 tests) | NG ZA KE GB JM
LK: 55.0% accuracy ( 80 tests) | LK IN GB PH CA
IE: 50.0% accuracy (180 tests) | IE PH GB CA ZA
JM: 47.1% accuracy ( 70 tests) | JM PH GB ZA CA
GH: 45.7% accuracy ( 70 tests) | GH TZ JM ZA PH
BD: 42.9% accuracy ( 70 tests) | BD PK GB CA IE
PK: 37.8% accuracy ( 90 tests) | PK CA GB BD IN
NZ: 32.1% accuracy (140 tests) | NZ PH GB AU SG
HK: 27.1% accuracy ( 70 tests) | HK SG PH AU TZ
SG: 26.2% accuracy ( 80 tests) | SG PH JM GB US
AU: 25.8% accuracy (260 tests) | AU GB PH CA US
GB: 17.8% accuracy (680 tests) | GB PH CA US JM
PH: 16.2% accuracy ( 80 tests) | PH SG GB ZA CA
ZA: 15.0% accuracy ( 80 tests) | ZA GB US PH IN
CA: 14.8% accuracy (230 tests) | CA GB US PH ZA
KE: 14.3% accuracy ( 70 tests) | KE TZ ZA GB GH
MY: 14.3% accuracy ( 70 tests) | MY GB PH AU US
IN: 12.4% accuracy (170 tests) | IN GB LK BD PH
TZ: 10.0% accuracy ( 60 tests) | T

In [45]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron with an undersampled training split,
# run on each of the three feature matrices
mlp_model = MLPClassifier(random_state=3)
for feature_matrix in (X, X_25, X_50):
    average_performance(mlp_model, feature_matrix, undersampling=True)

. . . . . . . . . . 
Average Accuracy: 0.1696969696969697
Averaged Average tries: 7.565531432193836
JM: 52.9% accuracy ( 70 tests) | JM PH HK CA TZ
HK: 38.6% accuracy ( 70 tests) | HK TZ JM US PH
TZ: 38.3% accuracy ( 60 tests) | TZ JM KE CA US
LK: 37.5% accuracy ( 80 tests) | LK JM TZ HK PH
PH: 27.5% accuracy ( 80 tests) | PH JM TZ CA GB
NG: 27.1% accuracy ( 70 tests) | NG TZ JM PH HK
IN: 24.1% accuracy (170 tests) | IN JM PH US TZ
SG: 23.8% accuracy ( 80 tests) | SG PH JM HK US
CA: 18.3% accuracy (230 tests) | PH CA JM US GB
US: 16.3% accuracy (680 tests) | US PH JM CA TZ
GB: 14.9% accuracy (680 tests) | JM PH GB TZ US
PK: 14.4% accuracy ( 90 tests) | PK JM TZ PH GB
NZ: 13.6% accuracy (140 tests) | JM PH NZ TZ CA
KE: 12.9% accuracy ( 70 tests) | TZ PH KE JM US
BD: 12.9% accuracy ( 70 tests) | PH JM BD TZ CA
GH: 11.4% accuracy ( 70 tests) | TZ JM PH GH IN
IE:  7.2% accuracy (180 tests) | JM PH TZ HK GB
ZA:  6.2% accuracy ( 80 tests) | JM TZ PH US HK
AU:  3.8% accuracy (260 tests) | JM 

In [46]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression (balanced class weights) fitted on scrambled labels,
# run on each of the three feature matrices.
# NOTE: the saved output of this cell contains "TOTAL NO. of ITERATIONS
# REACHED LIMIT" warnings, i.e. some of these fits stopped at max_iter.
model = LogisticRegression(max_iter=2000, class_weight='balanced')
for feature_matrix in (X, X_25, X_50):
    average_performance(model, feature_matrix, scrambling=True)

. . . . . . . . . . 
Average Accuracy: 0.1296969696969697
Averaged Average tries: 10.494141754806718
GB: 30.7% accuracy (680 tests) | GB US AU CA IN
US: 23.1% accuracy (680 tests) | GB US CA AU IN
JM:  8.6% accuracy ( 70 tests) | US GB CA AU JM
CA:  6.5% accuracy (230 tests) | GB US AU CA NZ
IE:  6.1% accuracy (180 tests) | US GB AU IE CA
IN:  5.3% accuracy (170 tests) | GB US CA AU IN
BD:  4.3% accuracy ( 70 tests) | US GB AU CA IE
AU:  4.2% accuracy (260 tests) | GB US CA AU IN
ZA:  3.8% accuracy ( 80 tests) | GB US AU IE PK
PK:  2.2% accuracy ( 90 tests) | GB US AU IN GH
NG:  1.4% accuracy ( 70 tests) | US GB AU IE CA
NZ:  0.7% accuracy (140 tests) | GB US AU IN CA
KE:  0.0% accuracy ( 70 tests) | GB US AU CA IE
HK:  0.0% accuracy ( 70 tests) | US GB CA IN IE
MY:  0.0% accuracy ( 70 tests) | GB US AU IN CA
LK:  0.0% accuracy ( 80 tests) | US GB AU IE NZ
SG:  0.0% accuracy ( 80 tests) | GB US CA HK AU
TZ:  0.0% accuracy ( 60 tests) | US GB IN CA AU
GH:  0.0% accuracy ( 70 tests) | US

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. . . 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


. . . . 
Average Accuracy: 0.1290909090909091
Averaged Average tries: 10.654153955060284
GB: 26.5% accuracy (680 tests) | GB US AU CA NZ
US: 25.6% accuracy (680 tests) | GB US AU CA IE
IE:  8.9% accuracy (180 tests) | US GB AU IE CA
AU:  6.9% accuracy (260 tests) | US GB CA AU IN
BD:  5.7% accuracy ( 70 tests) | US GB CA IE JM
CA:  5.7% accuracy (230 tests) | US GB AU IN CA
PK:  5.6% accuracy ( 90 tests) | GB US AU IN CA
IN:  4.1% accuracy (170 tests) | GB US AU NZ CA
MY:  2.9% accuracy ( 70 tests) | GB US AU NZ IE
NZ:  2.1% accuracy (140 tests) | US GB AU CA IE
TZ:  1.7% accuracy ( 60 tests) | GB US AU BD PK
HK:  1.4% accuracy ( 70 tests) | US AU IE GB CA
GH:  1.4% accuracy ( 70 tests) | GB US AU CA NZ
NG:  1.4% accuracy ( 70 tests) | US CA AU GB IN
KE:  0.0% accuracy ( 70 tests) | GB US AU HK CA
LK:  0.0% accuracy ( 80 tests) | US GB CA AU PH
SG:  0.0% accuracy ( 80 tests) | GB US CA IE NZ
JM:  0.0% accuracy ( 70 tests) | GB US CA AU HK
ZA:  0.0% accuracy ( 80 tests) | US GB AU CA IN

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest (balanced class weights) fitted on scrambled labels,
# run on each of the three feature matrices
rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
for feature_matrix in (X, X_25, X_50):
    average_performance(rf_model, feature_matrix, scrambling=True)

. . . . . . . . . . 
Average Accuracy: 0.21454545454545454
Averaged Average tries: 10.507599990943978
US: 57.9% accuracy (680 tests) | US GB AU CA IE
GB: 45.7% accuracy (680 tests) | US GB AU CA IE
CA:  1.3% accuracy (230 tests) | US GB AU CA IE
KE:  0.0% accuracy ( 70 tests) | US GB AU CA IE
HK:  0.0% accuracy ( 70 tests) | US GB AU CA IN
MY:  0.0% accuracy ( 70 tests) | US GB AU CA IE
LK:  0.0% accuracy ( 80 tests) | US GB AU CA IE
SG:  0.0% accuracy ( 80 tests) | US GB AU CA IE
JM:  0.0% accuracy ( 70 tests) | US GB AU CA IE
TZ:  0.0% accuracy ( 60 tests) | US GB AU CA IE
IE:  0.0% accuracy (180 tests) | US GB AU CA IN
GH:  0.0% accuracy ( 70 tests) | US GB AU CA IN
IN:  0.0% accuracy (170 tests) | US GB AU CA IN
NZ:  0.0% accuracy (140 tests) | US GB AU CA IE
ZA:  0.0% accuracy ( 80 tests) | US GB AU CA IN
PK:  0.0% accuracy ( 90 tests) | US GB AU CA IN
BD:  0.0% accuracy ( 70 tests) | US GB AU CA IE
NG:  0.0% accuracy ( 70 tests) | US GB AU CA IN
AU:  0.0% accuracy (260 tests) | U

In [48]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron fitted on scrambled labels,
# run on each of the three feature matrices
mlp_model = MLPClassifier(random_state=3)
for feature_matrix in (X, X_25, X_50):
    average_performance(mlp_model, feature_matrix, scrambling=True)

. . . . . . . . . . 
Average Accuracy: 0.15969696969696973
Averaged Average tries: 10.19422930348282
US: 34.7% accuracy (680 tests) | GB US AU CA IN
GB: 34.3% accuracy (680 tests) | US GB AU CA IN
AU:  9.6% accuracy (260 tests) | US GB AU CA IN
CA:  7.0% accuracy (230 tests) | US GB AU CA IN
NZ:  3.6% accuracy (140 tests) | US GB AU CA IN
IN:  2.9% accuracy (170 tests) | US GB AU CA IN
IE:  1.7% accuracy (180 tests) | US GB AU CA IN
HK:  1.4% accuracy ( 70 tests) | US GB AU PH IE
JM:  1.4% accuracy ( 70 tests) | GB US AU CA LK
PH:  1.2% accuracy ( 80 tests) | GB US AU CA PH
PK:  1.1% accuracy ( 90 tests) | US GB AU CA HK
KE:  0.0% accuracy ( 70 tests) | GB US AU CA IN
MY:  0.0% accuracy ( 70 tests) | GB US AU CA IN
LK:  0.0% accuracy ( 80 tests) | US GB AU MY CA
SG:  0.0% accuracy ( 80 tests) | US GB AU CA IE
TZ:  0.0% accuracy ( 60 tests) | GB US AU CA IE
GH:  0.0% accuracy ( 70 tests) | GB US AU CA IN
ZA:  0.0% accuracy ( 80 tests) | US GB AU CA IN
BD:  0.0% accuracy ( 70 tests) | US