In [19]:
import pandas as pd

doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# Pulls text IDs, country codes, document types, and word counts from the Excel sheet,
# and uses them to split the corpus files into per-document token lists keyed by text ID.
#   doc_dict:        text_id -> list of lower-cased whitespace tokens
#   id_dict:         "<country>_<doc type>" -> list of text_ids, in spreadsheet order
#   word_count_dict: text_id -> value of the "# words" column
#   country_set:     every country code seen in the sheet
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
source_rows = sources_df[["textID", "country|genre", "# words"]].values.tolist()
for text_id, country_genre, word_count in source_rows:
    # the "country|genre" cell must split on whitespace into exactly two parts
    country_code, doc_type = country_genre.split()
    group_key = f"{country_code}_{doc_type}"
    with open(f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt", 'r',
              encoding="utf-8") as file:
        # add each text_id to id_dict
        id_dict.setdefault(group_key, []).append(text_id)
        # makes country code set
        country_set.add(country_code)
        # finds the document headed by "##<text_id>" and collects every line of it
        marker = f"##{text_id}"
        IS_DOC = False
        for line in file:
            stripped = line.strip()
            if stripped.startswith("##"):
                # A bare prefix test would let ID 12 also claim the document
                # headed "##123", so reject headers whose ID continues with
                # another digit right after the marker.
                next_char = stripped[len(marker):len(marker) + 1]
                IS_DOC = stripped.startswith(marker) and not next_char.isdigit()
            if IS_DOC:
                # the header line itself is kept, so its "##<id>" token is part of the document
                doc_dict.setdefault(text_id, []).extend(w.lower() for w in line.split())
        # adds word count to dictionary
        word_count_dict[text_id] = word_count

In [20]:
import copy
from collections import Counter

# Corpus-wide token frequencies. '<UNK>' is seeded first with a count of zero.
vocab = Counter({'<UNK>': 0})
for tokens in doc_dict.values():
    vocab.update(tokens)

# Per country: the set of distinct tokens it uses, and its total word count
# (summed from the spreadsheet's word counts, not from the token lists).
vocab_sets = {}
country_word_counts = Counter()
for code in country_set:
    code_text_ids = id_dict[f"{code}_B"] + id_dict[f"{code}_G"]
    vocab_sets[code] = set().union(*(doc_dict[tid] for tid in code_text_ids))
    country_word_counts[code] = sum(word_count_dict[tid] for tid in code_text_ids)

# Number of countries whose documents contain each token
countries_using = Counter()
for country_vocab in vocab_sets.values():
    countries_using.update(country_vocab)

# Reduced vocabularies: keep only tokens used by at least 25% / 50% of the countries.
# '<UNK>' is used by no country, so it is dropped here and only reappears
# once a token is actually replaced below.
n_countries = len(country_set)
vocab_25 = Counter({
    token: freq for token, freq in vocab.items()
    if countries_using[token] / n_countries >= 0.25
})
vocab_50 = Counter({
    token: freq for token, freq in vocab.items()
    if countries_using[token] / n_countries >= 0.5
})

# Build one masked copy of the corpus per reduced vocabulary: every token missing
# from that vocabulary becomes '<UNK>', and each replacement is tallied in it.
masked_corpora = []
for kept_vocab in (vocab_25, vocab_50):
    masked_docs = {}
    for tid, tokens in doc_dict.items():
        masked_tokens = []
        for token in tokens:
            if token in kept_vocab:
                masked_tokens.append(token)
            else:
                masked_tokens.append('<UNK>')
                kept_vocab['<UNK>'] += 1
        masked_docs[tid] = masked_tokens
    masked_corpora.append(masked_docs)
doc_dict_25, doc_dict_50 = masked_corpora

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Collect one (full text, 25% text, 50% text, country) row per document,
# visiting each country's "_B" documents and then its "_G" documents.
document_rows = [
    (
        " ".join(doc_dict[tid]),
        " ".join(doc_dict_25[tid]),
        " ".join(doc_dict_50[tid]),
        code,
    )
    for code in country_set
    for tid in id_dict[f"{code}_B"] + id_dict[f"{code}_G"]
]

texts = [row[0] for row in document_rows]
texts_25 = [row[1] for row in document_rows]
texts_50 = [row[2] for row in document_rows]
country_labels = [row[3] for row in document_rows]

data = dict(
    texts=texts,
    texts_25=texts_25,
    texts_50=texts_50,
    country_labels=country_labels,
)

df = pd.DataFrame(data)

# NOTE: the same vectorizer object is re-fitted for every variant, so after this
# cell it only holds the vocabulary learned from 'texts_50'.
vectorizer = CountVectorizer()
X, X_25, X_50 = (
    vectorizer.fit_transform(df[column])
    for column in ("texts", "texts_25", "texts_50")
)
y = df["country_labels"]

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

def train_binary_classifiers(model_type, X_train, y_train, progress_updates=True):
    """
    Trains one binary (this country vs. every other country) classifier per
    country in the module-level ``country_set``.

    Parameters:
        model_type: "rf" (random forest), "mlp" (multi-layer perceptron),
            "lr" (logistic regression) or "nb" (multinomial naive Bayes).
        X_train: training feature matrix, passed unchanged to each model's ``fit``.
        y_train: training country labels; must provide ``tolist()``.
        progress_updates: when True, prints one "." per trained model and a
            final newline.

    Returns:
        dict mapping each country code to its fitted classifier.

    Raises:
        ValueError: if ``model_type`` is not one of the supported codes.
    """
    # Zero-argument factories, so every country gets its own unfitted estimator
    model_factories = {
        "rf": lambda: RandomForestClassifier(random_state=3, class_weight="balanced"),
        "mlp": lambda: MLPClassifier(random_state=3),
        "lr": lambda: LogisticRegression(max_iter=2000, class_weight="balanced"),
        "nb": lambda: MultinomialNB(),
    }
    if model_type not in model_factories:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(model_factories)}"
        )
    make_model = model_factories[model_type]

    # choose classifier and make a dictionary of models for every country
    model_dict = {c: make_model() for c in country_set}

    # The label list does not depend on the country, so convert it once
    labels = y_train.tolist()
    for country in model_dict:
        # 1 for documents from the current country, 0 for every other country
        y_train_binary = [1 if label == country else 0 for label in labels]
        model_dict[country].fit(X_train, y_train_binary)
        if progress_updates:
            print(".", end=" ")
    if progress_updates:
        print()

    return model_dict

In [23]:
def bc_predict_proba(model_dict, test):
    """
    Scores a single sample against every per-country binary model.

    Returns a dict with one entry per country in the module-level
    ``country_set`` (defaulting to 0), where each country present in
    ``model_dict`` is overwritten with ``predict_proba(test)[0][1]`` from its
    model, i.e. the second probability column for the first row.
    """
    scores = dict.fromkeys(country_set, 0)
    scores.update(
        (code, model.predict_proba(test)[0][1])
        for code, model in model_dict.items()
    )
    return scores

In [24]:
def probability_breakdown(model_dict, X_test, y_test, print_output=False):
    """
    Sums, for each true country, the probability that every country's model
    assigned to that country's test documents.

    Parameters:
        model_dict: country code -> fitted binary classifier.
        X_test: iterable of test samples; each one is passed to ``bc_predict_proba``.
        y_test: true country labels in the same order as ``X_test``
            (anything with ``tolist()``, or a plain sequence).
        print_output: when True, prints each true country followed by its five
            highest-scoring countries and returns None.

    Returns:
        When ``print_output`` is False, a dict mapping each true country to a
        Counter of summed probabilities per candidate country.
    """
    # Convert the labels once rather than once per (document, country) pair
    true_labels = y_test.tolist() if hasattr(y_test, "tolist") else list(y_test)

    # makes a dictionary of probability Counters for every country label for each country's documents
    most_probable_country = {
        true_country: Counter(dict.fromkeys(country_set, 0))
        for true_country in country_set
    }
    for test_num, test in enumerate(X_test):
        totals = most_probable_country[true_labels[test_num]]
        for country, probability in bc_predict_proba(model_dict, test).items():
            totals[country] += probability

    # Either returns or prints the results
    if print_output:
        for country, totals in most_probable_country.items():
            # Joined outside the f-string: reusing the same quote type inside an
            # f-string expression is a syntax error before Python 3.12.
            top_five = " ".join(name for name, _ in totals.most_common(5))
            print(f"{country}: {top_five}")
        return None
    return most_probable_country

In [25]:
def average_accuracy(y_pred, y_test):
    """
    Computes the accuracy for each country separately.

    Parameters:
        y_pred: predicted country labels, indexable by position.
        y_test: true country labels in the same order as ``y_pred``.

    Returns:
        dict mapping every country in the module-level ``country_set`` to the
        fraction of its test documents that were predicted correctly. A country
        with no test documents gets 0.0 instead of raising ZeroDivisionError.
    """
    country_test_count = Counter(y_test)
    correct_counts = {c: 0 for c in country_set}
    for i, country in enumerate(y_test):
        if country == y_pred[i]:
            correct_counts[country] += 1

    country_accuracies = {}
    for country, n_correct in correct_counts.items():
        n_tests = country_test_count[country]
        # a country absent from the test labels has no defined accuracy; report 0.0
        country_accuracies[country] = n_correct / n_tests if n_tests else 0.0

    return country_accuracies

In [26]:
def average_tries(model_dict, X_test, y_test, print_output=False):
    """
    Measures how many guesses, taken in order of decreasing predicted
    probability, are needed to reach each test document's true country.

    Parameters:
        model_dict: country code -> fitted binary classifier.
        X_test: iterable of test samples; each one is passed to ``bc_predict_proba``.
        y_test: true country labels in the same order as ``X_test``
            (anything with ``tolist()``, or a plain sequence).
        print_output: when True, prints the per-country averages (fewest tries
            first) and the overall average, and returns None.

    Returns:
        When ``print_output`` is False, a tuple ``(avg_tries, country_test_count)``:
        ``avg_tries`` is the mean of the per-country average try counts (every
        tested country weighs the same, whatever its number of documents), and
        ``country_test_count`` is a Counter of test documents per country.
        Countries with no test documents are left out of the averages; if no
        country has any, ``avg_tries`` is 0.0.
    """
    # Convert the labels once rather than once per test document
    true_labels = y_test.tolist() if hasattr(y_test, "tolist") else list(y_test)
    country_test_count = Counter(true_labels)

    # Ranks each probability distribution from highest to lowest probability
    # and records the number of tries needed to get to the right label
    country_try_total = {c: 0 for c in country_set}
    for i, test in enumerate(X_test):
        ranked = sorted(
            bc_predict_proba(model_dict, test).items(),
            key=lambda item: item[1],
            reverse=True,
        )
        for try_count, (country_code, _) in enumerate(ranked, start=1):
            if country_code == true_labels[i]:
                country_try_total[country_code] += try_count
                break

    # Averages the country try count by number of tests
    country_try_count = {
        country: total / country_test_count[country]
        for country, total in country_try_total.items()
        if country_test_count[country]
    }
    # Computes overall average try count across the tested countries
    if country_try_count:
        avg_tries = sum(country_try_count.values()) / len(country_try_count)
    else:
        avg_tries = 0.0

    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
        return None
    return avg_tries, country_test_count

In [27]:
def test_binary_classifiers(model_dict, X_test, y_test):
    """
    Evaluates the per-country binary classifiers on the test split and prints
    a report.

    Each document is assigned the country whose model gives the highest
    ``predict_proba(...)[0][1]``. The report shows overall accuracy, the value
    returned by ``average_tries``, and for each country (best accuracy first)
    its accuracy, its number of test documents, and the five countries with the
    highest summed probability from ``probability_breakdown``.

    Parameters:
        model_dict: country code -> fitted binary classifier.
        X_test: iterable of test samples.
        y_test: true country labels in the same order; must provide ``tolist()``.

    Returns:
        None; results are printed.
    """
    # Convert the labels once rather than once per test document
    true_labels = y_test.tolist()

    results = []
    y_pred = []
    for i, test in enumerate(X_test):
        probabilities = Counter({
            country: model.predict_proba(test)[0][1]
            for country, model in model_dict.items()
        })
        best_country = probabilities.most_common(1)[0][0]
        results.append(int(best_country == true_labels[i]))
        y_pred.append(best_country)

    most_probable_country = probability_breakdown(model_dict, X_test, y_test)
    avg_tries, country_test_count = average_tries(model_dict, X_test, y_test)
    country_accuracy = average_accuracy(y_pred, y_test)

    print(f"Accuracy: {sum(results)/len(results)}")
    print(f"Average tries: {avg_tries}")
    for country, accuracy in sorted(country_accuracy.items(), key=lambda item: item[1], reverse=True):
        most_similar_five = " ".join(
            [candidate for candidate, _ in most_probable_country[country].most_common(5)]
        )
        print(f"{country}: {accuracy*100:4.1f}% accuracy", f"({country_test_count[country]:3d} tests) | {most_similar_five}")

In [28]:
def train_test_binary_classifiers(model_type, dataset = "full"):
    """
    Splits one version of the dataset 80/20 (stratified by country, fixed
    seed), trains the per-country binary classifiers and prints their
    evaluation.

    Parameters:
        model_type: classifier code forwarded to ``train_binary_classifiers``.
        dataset: which module-level feature matrix to use: "full" (``X``),
            "25%" (``X_25``) or "50%" (``X_50``).

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """
    if dataset == "full":
        features = X
    elif dataset == "25%":
        features = X_25
    elif dataset == "50%":
        features = X_50
    else:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'full', '25%' or '50%'"
        )

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=3, stratify=y
    )

    model_dict = train_binary_classifiers(model_type, X_train, y_train)
    test_binary_classifiers(model_dict, X_test, y_test)

In [29]:
# Logistic regression on the full, 25%-filtered and 50%-filtered datasets
for dataset_name in ("full", "25%", "50%"):
    train_test_binary_classifiers("lr", dataset_name)

. . . . . . . . . . . . . . . . . . . . 
Accuracy: 0.3333333333333333
Average tries: 7.185838630323924
US: 52.9% accuracy ( 68 tests) | US GB CA NZ IN
GB: 50.0% accuracy ( 68 tests) | GB US AU NG IN
IE: 38.9% accuracy ( 18 tests) | IE GB US PH CA
LK: 37.5% accuracy (  8 tests) | LK GB US NG NZ
TZ: 33.3% accuracy (  6 tests) | TZ KE HK IN IE
JM: 28.6% accuracy (  7 tests) | US ZA JM NZ SG
HK: 28.6% accuracy (  7 tests) | HK CA GB JM US
BD: 28.6% accuracy (  7 tests) | BD GB IN CA PK
AU: 23.1% accuracy ( 26 tests) | US AU GB NZ IN
NZ: 21.4% accuracy ( 14 tests) | GB NZ NG US CA
IN: 17.6% accuracy ( 17 tests) | GB IN US IE TZ
CA: 17.4% accuracy ( 23 tests) | GB US CA AU NZ
KE: 14.3% accuracy (  7 tests) | GB KE US PH IE
GH: 14.3% accuracy (  7 tests) | US GB KE CA AU
NG: 14.3% accuracy (  7 tests) | GH IN ZA GB AU
ZA: 12.5% accuracy (  8 tests) | GB AU US ZA CA
SG: 12.5% accuracy (  8 tests) | SG IN US PH GB
PK: 11.1% accuracy (  9 tests) | US PK GB LK IN
PH:  0.0% accuracy (  8 tests) | 

In [30]:
# Random forest on the full, 25%-filtered and 50%-filtered datasets
for dataset_name in ("full", "25%", "50%"):
    train_test_binary_classifiers("rf", dataset_name)

. . . . . . . . . . . . . . . . . . . . 
Accuracy: 0.27575757575757576
Average tries: 7.782404876666385
US: 61.8% accuracy ( 68 tests) | US GB CA AU IN
GB: 61.8% accuracy ( 68 tests) | GB US AU CA IE
SG: 12.5% accuracy (  8 tests) | GB US SG AU CA
IN: 11.8% accuracy ( 17 tests) | GB US IN CA AU
IE: 11.1% accuracy ( 18 tests) | GB US IE AU CA
CA:  4.3% accuracy ( 23 tests) | US GB CA AU IE
AU:  3.8% accuracy ( 26 tests) | GB US AU CA IE
KE:  0.0% accuracy (  7 tests) | GB US AU CA IE
ZA:  0.0% accuracy (  8 tests) | US GB AU CA NZ
PK:  0.0% accuracy (  9 tests) | GB US IN AU PK
TZ:  0.0% accuracy (  6 tests) | US GB CA AU IN
PH:  0.0% accuracy (  8 tests) | GB US AU CA NZ
GH:  0.0% accuracy (  7 tests) | GB US AU CA NZ
LK:  0.0% accuracy (  8 tests) | GB US IE IN LK
NG:  0.0% accuracy (  7 tests) | US GB CA AU NG
NZ:  0.0% accuracy ( 14 tests) | GB US AU CA NZ
JM:  0.0% accuracy (  7 tests) | US GB NZ AU CA
HK:  0.0% accuracy (  7 tests) | US GB AU CA HK
BD:  0.0% accuracy (  7 tests) |

In [31]:
# Multi-layer perceptron on the full, 25%-filtered and 50%-filtered datasets
for dataset_name in ("full", "25%", "50%"):
    train_test_binary_classifiers("mlp", dataset_name)

. . . . . . . . . . . . . . . . . . . . 
Accuracy: 0.3484848484848485
Average tries: 7.263232561698035
GB: 69.1% accuracy ( 68 tests) | GB US AU IN NZ
US: 57.4% accuracy ( 68 tests) | US GB AU CA HK
LK: 37.5% accuracy (  8 tests) | LK US GB AU IE
HK: 28.6% accuracy (  7 tests) | CA IN US AU HK
BD: 28.6% accuracy (  7 tests) | BD US ZA GB CA
AU: 26.9% accuracy ( 26 tests) | US GB AU IE CA
IE: 22.2% accuracy ( 18 tests) | GB IE US AU CA
TZ: 16.7% accuracy (  6 tests) | US TZ NG GB ZA
GH: 14.3% accuracy (  7 tests) | GB US AU GH NZ
NZ: 14.3% accuracy ( 14 tests) | GB US NZ AU IN
CA: 13.0% accuracy ( 23 tests) | GB US CA BD IE
SG: 12.5% accuracy (  8 tests) | SG US GB AU LK
IN: 11.8% accuracy ( 17 tests) | GB IN US AU CA
PK: 11.1% accuracy (  9 tests) | GB PK LK IN CA
KE:  0.0% accuracy (  7 tests) | GB US KE IE NZ
ZA:  0.0% accuracy (  8 tests) | GB AU NZ US IN
PH:  0.0% accuracy (  8 tests) | GB AU PK US LK
NG:  0.0% accuracy (  7 tests) | GB AU US TZ CA
JM:  0.0% accuracy (  7 tests) | 