In [1]:
import pandas as pd

doc_dict = {}
id_dict = {}
word_count_dict = {}
country_set = set()

# pulls text IDs, country codes, document types, and word counts from the excel sheet,
# using it to divide the documents into a dictionary by ID
sources_df = pd.read_excel("./text/sampleSources.xlsx", sheet_name="texts")
for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    with open(f"./text/w_{country_code.lower()}_{doc_type.lower()}.txt", 'r',
              encoding="utf-8") as file:
        # add each text_id to id_dict
        if f"{country_code}_{doc_type}" not in id_dict:
            id_dict[f"{country_code}_{doc_type}"] = [text_id]
        else:
            id_dict[f"{country_code}_{doc_type}"].append(text_id)
        # makes country code set
        country_set.add(country_code)
        # finds correct text_id and adds every line in the document to the dictionary
        IS_DOC = False
        lines = file.readlines()
        for i, line in enumerate(lines):
            if line.strip().startswith(f"##{text_id}"):
                IS_DOC = True
            elif line.strip().startswith("##"):
                IS_DOC = False
            if IS_DOC:
                if text_id not in doc_dict:
                    doc_dict[text_id] = [w.lower() for w in line.split()]
                else:
                    doc_dict[text_id] += [w.lower() for w in line.split()]
        # adds word count to dictionary
        word_count_dict[text_id] = word_count

In [2]:
import copy
from collections import Counter

# make a counter for every word in the corpus
vocab = Counter({})
vocab['<UNK>'] = 0
for doc in doc_dict.values():
    for word in doc:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

# make a dictionary of sets for every country and record every word used by each country
# also make a word count for every country
vocab_sets = {country_code:set() for country_code in country_set}
country_word_counts = Counter({country_code:0 for country_code in country_set})
for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        for word in doc_dict[text_id]:
            vocab_sets[country_code].add(word)
        country_word_counts[country_code] += word_count_dict[text_id]

# make new vocabulary sets, removing words that appear in less than
# 25% and 50% of the countries datasets
vocab_25 = vocab.copy()
vocab_50 = vocab.copy()
for word in vocab:
    COUNTRY_COUNT = 0
    for country_code in country_set:
        if word in vocab_sets[country_code]:
            COUNTRY_COUNT+=1
    if COUNTRY_COUNT / len(country_set) < 0.25:
        del vocab_25[word]
    if COUNTRY_COUNT / len(country_set) < 0.5:
        del vocab_50[word]

# Replace any words that appear in less than 25% of the countries’ datasets with the <UNK> token
doc_dict_25 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_25.items():
    for i, word in enumerate(doc):
        if word not in vocab_25:
            doc_dict_25[text_id][i] = '<UNK>'
            vocab_25['<UNK>'] += 1

# Replace any words that appear in less than 50% of the countries’ datasets with the <UNK> token
doc_dict_50 = copy.deepcopy(doc_dict)
for text_id, doc in doc_dict_50.items():
    for i, word in enumerate(doc):
        if word not in vocab_50:
            doc_dict_50[text_id][i] = '<UNK>'
            vocab_50['<UNK>'] += 1

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# make a dataframe with each version of the dataset with their country labels
texts = []
texts_25 = []
texts_50 = []
country_labels = []

for country_code in country_set:
    for text_id in id_dict[f"{country_code}_B"] + id_dict[f"{country_code}_G"]:
        texts.append(" ".join(doc_dict[text_id]))
        texts_25.append(" ".join(doc_dict_25[text_id]))
        texts_50.append(" ".join(doc_dict_50[text_id]))
        country_labels.append(country_code)

data = {
    'texts': texts,
    'texts_25': texts_25,
    'texts_50': texts_50,
    'country_labels': country_labels
}

df = pd.DataFrame(data)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texts'])
X_25 = vectorizer.fit_transform(df['texts_25'])
X_50 = vectorizer.fit_transform(df['texts_50'])
y = df['country_labels']

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

def train_binary_classifiers(model_type, X_train, y_train, progress_updates=True):
   # choose classifier and make a dictionary of models for every country
    if model_type == "rf":
        model_dict = {c:RandomForestClassifier(random_state=3, class_weight="balanced") for c in country_set}
    elif model_type == "mlp":
        model_dict = {c:MLPClassifier(random_state=3) for c in country_set}
    elif model_type == "lr":
        model_dict = {c:LogisticRegression(max_iter=2000, class_weight="balanced") for c in country_set}
    elif model_type == "nb":
        model_dict = {c:MultinomialNB() for c in country_set}
    
    for country in model_dict:
        # make a copy of y and replace country labels with 0 or 1 depending on current country
        y_train_binary = y_train.tolist()
        for i, label in enumerate(y_train_binary):
            if label != country:
                y_train_binary[i] = 0
            else:
                y_train_binary[i] = 1
        model_dict[country].fit(X_train, y_train_binary)
        if progress_updates:
            print(".",end=" ")
    if progress_updates:
        print()
        
    return model_dict

In [5]:
def bc_predict_proba(model_dict, test):
    probabilities = {c:0 for c in country_set}
    for country in model_dict:
            probabilities[country] = model_dict[country].predict_proba(test)[0][1]
    return probabilities

In [None]:
def probability_breakdown(model_dict, X_test, y_test, print_output=False):
    # makes a dictionary of probability Counters for every country label for each country's documents
    most_probable_country = {c:Counter({c:0 for c in country_set}) for c in country_set}
    for test_num, test in enumerate(X_test):
        for country, probability in bc_predict_proba(model_dict, test).items():
            most_probable_country[y_test.tolist()[test_num]][country] += probability
    # Either returns or prints the results
    if print_output:
        for country in most_probable_country:
            print(f"{country}: {" ".join([country for country, _ in most_probable_country[country].most_common(5)])}")
    else:
        return most_probable_country

In [None]:
def average_tries(model_dict, X_test, y_test, print_output=False):
    """
    Takes the model and testing data.
    Returns average tries needed to return the right label,
    average tries per country, and number of tests per country.
    """
    probabilities = []
    for test in X_test:
            probabilities.append(bc_predict_proba(model_dict, test).values())     

    # Matches each country label to its probability in a dataframe
    df = pd.DataFrame(probabilities, columns=bc_predict_proba(model_dict, X_test[0]).keys())

    # Sorts each probability distribution for highest probability
    # and records the number of iterations needed to get to the right label
    country_try_count = {c:0 for c in country_set}
    country_test_count = Counter(y_test)
    for i in range(len(df)):
        for try_count, (country_code, _) in enumerate(sorted(df.iloc[i].to_dict().items(), key=lambda item: item[1], reverse=True)):
            if country_code == y_test.tolist()[i]:
                country_try_count[country_code] += try_count+1
                break
    
    # Averages the country try count by number of tests
    country_try_count = {country:country_try_count[country]/country_test_count[country] for country in country_try_count}
    # Computes overal average try count
    avg_tries = sum(country_try_count.values())/len(country_try_count)
    
    # Either returns or prints the results
    if print_output:
        for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
            print(country, f"{try_count:.1f} tries", f"({country_test_count[country]} tests)")
        print("Average number of tries:", avg_tries)
    else:
        return avg_tries, country_try_count, country_test_count

In [8]:
def test_binary_classifiers(model_dict, X_test, y_test):
    results = []
    for i, test in enumerate(X_test):
        probabilities = Counter({})
        for country in model_dict:
            probabilities[country] = model_dict[country].predict_proba(test)[0][1]
        results.append(int(probabilities.most_common(1)[0][0] == y_test.tolist()[i]))
    
    most_probable_country = probability_breakdown(model_dict, X_test, y_test)
    avg_tries, country_try_count, country_test_count = average_tries(model_dict, X_test, y_test)

    print(f"Accuracy: {sum(results)/len(results)}")
    print(f"Average tries: {avg_tries}")
    for country, try_count in sorted(country_try_count.items(), key=lambda item: item[1]):
        most_similar_five = " ".join([country for country, _ in most_probable_country[country].most_common(5)])
        print(f"{country}: {try_count:4.1f} tries", f"({country_test_count[country]:3d} tests) | {most_similar_five}")

In [None]:
def train_test_binary_classifiers(model_type, dataset = "full"):
    if dataset == "full":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)
    elif dataset == "25%":
        X_train, X_test, y_train, y_test = train_test_split(X_25, y, test_size=0.2, random_state=3, stratify=y)
    elif dataset == "50%":
        X_train, X_test, y_train, y_test = train_test_split(X_50, y, test_size=0.2, random_state=3, stratify=y)
    
    model_dict = train_binary_classifiers(model_type, X_train, y_train)
    test_binary_classifiers(model_dict, X_test, y_test)

In [12]:
train_test_binary_classifiers("lr")
train_test_binary_classifiers("lr", "25%")
train_test_binary_classifiers("lr", "50%")

. . . . . . . . . . . . . . . . . . . . 
Accuracy: 0.296969696969697
Average tries: 6.879611691011144
US:  2.2 tries ( 56 tests) | US GB NZ IN CA
GB:  3.2 tries ( 59 tests) | GB US CA IE AU
CA:  3.3 tries ( 29 tests) | US GB CA IE AU
NZ:  4.9 tries ( 15 tests) | GB US NZ LK BD
IE:  4.9 tries ( 16 tests) | IE US GB CA AU
HK:  5.4 tries (  7 tests) | GB AU MY US IE
AU:  5.5 tries ( 31 tests) | AU GB US IE CA
NG:  6.2 tries (  6 tests) | US PK GH GB NG
LK:  6.3 tries ( 11 tests) | GB LK AU US IN
IN:  6.5 tries ( 25 tests) | IN GB US CA PK
PK:  6.8 tries ( 12 tests) | GB CA PK US SG
KE:  6.9 tries (  8 tests) | GB CA KE PH US
TZ:  7.4 tries (  8 tests) | GB TZ US CA IN
SG:  7.8 tries (  5 tests) | US GB SG CA IN
MY:  8.4 tries (  8 tests) | GB US MY IE AU
GH:  8.5 tries (  8 tests) | GB GH NG KE PK
BD:  9.5 tries (  6 tests) | US GB JM AU IN
JM:  9.7 tries (  6 tests) | US AU IE GH SG
ZA:  9.7 tries (  9 tests) | US CA GB AU IN
PH: 14.6 tries (  5 tests) | GB US IN IE NZ
. . . . . . . . . 

In [11]:
train_test_binary_classifiers("rf")
train_test_binary_classifiers("rf", "25%")
train_test_binary_classifiers("rf", "50%")

. . . . . . . . . . . . . . . . . . . . 
Accuracy: 0.2787878787878788
Average tries: 7.184698756278768
US:  1.3 tries ( 56 tests) | US GB AU CA IE
GB:  1.3 tries ( 59 tests) | GB US AU CA IE
AU:  3.3 tries ( 31 tests) | GB US AU CA IE
CA:  4.4 tries ( 29 tests) | US GB CA AU IE
IE:  4.8 tries ( 16 tests) | GB US IE CA AU
JM:  5.5 tries (  6 tests) | GB US AU IE JM
NG:  6.0 tries (  6 tests) | GB US NG CA AU
NZ:  6.7 tries ( 15 tests) | GB US AU CA NZ
HK:  6.9 tries (  7 tests) | US GB AU CA IE
IN:  6.9 tries ( 25 tests) | GB US IN AU CA
TZ:  7.2 tries (  8 tests) | GB US CA AU IE
PK:  7.4 tries ( 12 tests) | US GB PK AU IN
BD:  7.7 tries (  6 tests) | GB US AU IE NZ
LK:  7.8 tries ( 11 tests) | GB US CA AU IE
GH:  8.5 tries (  8 tests) | GB US CA AU IE
SG:  9.6 tries (  5 tests) | GB US AU CA SG
PH: 10.0 tries (  5 tests) | GB US CA AU IE
KE: 11.6 tries (  8 tests) | GB US CA AU IE
ZA: 12.2 tries (  9 tests) | GB US IE AU CA
MY: 14.5 tries (  8 tests) | US GB CA AU IE
. . . . . . . . .

In [10]:
train_test_binary_classifiers("mlp")
train_test_binary_classifiers("mlp", "25%")
train_test_binary_classifiers("mlp", "50%")

. . . . . . . . . . . . . . . . . . . . 
Accuracy: 0.3
Average tries: 7.280144423993468
US:  1.7 tries ( 56 tests) | US GB CA AU IE
GB:  2.0 tries ( 59 tests) | GB US CA IE AU
AU:  2.5 tries ( 31 tests) | GB AU US IE TZ
IE:  2.6 tries ( 16 tests) | IE GB US CA AU
CA:  3.7 tries ( 29 tests) | US GB CA IE AU
IN:  5.9 tries ( 25 tests) | IN GB US CA IE
NG:  6.2 tries (  6 tests) | US IE NG GB LK
SG:  6.8 tries (  5 tests) | GB SG US IE AU
NZ:  7.1 tries ( 15 tests) | GB US IE AU CA
LK:  7.1 tries ( 11 tests) | US AU GB IE IN
TZ:  7.1 tries (  8 tests) | US GB TZ AU IE
PK:  7.2 tries ( 12 tests) | GB PK US CA IE
MY:  7.8 tries (  8 tests) | GB IE US AU MY
HK:  8.6 tries (  7 tests) | GB IE US AU CA
GH:  9.2 tries (  8 tests) | GB AU US CA GH
KE:  9.6 tries (  8 tests) | GB CA US AU KE
JM: 10.8 tries (  6 tests) | US GB GH AU SG
PH: 12.2 tries (  5 tests) | GB US IE AU IN
ZA: 13.2 tries (  9 tests) | US IE GB AU TZ
BD: 14.3 tries (  6 tests) | GB US IE ZA AU
. . . . . . . . . . . . . . . . 