In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pickle
import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

--2020-03-26 14:19:37--  https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ovbystrova/Interference/master/data/original_texts.csv [following]
--2020-03-26 14:19:37--  https://raw.githubusercontent.com/ovbystrova/Interference/master/data/original_texts.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18891291 (18M) [text/plain]
Saving to: ‘original_texts.csv’


2020-03-26 14:19:38 (45.9 MB/s) - ‘original_texts.csv’ saved [18891291/18891291]



In [0]:
# Load the corpus from the file that the download cell saves into the current
# working directory, then derive the 90-text and 400-text datasets from it.
# The path is relative so the cell does not depend on the working directory
# being Colab's /content.
df = get_dataset('original_texts.csv')
df_90texts, df_400texts = balanced_datasets(df)
print(len(df_90texts), len(df_400texts))

100%|██████████| 8/8 [01:04<00:00,  8.09s/it]


810 1600


In [0]:
# Maximum number of rows kept per language-background class ('FL' / 'HL')
# when the datasets are truncated with balanced_df.
N = 50

In [0]:
def clean_tqdm():
    """
    Release every progress bar that tqdm still has registered.

    Each entry of ``tqdm._instances`` is handed to ``tqdm._decr_instances``.
    """
    # Work on a snapshot; presumably _decr_instances alters the registry
    # while the bars are being released.
    leftover_bars = tuple(tqdm._instances)
    for bar in leftover_bars:
        tqdm._decr_instances(bar)

def balanced_df(df, size):
    """
    Keep at most ``size`` rows per language background and stack them.

    The first ``size`` rows labelled 'FL' come first, followed by the first
    ``size`` rows labelled 'HL'. Rows with any other 'language_background'
    value are dropped. The input frame is not modified.
    """
    background = df['language_background']
    per_class = [df.loc[background == label][:size] for label in ('FL', 'HL')]
    return pd.concat(per_class)

In [0]:
# Cap both corpora at N rows per class (see balanced_df).
df_90, df_400 = (balanced_df(corpus, N) for corpus in (df_90texts, df_400texts))

In [0]:
# Stratified 80/20 train/test split of each truncated corpus, with a fixed seed.
train_90texts, test_90texts = train_test_split(
    df_90,
    train_size=0.8,
    random_state=42,
    stratify=df_90['language_background'],
)
train_400texts, test_400texts = train_test_split(
    df_400,
    train_size=0.8,
    random_state=42,
    stratify=df_400['language_background'],
)

In [0]:
# Sanity check: row counts of (train 90, train 400, test 90, test 400).
tuple(len(part) for part in (train_90texts, train_400texts, test_90texts, test_400texts))

(80, 80, 20, 20)

In [0]:
# Raw prediction results, keyed by '<split name>_<profile column>' (filled by run_model).
d = dict()

# run_model appends one entry to each of these lists per call, in call order.
labels_train, confidences_train = [], []
labels_test, confidences_test = [], []

# Accuracy per '<split name>_<profile column>' key (filled by run_model).
accs_train, accs_test = dict(), dict()


def run_ensamble(df, hyperparam, train=True, ensamble=None):
    """
    Run the ensemble on a training or a test sample.

    In training mode a new EnsambleClassifier is built from the profiles in
    column ``hyperparam`` and the labels in 'language_background', and its
    ``forward_ensamble`` result is returned. In test mode the supplied
    ``ensamble`` classifies the profiles through ``forward_multiple``.

    Parameters
    ----------
    df : frame with a 'language_background' column and a ``hyperparam`` column.
    hyperparam : name of the column that holds the profiles.
    train : True to build a new ensemble, False to reuse ``ensamble``.
    ensamble : previously built classifier; required when ``train`` is False.

    Returns
    -------
    (y_pred, ensamble) : the zipped prediction result and the classifier used.

    Raises
    ------
    ValueError
        If ``train`` is False and no ``ensamble`` is given.
    """
    if not train and ensamble is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError('run_ensamble(train=False) requires a fitted ensamble')

    clean_tqdm()
    profs = list(df[hyperparam])

    if train:
        labels = df['language_background']
        classes = list(labels.unique())
        y_true = list(labels)
        # Presumably the profile lengths the ensemble is built over; TODO confirm.
        p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]
        ensamble = EnsambleClassifier([profs], y_true, p_lengths, classes=classes)
        y_pred = ensamble.forward_ensamble(confidence=True)
    else:
        y_pred = ensamble.forward_multiple(profs, confidence=True)
    return y_pred, ensamble

def get_classes_confs(y_pred):
    """
    Unzip a zipped prediction result into its classes and its confidences.

    Returns the pair ``(SingleClassifier.only_classes(y_pred),
    SingleClassifier.only_distances(y_pred))``.
    """
    return (
        SingleClassifier.only_classes(y_pred),
        SingleClassifier.only_distances(y_pred),
    )

def run_model(df_train, df_test,  train_name, test_name, hyperparam, d=None):
    """
    Build the ensemble on ``df_train``, apply it to ``df_test`` and record the results.

    Parameters
    ----------
    df_train, df_test : frames with a 'language_background' column and a
        ``hyperparam`` column.
    train_name, test_name : prefixes used to build the result keys
        '<name>_<hyperparam>'.
    hyperparam : name of the profile column to classify on.
    d : dict that receives the raw predictions. When omitted, the module-level
        ``d`` is looked up at call time, so a later rebinding of that name
        (e.g. after restoring a backup) is honoured.

    Side effects
    ------------
    Stores raw train/test predictions in ``d``, appends the predicted classes
    and confidences to ``labels_train`` / ``confidences_train`` /
    ``labels_test`` / ``confidences_test``, stores the accuracies in
    ``accs_train`` / ``accs_test`` and prints both accuracies.
    """
    if d is None:
        d = globals()['d']

    clean_tqdm()

    train_key = '{}_{}'.format(train_name, hyperparam)
    test_key = '{}_{}'.format(test_name, hyperparam)

    y_train_pred, ensamble = run_ensamble(df_train, hyperparam, train=True, ensamble=None)
    d[train_key] = y_train_pred

    y_test_pred, ensamble = run_ensamble(df_test, hyperparam, train=False, ensamble=ensamble)
    # Store the TEST predictions under the test key (the train predictions
    # used to be written here by mistake).
    d[test_key] = y_test_pred

    class_train, conf_train = get_classes_confs(y_train_pred)
    class_test, conf_test = get_classes_confs(y_test_pred)

    labels_train.append(class_train)
    confidences_train.append(conf_train)
    labels_test.append(class_test)
    confidences_test.append(conf_test)

    acc_training = accuracy_score(df_train['language_background'], class_train)
    acc_testing = accuracy_score(df_test['language_background'], class_test)
    accs_train[train_key] = acc_training
    accs_test[test_key] = acc_testing
    print()
    print('Train accuracy: {}'.format(acc_training))
    print('Test accuracy: {}'.format(acc_testing))

In [0]:
def back_up(d, labels_train, confs_train, labels_test, confs_test, accs_train, accs_test):
    """
    Pickle each results container into its own file in the working directory.

    Files written, in this order: 'full_d', 'scores_train', 'scores_test',
    'labels_train', 'labels_test', 'confs_train', 'confs_test'.
    Existing files with these names are overwritten. Returns 'Success'.
    """
    filenames = ('full_d', 'scores_train', 'scores_test',
                 'labels_train', 'labels_test',
                 'confs_train', 'confs_test')
    payloads = (d, accs_train, accs_test,
                labels_train, labels_test,
                confs_train, confs_test)
    for filename, payload in zip(filenames, payloads):
        with open(filename, 'wb') as sink:
            pickle.dump(payload, sink)
    return 'Success'

def load_back_up(full_d, labels_train, confs_train, labels_test, confs_test, scores_train, scores_test):
    """
    Read the seven pickled results containers back from disk.

    Every argument is the path of one pickle file. The unpickled objects are
    returned as a tuple in the same order as the arguments:
    (d, labels_train, confidences_train, labels_test, confidences_test,
    accs_train, accs_test).
    """
    # NOTE: pickle.load can execute arbitrary code; only point this at files
    # written by a trusted back_up run.
    restored = []
    for path in (full_d, labels_train, confs_train, labels_test,
                 confs_test, scores_train, scores_test):
        with open(path, 'rb') as source:
            restored.append(pickle.load(source))
    return tuple(restored)

In [0]:
# Resume from a previous session when pickled results are available.
try:
    restored_d, labels_train, confidences_train, labels_test, confidences_test, accs_train, accs_test = load_back_up('full_d', 'labels_train', 'confs_train', 'labels_test', 'confs_test', 'scores_train', 'scores_test')
except Exception as e:
    # Best effort: if nothing can be restored (e.g. no backup files yet),
    # report the reason and carry on with the current in-memory state.
    print(e)
else:
    # Refill the existing dict instead of rebinding the name `d`, so that every
    # reference already held to it (such as a default argument captured when a
    # function was defined) sees the restored content.
    d.clear()
    d.update(restored_d)

# 90 texts

In [0]:
# Word-unigram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='word_unigrams',
)

100%|██████████| 7/7 [02:45<00:00, 23.60s/it]



Train accuracy: 0.4875
Test accuracy: 0.5


In [0]:
# Word-bigram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='word_bigrams',
)

100%|██████████| 7/7 [03:14<00:00, 27.72s/it]



Train accuracy: 0.675
Test accuracy: 0.45


In [0]:
# Word-trigram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='word_trigrams',
)

100%|██████████| 7/7 [03:16<00:00, 28.01s/it]



Train accuracy: 0.6
Test accuracy: 0.55


In [0]:
# Character 3-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 3-grams',
)

100%|██████████| 7/7 [17:59<00:00, 154.28s/it]



Train accuracy: 0.4875
Test accuracy: 0.5


In [0]:
# Character 4-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 4-grams',
)

100%|██████████| 7/7 [21:57<00:00, 188.17s/it]



Train accuracy: 0.5875
Test accuracy: 0.5


In [0]:
# Character 5-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 5-grams',
)

100%|██████████| 7/7 [23:02<00:00, 197.52s/it]



Train accuracy: 0.5625
Test accuracy: 0.5


In [0]:
# Character 6-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 6-grams',
)

100%|██████████| 7/7 [23:43<00:00, 203.32s/it]



Train accuracy: 0.5625
Test accuracy: 0.5


In [0]:
# Character 7-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 7-grams',
)

100%|██████████| 7/7 [24:35<00:00, 210.78s/it]



Train accuracy: 0.5875
Test accuracy: 0.5


In [0]:
# Character 8-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 8-grams',
)

100%|██████████| 7/7 [24:33<00:00, 210.48s/it]



Train accuracy: 0.65
Test accuracy: 0.55


In [0]:
# Character 9-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 9-grams',
)

100%|██████████| 7/7 [24:58<00:00, 214.12s/it]



Train accuracy: 0.65
Test accuracy: 0.5


In [0]:
# Character 10-gram profiles, 90-text dataset.
run_model(
    df_train=train_90texts,
    df_test=test_90texts,
    train_name='train_90texts',
    test_name='test_90texts',
    hyperparam='character 10-grams',
)

100%|██████████| 7/7 [24:59<00:00, 214.17s/it]



Train accuracy: 0.675
Test accuracy: 0.5


# 400 texts

In [0]:
# Word-unigram profiles, 400-text dataset.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='word_unigrams',
)

100%|██████████| 7/7 [02:38<00:00, 22.60s/it]



Train accuracy: 0.4875
Test accuracy: 0.5


In [0]:
# Word-bigram profiles, 400-text dataset.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='word_bigrams',
)

100%|██████████| 7/7 [03:06<00:00, 26.66s/it]



Train accuracy: 0.6625
Test accuracy: 0.5


In [0]:
# Word-trigram profiles, 400-text dataset.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='word_trigrams',
)

100%|██████████| 7/7 [03:07<00:00, 26.80s/it]



Train accuracy: 0.575
Test accuracy: 0.5


In [0]:
# Character 3-gram profiles, 400-text dataset.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 3-grams',
)

100%|██████████| 7/7 [17:45<00:00, 152.19s/it]



Train accuracy: 0.5
Test accuracy: 0.5


In [0]:
# Character 4-gram profiles, 400-text dataset.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 4-grams',
)

100%|██████████| 7/7 [21:46<00:00, 186.60s/it]



Train accuracy: 0.5
Test accuracy: 0.5


In [0]:
# Character 5-gram profiles, 400-text dataset.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 5-grams',
)

100%|██████████| 7/7 [23:07<00:00, 198.23s/it]



Train accuracy: 0.525
Test accuracy: 0.45


In [0]:
# Persist everything recorded so far, so the remaining long runs can resume from it.
back_up(
    d=d,
    labels_train=labels_train,
    confs_train=confidences_train,
    labels_test=labels_test,
    confs_test=confidences_test,
    accs_train=accs_train,
    accs_test=accs_test,
)

In [0]:
# Character 6-gram profiles, 400-text dataset; persist all results afterwards.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 6-grams',
)
back_up(
    d=d,
    labels_train=labels_train,
    confs_train=confidences_train,
    labels_test=labels_test,
    confs_test=confidences_test,
    accs_train=accs_train,
    accs_test=accs_test,
)

In [0]:
# Character 7-gram profiles, 400-text dataset; persist all results afterwards.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 7-grams',
)
back_up(
    d=d,
    labels_train=labels_train,
    confs_train=confidences_train,
    labels_test=labels_test,
    confs_test=confidences_test,
    accs_train=accs_train,
    accs_test=accs_test,
)

In [0]:
# Character 8-gram profiles, 400-text dataset; persist all results afterwards.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 8-grams',
)
back_up(
    d=d,
    labels_train=labels_train,
    confs_train=confidences_train,
    labels_test=labels_test,
    confs_test=confidences_test,
    accs_train=accs_train,
    accs_test=accs_test,
)

In [0]:
# Character 9-gram profiles, 400-text dataset; persist all results afterwards.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 9-grams',
)
back_up(
    d=d,
    labels_train=labels_train,
    confs_train=confidences_train,
    labels_test=labels_test,
    confs_test=confidences_test,
    accs_train=accs_train,
    accs_test=accs_test,
)

In [0]:
# Character 10-gram profiles, 400-text dataset; persist all results afterwards.
run_model(
    df_train=train_400texts,
    df_test=test_400texts,
    train_name='train_400texts',
    test_name='test_400texts',
    hyperparam='character 10-grams',
)
back_up(
    d=d,
    labels_train=labels_train,
    confs_train=confidences_train,
    labels_test=labels_test,
    confs_test=confidences_test,
    accs_train=accs_train,
    accs_test=accs_test,
)

In [104]:
# Combine the recorded per-profile predictions with EnsambleClassifier.majority_vote
# and score the combined labels against the 90-text split.
# NOTE(review): labels_train / labels_test gain one entry per run_model call,
# whichever dataset that call used, while the reference labels below come from
# the 90-text split only. Confirm the vote is meant to span every recorded run.
fake = EnsambleClassifier([], [], [], ['HL', 'FL'])  # not referenced again in this cell

major_train = EnsambleClassifier.majority_vote(labels_train)
y_train = train_90texts['language_background']

major_test = EnsambleClassifier.majority_vote(labels_test)
y_test = test_90texts['language_background']

print(
    'Overall Accuracy on train: {}.  Overall accuracy on test: {}.'.format(
        accuracy_score(y_train, major_train),
        accuracy_score(y_test, major_test),
    )
)

Overall Accuracy on train: 0.6125.  Overall accuracy on test: 0.5.


In [101]:
# Summarise every recorded accuracy in one table: one row per key of
# accs_train (all 'train' rows first) followed by one row per key of accs_test.
# Keys look like '<train|test>_<N>texts_<profile column>'.
results = pd.DataFrame(
    [
        {
            'Train/Test': split_name,
            # Second '_'-separated field of the key, e.g. '90texts'.
            'Text length': key.split('_')[1],
            # Last token of the key ('unigrams', '3-grams', '10-grams').
            # A fixed 8-character tail left a leading space in ' 3-grams'.
            'Profile length': key.replace('_', ' ').split()[-1],
            'Profile type': 'word' if 'word' in key else 'character',
            'Accuracy score': score,
        }
        for split_name, scores in (('train', accs_train), ('test', accs_test))
        for key, score in scores.items()
    ],
    columns=['Train/Test', 'Text length', 'Profile length', 'Profile type', 'Accuracy score'],
)

pd.set_option('display.max_rows', 100)
results

Unnamed: 0,Train/Test,Text length,Profile length,Profile type,Accuracy score
0,train,90texts,unigrams,word,0.4875
1,train,90texts,bigrams,word,0.675
2,train,90texts,trigrams,word,0.6
3,train,90texts,3-grams,character,0.4875
4,train,90texts,4-grams,character,0.5875
5,train,90texts,5-grams,character,0.5625
6,train,90texts,6-grams,character,0.5625
7,train,90texts,7-grams,character,0.5875
8,train,90texts,8-grams,character,0.65
9,train,90texts,9-grams,character,0.65
