In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

--2020-03-26 00:36:32--  https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ovbystrova/Interference/master/data/original_texts.csv [following]
--2020-03-26 00:36:32--  https://raw.githubusercontent.com/ovbystrova/Interference/master/data/original_texts.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18891291 (18M) [text/plain]
Saving to: ‘original_texts.csv’


2020-03-26 00:36:33 (266 MB/s) - ‘original_texts.csv’ saved [18891291/18891291]



In [3]:
df = get_dataset('/content/original_texts.csv')
df_90texts, df_400texts = balanced_datasets(df)
print(len(df_90texts), len(df_400texts))

100%|██████████| 8/8 [00:50<00:00,  6.35s/it]


810 1600


In [0]:
N = 50

In [0]:
def clean_tqdm():
    for instance in list(tqdm._instances): 
        tqdm._decr_instances(instance)

def balanced_df(df, size):
    """
    balances df and truncates it to size parameter
    """
    df_1 = df.loc[df['language_background'] == 'FL'][:size]
    df_2 = df.loc[df['language_background'] == 'HL'][:size]
    df = pd.concat([df_1, df_2])
    return df

In [0]:
df_90 = balanced_df(df_90texts, N)
df_400 = balanced_df(df_400texts, N)

In [0]:
train_90texts, test_90texts = train_test_split(df_90, train_size=0.8,  random_state=42, stratify=df_90['language_background'])
train_400texts, test_400texts = train_test_split(df_400, train_size=0.8, random_state=42, stratify=df_400['language_background'])

In [8]:
len(train_90texts), len(train_400texts), len(test_90texts), len(test_400texts)

(80, 80, 20, 20)

In [0]:
d = {}  # Тут будет словарь, куда запихиваем все после прогона ансамбля в виде датасет_параметр : y_pred
labels_train = []  # Список предсказанных лейблов на тренировочной выборке для будущего major_votinga
confidences_train = []  # Список уверенностей на тренировочной выборке для будущего confidence_summinga
labels_test = []  # То же что и labels_train только для теста
confidences_test = []  # То же что и confidences_train только для теста.

def run_ensamble(df, hyperparam, train=True, ensamble=None):  
    """
    Прогоняется ансамбль на тренировочной или тестовой выборке. Возвращает зипнутый результат и саму модель
    """  
    clean_tqdm()
    classes = list(df['language_background'].unique())
    p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]

    y_true = list(df['language_background'])
    profs = list(df[hyperparam])
    
    if train:
        ensamble = EnsambleClassifier([profs], y_true, p_lengths, classes=classes)
        y_pred = ensamble.forward_ensamble(confidence=True)
    else:
        y_pred = ensamble.forward_multiple(profs, confidence=True)
    return  y_pred, ensamble

def get_classes_confs(y_pred):
    """
    анзипает классы и уверенности из зипнутого y_pred
    """
    single = SingleClassifier([], [], 200, []) # пустышка
    classes = single.only_classes(y_pred)
    confs = single.only_distances(y_pred)
    return classes, confs

def run_model(df_train, df_test,  train_name, test_name, hyperparam, d=d):
    """
    runs EnsambleClassifier on  given DataFrame (df) with give profiles (hyperparam)
    """

    clean_tqdm()

    y_train_pred, ensamble = run_ensamble(df_train, hyperparam, train=True, ensamble=None)
    d['{}_{}'.format(train_name, hyperparam)] = y_train_pred # сохраняем в словарь

    y_test_pred, ensamble = run_ensamble(df_test, hyperparam, train=False, ensamble=ensamble)
    d['{}_{}'.format(test_name, hyperparam)] = y_train_pred

    class_train, conf_train = get_classes_confs(y_train_pred)
    class_test, conf_test = get_classes_confs(y_test_pred)

    labels_train.append(class_train)
    confidences_train.append(conf_train)
    labels_test.append(class_test)
    confidences_test.append(conf_test)

# 90 texts

## Тут пока прогон и проверка всего на юниграмах только

In [39]:
# word unigrams
run_model(train_90texts,  test_90texts, 'train_90texts', 'test_90texts', 'word_unigrams')

100%|██████████| 7/7 [02:05<00:00, 17.95s/it]


In [50]:
d

{'test_90texts_word_unigrams': [('HL',
   {'FL': 5.926773712689995, 'HL': 5.767128603785784}),
  ('HL', {'FL': 6.820530607314906, 'HL': 6.468043655921257}),
  ('HL', {'FL': 6.597968856341664, 'HL': 6.361798681115374}),
  ('HL', {'FL': 6.3969459978612395, 'HL': 6.233882349570583}),
  ('HL', {'FL': 6.315295392582355, 'HL': 6.106743489275828}),
  ('HL', {'FL': 6.50831519946556, 'HL': 6.29654215250115}),
  ('HL', {'FL': 6.266213183023599, 'HL': 5.990387634151937}),
  ('HL', {'FL': 6.386139438537597, 'HL': 6.222357362393656}),
  ('HL', {'FL': 6.416607334773855, 'HL': 6.218831033654001}),
  ('HL', {'FL': 6.1224658269857315, 'HL': 5.973351556668457}),
  ('HL', {'FL': 6.398424959478258, 'HL': 6.164924575541501}),
  ('HL', {'FL': 6.818136780559224, 'HL': 6.564636879785557}),
  ('HL', {'FL': 6.024941897262307, 'HL': 5.898668322822153}),
  ('HL', {'FL': 6.754987141207114, 'HL': 6.470542567139235}),
  ('HL', {'FL': 6.513700198731376, 'HL': 6.229646241893422}),
  ('HL', {'FL': 6.088021245165817, 'H

In [51]:
labels_train, labels_test

([['HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'FL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL']],
 [['HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL',
   'HL']])

In [52]:
confidences_train

[[{'FL': 5.926773712689995, 'HL': 5.767128603785784},
  {'FL': 6.820530607314906, 'HL': 6.468043655921257},
  {'FL': 6.597968856341664, 'HL': 6.361798681115374},
  {'FL': 6.3969459978612395, 'HL': 6.233882349570583},
  {'FL': 6.315295392582355, 'HL': 6.106743489275828},
  {'FL': 6.50831519946556, 'HL': 6.29654215250115},
  {'FL': 6.266213183023599, 'HL': 5.990387634151937},
  {'FL': 6.386139438537597, 'HL': 6.222357362393656},
  {'FL': 6.416607334773855, 'HL': 6.218831033654001},
  {'FL': 6.1224658269857315, 'HL': 5.973351556668457},
  {'FL': 6.398424959478258, 'HL': 6.164924575541501},
  {'FL': 6.818136780559224, 'HL': 6.564636879785557},
  {'FL': 6.024941897262307, 'HL': 5.898668322822153},
  {'FL': 6.754987141207114, 'HL': 6.470542567139235},
  {'FL': 6.513700198731376, 'HL': 6.229646241893422},
  {'FL': 6.088021245165817, 'HL': 5.914952006095997},
  {'FL': 6.381090813381801, 'HL': 6.1647509199423975},
  {'FL': 6.02969941328782, 'HL': 5.803224150851024},
  {'FL': 5.886035464996674, 

In [53]:
confidences_test

[[{'FL': 5.994155309697809, 'HL': 5.868225405229339},
  {'FL': 6.335101644113575, 'HL': 6.114077757087771},
  {'FL': 6.485715996954536, 'HL': 6.264400503322316},
  {'FL': 6.840083947856609, 'HL': 6.5490688840489995},
  {'FL': 6.944663809459077, 'HL': 6.699628818394952},
  {'FL': 6.240276800323108, 'HL': 6.019257550591597},
  {'FL': 6.543641939237448, 'HL': 6.348716265849932},
  {'FL': 6.586080716286247, 'HL': 6.267096195836764},
  {'FL': 6.239637749427013, 'HL': 6.054944780398644},
  {'FL': 7.100530276969808, 'HL': 6.767704558858178},
  {'FL': 7.0191041016377635, 'HL': 6.735470433068385},
  {'FL': 6.227198318262248, 'HL': 6.015039326766237},
  {'FL': 6.232662831017666, 'HL': 6.064780260238128},
  {'FL': 6.088566894065094, 'HL': 5.899106319209469},
  {'FL': 6.283727014623443, 'HL': 6.03157484064443},
  {'FL': 6.486412748134617, 'HL': 6.189720805135444},
  {'FL': 6.69455896529586, 'HL': 6.450182056963046},
  {'FL': 5.715014976390236, 'HL': 5.505381697942375},
  {'FL': 6.559142333521546, 

In [0]:
fake = EnsambleClassifier([], [], [], [])
y_train = train_90texts['language_background']
y_test = test_90texts['language_background']
major_train = fake.majority_vote(labels_train)
major_test = fake.majority_vote(labels_test)

In [54]:
accuracy_score(y_train, major_train)

0.4875

In [55]:
fake.confidence_summing(confidences_train)

[{'FL': 5.926773712689995, 'HL': 5.767128603785784},
 {'FL': 6.820530607314906, 'HL': 6.468043655921257},
 {'FL': 6.597968856341664, 'HL': 6.361798681115374},
 {'FL': 6.3969459978612395, 'HL': 6.233882349570583},
 {'FL': 6.315295392582355, 'HL': 6.106743489275828},
 {'FL': 6.50831519946556, 'HL': 6.29654215250115},
 {'FL': 6.266213183023599, 'HL': 5.990387634151937},
 {'FL': 6.386139438537597, 'HL': 6.222357362393656},
 {'FL': 6.416607334773855, 'HL': 6.218831033654001},
 {'FL': 6.1224658269857315, 'HL': 5.973351556668457},
 {'FL': 6.398424959478258, 'HL': 6.164924575541501},
 {'FL': 6.818136780559224, 'HL': 6.564636879785557},
 {'FL': 6.024941897262307, 'HL': 5.898668322822153},
 {'FL': 6.754987141207114, 'HL': 6.470542567139235},
 {'FL': 6.513700198731376, 'HL': 6.229646241893422},
 {'FL': 6.088021245165817, 'HL': 5.914952006095997},
 {'FL': 6.381090813381801, 'HL': 6.1647509199423975},
 {'FL': 6.02969941328782, 'HL': 5.803224150851024},
 {'FL': 5.886035464996674, 'HL': 5.71118707586

## Конец проверки

In [0]:
# word bigrams
run_model(train_90texts, 'train_90texts', 'word_bigrams')

100%|██████████| 7/7 [03:12<00:00, 27.50s/it]


Accuracy_score = 0.675 for dataset train_90texts and hyperparam word_bigrams





In [0]:
# word trigrams
run_model(train_90texts, 'train_90texts', 'word_trigrams')

100%|██████████| 7/7 [03:09<00:00, 27.11s/it]


Accuracy_score = 0.6 for dataset train_90texts and hyperparam word_trigrams





In [0]:
# Character 4-grams
run_model(train_90texts, 'train_90texts', 'character 4-grams')

100%|██████████| 7/7 [21:45<00:00, 186.57s/it]


Accuracy_score = 0.5875 for dataset train_90texts and hyperparam character 4-grams





In [0]:
# Character 5-grams
run_model(train_90texts, 'train_90texts', 'character 5-grams')

100%|██████████| 7/7 [22:59<00:00, 197.02s/it]


Accuracy_score = 0.5625 for dataset train_90texts and hyperparam character 5-grams





In [0]:
# Character 6-grams
run_model(train_90texts, 'train_90texts', 'character 6-grams')

100%|██████████| 7/7 [23:52<00:00, 204.67s/it]


Accuracy_score = 0.5625 for dataset train_90texts and hyperparam character 6-grams





In [0]:
# Character 7-grams
run_model(train_90texts, 'train_90texts', 'character 7-grams')

100%|██████████| 7/7 [24:20<00:00, 208.70s/it]


Accuracy_score = 0.5875 for dataset train_90texts and hyperparam character 7-grams





In [0]:
# Character 8-grams
run_model(train_90texts, 'train_90texts', 'character 8-grams')

100%|██████████| 7/7 [24:56<00:00, 213.77s/it]


Accuracy_score = 0.65 for dataset train_90texts and hyperparam character 8-grams





In [0]:
# Character 9-grams
run_model(train_90texts, 'train_90texts', 'character 9-grams')

100%|██████████| 7/7 [25:06<00:00, 215.27s/it]


Accuracy_score = 0.65 for dataset train_90texts and hyperparam character 9-grams





In [0]:
# Character 10-grams
run_model(train_90texts, 'train_90texts', 'character 10-grams')

 86%|████████▌ | 6/7 [2:35:28<25:54, 1554.78s/it]
  0%|          | 0/7 [2:30:21<?, ?it/s]
100%|██████████| 7/7 [25:08<00:00, 215.45s/it]


Accuracy_score = 0.675 for dataset train_90texts and hyperparam character 10-grams





# 400 texts

In [0]:
# word_unigrams
run_model(train_400texts, 'train_400texts', 'word_unigrams')

100%|██████████| 7/7 [02:41<00:00, 23.13s/it]


Accuracy_score = 0.4875 for dataset train_400texts and hyperparam word_unigrams





In [0]:
# word_bigrams
run_model(train_400texts, 'train_400texts', 'word_bigrams')

100%|██████████| 7/7 [03:11<00:00, 27.30s/it]


Accuracy_score = 0.6625 for dataset train_400texts and hyperparam word_bigrams





In [0]:
# word_trigrams
run_model(train_400texts, 'train_400texts', 'word_trigrams')

100%|██████████| 7/7 [03:12<00:00, 27.45s/it]


Accuracy_score = 0.575 for dataset train_400texts and hyperparam word_trigrams





In [0]:
# character 3-grams
run_model(train_400texts, 'train_400texts', 'character 3-grams')

100%|██████████| 7/7 [18:00<00:00, 154.30s/it]


Accuracy_score = 0.5 for dataset train_400texts and hyperparam character 3-grams





In [0]:
# character 4-grams
run_model(train_400texts, 'train_400texts', 'character 4-grams')

100%|██████████| 7/7 [21:41<00:00, 185.89s/it]


Accuracy_score = 0.5 for dataset train_400texts and hyperparam character 4-grams





In [0]:
# character 5-grams
run_model(train_400texts, 'train_400texts', 'character 5-grams')

100%|██████████| 7/7 [22:57<00:00, 196.75s/it]


Accuracy_score = 0.525 for dataset train_400texts and hyperparam character 5-grams





In [0]:
# character 6-grams
run_model(train_400texts, 'train_400texts', 'character 6-grams')

100%|██████████| 7/7 [18:10<00:00, 155.79s/it]


Accuracy_score = 0.525 for dataset train_400texts and hyperparam character 6-grams





In [0]:
# character 7-grams
run_model(train_400texts, 'train_400texts', 'character 7-grams')

100%|██████████| 7/7 [18:32<00:00, 158.94s/it]


Accuracy_score = 0.625 for dataset train_400texts and hyperparam character 7-grams





In [0]:
# Character 8-grams
run_model(train_400texts, 'train_400texts', 'character 8-grams')

100%|██████████| 7/7 [18:44<00:00, 160.62s/it]


Accuracy_score = 0.65 for dataset train_400texts and hyperparam character 8-grams





In [0]:
# character 9-grams
run_model(train_400texts, 'train_400texts', 'character 9-grams')

100%|██████████| 7/7 [18:43<00:00, 160.55s/it]


Accuracy_score = 0.675 for dataset train_400texts and hyperparam character 9-grams





In [0]:
# character 10-grams
run_model(train_400texts, 'train_400texts', 'character 10-grams')

100%|██████████| 7/7 [18:53<00:00, 161.90s/it]


Accuracy_score = 0.6375 for dataset train_400texts and hyperparam character 10-grams





# Running best models on test data

## N = 50, 0.7 on test_90text with word_bigrams

In [0]:
# word bigrams
run_model(train_90texts, 'train_90texts', 'word_bigrams')
run_model(test_90texts, 'test_90texts', 'word_bigrams')

100%|██████████| 7/7 [02:21<00:00, 20.26s/it]
  0%|          | 0/7 [00:00<?, ?it/s]


Accuracy_score = 0.675 for dataset train_90texts and hyperparam word_bigrams


100%|██████████| 7/7 [00:02<00:00,  3.11it/s]


Accuracy_score = 0.7 for dataset test_90texts and hyperparam word_bigrams





In [0]:
# word_bigrams
# run_model(train_400texts, 'train_400texts', 'word_bigrams')
run_model(test_400texts, 'test_400texts', 'word_bigrams')

100%|██████████| 7/7 [00:02<00:00,  3.07it/s]


Accuracy_score = 0.6 for dataset test_400texts and hyperparam word_bigrams



