<a href="https://colab.research.google.com/github/ovbystrova/Interference/blob/master/Language_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pandas as pd
from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

In [2]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

--2020-03-28 11:13:26--  https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ovbystrova/Interference/master/data/original_texts.csv [following]
--2020-03-28 11:13:26--  https://raw.githubusercontent.com/ovbystrova/Interference/master/data/original_texts.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18891291 (18M) [text/plain]
Saving to: ‘original_texts.csv’


2020-03-28 11:13:27 (61.0 MB/s) - ‘original_texts.csv’ saved [18891291/18891291]



In [3]:
# The CSV is downloaded by the `!wget` cell into the current working directory,
# so it is read with a relative path instead of the Colab-only '/content/...' path.
df = get_dataset('original_texts.csv')
# Derive the two balanced corpora used in the experiments below.
df_90texts, df_400texts = balanced_datasets(df)
print(len(df_90texts), len(df_400texts))

100%|██████████| 8/8 [00:54<00:00,  6.83s/it]


810 1600


In [0]:
# Maximum number of texts kept per native-language class; passed as `size`
# to balanced_df90 / balanced_df400.
N = 50

In [0]:
def clean_tqdm():
    """Deregister every tqdm progress bar that is still tracked.

    Takes a snapshot of ``tqdm._instances`` and hands each bar to
    ``tqdm._decr_instances``; presumably this keeps stale bars from earlier
    cells from interfering with new ones.
    """
    tracked_bars = list(tqdm._instances)  # snapshot: the set changes while we iterate
    for bar in tracked_bars:
        tqdm._decr_instances(bar)

def balanced_df90(df, size):
    """
    Keep at most `size` rows for each of the nine native languages.

    Rows are selected through the 'native' column, truncated to the first
    `size` rows per language and concatenated in the fixed order
    swe, fr, ita, eng, ger, jap, kor, kaz, fin. Rows of any other language
    are dropped.
    """
    languages = ('swe', 'fr', 'ita', 'eng', 'ger', 'jap', 'kor', 'kaz', 'fin')
    per_language = [df.loc[df['native'] == lang][:size] for lang in languages]
    return pd.concat(per_language)

def balanced_df400(df, size):
    """
    Keep at most `size` rows for each of the four languages eng, jap, kaz, fin.

    The per-language slices (first `size` rows whose 'native' value matches)
    are concatenated in exactly that order; rows of other languages are dropped.
    """
    chunks = []
    for lang in ('eng', 'jap', 'kaz', 'fin'):
        matching = df.loc[df['native'] == lang]
        chunks.append(matching[:size])
    return pd.concat(chunks)

# def balanced_df(df, size):
#     """
#     balances df and truncates it to size parameter
#     """
#     df_1 = df.loc[df['language_background'] == 'FL'][:size]
#     df_2 = df.loc[df['language_background'] == 'HL'][:size]
#     df = pd.concat([df_1, df_2])
#     return df

In [0]:
# Truncate both corpora to at most N texts per native-language class.
df_90_bal = balanced_df90(df_90texts, N)
df_400_bal = balanced_df400(df_400texts, N)

In [0]:
# 80/20 train/test split, stratified by native language; the fixed random_state
# keeps the split identical between runs.
train_90texts, test_90texts = train_test_split(df_90_bal, train_size=0.8,  random_state=42, stratify=df_90_bal['native'])
train_400texts, test_400texts = train_test_split(df_400_bal, train_size=0.8, random_state=42, stratify=df_400_bal['native'])

In [8]:
len(train_90texts), len(train_400texts), len(test_90texts), len(test_400texts)

(360, 160, 90, 40)

In [0]:
d = {}  # Collects every ensemble run, keyed as '<dataset>_<hyperparam>' -> y_pred
labels_train = []  # Predicted labels on the training set, one list per run, for later majority voting
confidences_train = []  # Confidences on the training set, one list per run, for later confidence summing
labels_test = []  # Same as labels_train, but for the test set
confidences_test = []  # Same as confidences_train, but for the test set

# Accuracy per run, keyed as '<dataset>_<hyperparam>'; filled in by run_model.
accs_train = {}
accs_test = {}


def run_ensamble(df, hyperparam, train=True, ensamble=None):
    """
    Run the ensemble on a training or a test DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to classify. Must contain the `hyperparam` column; the 'native'
        column (class labels) is only read when `train` is True.
    hyperparam : str
        Name of the column of `df` that holds the profiles.
    train : bool
        If True, a new EnsambleClassifier is built from `df` and its
        `forward_ensamble` output is returned. If False, the passed
        `ensamble` predicts via `forward_multiple`.
    ensamble : EnsambleClassifier or None
        Previously built ensemble; required when `train` is False and
        ignored (replaced) when `train` is True.

    Returns
    -------
    tuple
        ``(y_pred, ensamble)``: the prediction result (requested with
        ``confidence=True``) and the ensemble that produced it.

    Raises
    ------
    ValueError
        If `train` is False and no `ensamble` is given.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if not train and ensamble is None:
        raise ValueError('ensamble must be provided when train=False')

    clean_tqdm()
    profs = list(df[hyperparam])

    if train:
        # Labels and the class inventory are only needed to build the ensemble.
        y_true = list(df['native'])
        classes = list(df['native'].unique())
        p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]
        # NOTE(review): the constructor receives [profs] while forward_multiple
        # receives profs; confirm this asymmetry against EnsambleClassifier's API.
        ensamble = EnsambleClassifier([profs], y_true, p_lengths, classes=classes)
        y_pred = ensamble.forward_ensamble(confidence=True)
    else:
        y_pred = ensamble.forward_multiple(profs, confidence=True)
    return y_pred, ensamble

def get_classes_confs(y_pred):
    """
    Split a zipped `y_pred` into its classes and its confidences.

    Returns a ``(classes, confs)`` tuple obtained from
    ``SingleClassifier.only_classes`` and ``SingleClassifier.only_distances``.
    """
    # Placeholder classifier: created with empty data only to reach the two helper methods.
    helper = SingleClassifier([], [], 200, [])
    predicted_classes = helper.only_classes(y_pred)
    predicted_confs = helper.only_distances(y_pred)
    return predicted_classes, predicted_confs

def run_model(df_train, df_test,  train_name, test_name, hyperparam, d=d):
    """
    Fit the ensemble on `df_train`, predict on `df_test` and record the results.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train and test data; both need a 'native' column and the
        `hyperparam` column.
    train_name, test_name : str
        Dataset names used to build the result keys
        '<train_name>_<hyperparam>' and '<test_name>_<hyperparam>'.
    hyperparam : str
        Name of the profile column to classify on.
    d : dict
        Receives the raw predictions under the two keys above. The default is
        bound at definition time to the module-level dict `d`.

    Side effects
    ------------
    Appends the predicted classes / confidences to the module-level lists
    `labels_train`, `confidences_train`, `labels_test`, `confidences_test`,
    stores the accuracies in `accs_train` / `accs_test`, and prints both
    accuracies. Returns None.
    """
    clean_tqdm()

    train_key = '{}_{}'.format(train_name, hyperparam)
    test_key = '{}_{}'.format(test_name, hyperparam)

    y_train_pred, ensamble = run_ensamble(df_train, hyperparam, train=True, ensamble=None)
    d[train_key] = y_train_pred

    y_test_pred, ensamble = run_ensamble(df_test, hyperparam, train=False, ensamble=ensamble)
    # Store the TEST predictions under the test key (the train predictions
    # used to be written here by mistake).
    d[test_key] = y_test_pred

    class_train, conf_train = get_classes_confs(y_train_pred)
    class_test, conf_test = get_classes_confs(y_test_pred)

    # Keep per-run outputs for the later majority vote / confidence summing.
    labels_train.append(class_train)
    confidences_train.append(conf_train)
    labels_test.append(class_test)
    confidences_test.append(conf_test)

    acc_training = accuracy_score(df_train['native'], class_train)
    acc_testing = accuracy_score(df_test['native'], class_test)
    accs_train[train_key] = acc_training
    accs_test[test_key] = acc_testing
    print()
    print('Train accuracy: {}'.format(acc_training))
    print('Test accuracy: {}'.format(acc_testing))

# 90 texts

In [10]:
# word unigrams
run_model(train_90texts,  test_90texts, 'train_90texts', 'test_90texts', 'word_unigrams')

100%|██████████| 7/7 [40:28<00:00, 346.88s/it]



Train accuracy: 0.3972222222222222
Test accuracy: 0.4222222222222222


In [11]:
# word bigrams
run_model(train_90texts, test_90texts, 'train_90texts', 'test_90texts', 'word_bigrams')

100%|██████████| 7/7 [49:10<00:00, 421.43s/it]



Train accuracy: 0.5583333333333333
Test accuracy: 0.5333333333333333


In [12]:
# word trigrams
run_model(train_90texts, test_90texts, 'train_90texts', 'test_90texts', 'word_trigrams')

100%|██████████| 7/7 [49:39<00:00, 425.59s/it]



Train accuracy: 0.5166666666666667
Test accuracy: 0.4888888888888889


In [13]:
 len(d)

6

In [14]:
len(labels_train), len(labels_test)

(3, 3)

In [15]:
len(confidences_train), len(confidences_test)

(3, 3)

In [0]:
# Placeholder ensemble built from empty arguments; it is only used to call
# the aggregation methods (majority_vote / confidence_summing).
fake = EnsambleClassifier([], [], [], [])
y_train = train_90texts['native']
y_test = test_90texts['native']
# Aggregate the per-run label lists that run_model appended for the 90-texts runs.
major_train = fake.majority_vote(labels_train)
major_test = fake.majority_vote(labels_test)

In [17]:
accuracy_score(y_train, major_train)

0.5277777777777778

In [18]:
accuracy_score(y_test, major_test)

0.5222222222222223

In [19]:
fake.confidence_summing(confidences_train)

[{'eng': 6.348400061390572,
  'fin': 6.634806087838431,
  'fr': 6.616783215010187,
  'ger': 6.711049188910767,
  'ita': 6.3795338139115865,
  'jap': 6.866436201649011,
  'kaz': 6.807556369582763,
  'kor': 6.517311100077013,
  'swe': 6.645755905926498},
 {'eng': 6.351745724264427,
  'fin': 6.567672029056997,
  'fr': 6.510425228120116,
  'ger': 6.852719666708317,
  'ita': 6.323781335472843,
  'jap': 6.49324344083893,
  'kaz': 6.623759159278957,
  'kor': 6.378937805069019,
  'swe': 6.500402356150367},
 {'eng': 6.518861125255686,
  'fin': 6.874361892217413,
  'fr': 6.733122855594688,
  'ger': 6.897791824117846,
  'ita': 6.464253664728409,
  'jap': 7.027386064188357,
  'kaz': 6.972595683876364,
  'kor': 6.721501936484113,
  'swe': 6.756676510602018},
 {'eng': 6.4151865919505315,
  'fin': 6.69311589678269,
  'fr': 6.719845191799019,
  'ger': 6.865954349612183,
  'ita': 6.378419840166102,
  'jap': 6.4462431934042455,
  'kaz': 6.939336384150219,
  'kor': 6.61075689146492,
  'swe': 6.7139148952

## Character n-gram training takes too long to run, so the cells below are disabled

In [0]:
# Character 4-grams
# run_model(train_90texts, test_90texts, 'train_90texts', 'test_90texts', 'character 4-grams')

In [0]:
# Character 5-grams
# run_model(train_90texts, 'train_90texts', 'character 5-grams')

In [0]:
# Character 6-grams
# run_model(train_90texts, 'train_90texts', 'character 6-grams')

In [0]:
# Character 7-grams
# run_model(train_90texts, 'train_90texts', 'character 7-grams')

In [0]:
# Character 8-grams
# run_model(train_90texts, 'train_90texts', 'character 8-grams')

In [0]:
# Character 9-grams
# run_model(train_90texts, 'train_90texts', 'character 9-grams')

In [0]:
# Character 10-grams
# run_model(train_90texts, 'train_90texts', 'character 10-grams')

# df_400texts: 4 most frequent classes, each of length 50

In [0]:
labels_train = []  # Predicted labels on the training set, one list per run, for later majority voting
confidences_train = []  # Confidences on the training set, one list per run, for later confidence summing
labels_test = []  # Same as labels_train, but for the test set
confidences_test = []  # Same as confidences_train, but for the test set

In [29]:
# word_unigrams
run_model(train_400texts, test_400texts, 'train_400texts', 'test_400texts', 'word_unigrams')

100%|██████████| 7/7 [07:52<00:00, 67.55s/it]



Train accuracy: 0.6125
Test accuracy: 0.7


In [30]:
# word_bigrams
run_model(train_400texts, test_400texts, 'train_400texts', 'test_400texts', 'word_bigrams')

100%|██████████| 7/7 [09:41<00:00, 83.06s/it]



Train accuracy: 0.69375
Test accuracy: 0.75


In [31]:
# word_trigrams
run_model(train_400texts, test_400texts, 'train_400texts', 'test_400texts', 'word_trigrams')

100%|██████████| 7/7 [09:47<00:00, 83.90s/it]



Train accuracy: 0.60625
Test accuracy: 0.7


In [0]:
# Placeholder ensemble built from empty arguments; it is only used to call
# the aggregation methods (majority_vote / confidence_summing).
fake = EnsambleClassifier([], [], [], [])
y_train = train_400texts['native']
y_test = test_400texts['native']
# Aggregate the per-run label lists that run_model appended for the 400-texts runs.
major_train = fake.majority_vote(labels_train)
major_test = fake.majority_vote(labels_test)

In [33]:
accuracy_score(y_train, major_train)

0.68125

In [34]:
accuracy_score(y_test, major_test)

0.775

In [0]:
fake.confidence_summing(confidences_train)

In [0]:
accs_wrd = accs_train, accs_test

In [37]:
accs_wrd

({'train_400texts_word_bigrams': 0.69375,
  'train_400texts_word_trigrams': 0.60625,
  'train_400texts_word_unigrams': 0.6125,
  'train_90texts_word_bigrams': 0.5583333333333333,
  'train_90texts_word_trigrams': 0.5166666666666667,
  'train_90texts_word_unigrams': 0.3972222222222222},
 {'test_400texts_word_bigrams': 0.75,
  'test_400texts_word_trigrams': 0.7,
  'test_400texts_word_unigrams': 0.7,
  'test_90texts_word_bigrams': 0.5333333333333333,
  'test_90texts_word_trigrams': 0.4888888888888889,
  'test_90texts_word_unigrams': 0.4222222222222222})

## Character n-gram training is skipped here as well (too slow)

In [0]:
# character 3-grams
# run_model(train_400texts, 'train_400texts', 'character 3-grams')

In [0]:
# character 4-grams
# run_model(train_400texts, 'train_400texts', 'character 4-grams')

In [0]:
# character 5-grams
# run_model(train_400texts, 'train_400texts', 'character 5-grams')

In [0]:
# character 6-grams
# run_model(train_400texts, 'train_400texts', 'character 6-grams')

In [0]:
# character 7-grams
# run_model(train_400texts, 'train_400texts', 'character 7-grams')

In [0]:
# Character 8-grams
# run_model(train_400texts, 'train_400texts', 'character 8-grams')

In [0]:
# character 9-grams
# run_model(train_400texts, 'train_400texts', 'character 9-grams')

In [0]:
# character 10-grams
# run_model(train_400texts, 'train_400texts', 'character 10-grams')

## df_400texts: 4 most frequent classes, each of length 100

In [0]:
# N * 2 = 100 texts per class at most (N is 50), followed by a stratified
# 80/20 split with a fixed random_state.
df_400_bal_100t = balanced_df400(df_400texts, N * 2)
train_4cl_100t, test_4cl_100t = train_test_split(df_400_bal_100t, train_size=0.8, random_state=42, stratify=df_400_bal_100t['native'])

In [57]:
# word unigrams
run_model(train_4cl_100t, test_4cl_100t, 'train_100t', 'test_100t', 'word_unigrams')

100%|██████████| 7/7 [1:04:40<00:00, 554.31s/it]



Train accuracy: 0.646875
Test accuracy: 0.7


In [70]:
# word bigrams
run_model(train_4cl_100t, test_4cl_100t, 'train_100t', 'test_100t', 'word_bigrams')

100%|██████████| 7/7 [1:17:52<00:00, 667.53s/it]



Train accuracy: 0.6375
Test accuracy: 0.6375


## df_400texts: 4 most frequent classes, each of length 75

In [0]:
# int(N * 1.5) = 75 texts per class at most (N is 50), followed by a stratified
# 80/20 split with a fixed random_state.
df_400_bal_75t = balanced_df400(df_400texts, int(N * 1.5))
train_4cl_75t, test_4cl_75t = train_test_split(df_400_bal_75t, train_size=0.8, random_state=42, stratify=df_400_bal_75t['native'])

In [63]:
# word unigrams
run_model(train_4cl_75t, test_4cl_75t, 'train_75t', 'test_75t', 'word_unigrams')

100%|██████████| 7/7 [28:18<00:00, 242.66s/it]



Train accuracy: 0.5541666666666667
Test accuracy: 0.55


In [78]:
# word bigrams
run_model(train_4cl_75t, test_4cl_75t, 'train_75t', 'test_75t', 'word_bigrams')

100%|██████████| 7/7 [32:31<00:00, 278.82s/it]



Train accuracy: 0.6541666666666667
Test accuracy: 0.6166666666666667


## Accuracy table

In [0]:
# Flatten the (train accuracies, test accuracies) pair into column lists that
# can be turned into a DataFrame. Keys look like '<dataset>_word_<ngram size>'.
accs_dict = defaultdict(list)
for key_set in accs_wrd:
    for key, accuracy in key_set.items():
        mode = 'train' if 'train' in key else 'test'
        accs_dict['train/test mode'].append(mode)

        # The first number in the key identifies the experiment: 90 and 400 are
        # dataset names (both run with 50 texts per class); any other number
        # is the texts-per-class count of a 4-class run.
        cl_len = int(re.search(r'([0-9]+)', key).group())
        if cl_len == 90:
            cl_num = 9
            cl_len = 50
        elif cl_len == 400:
            cl_num = 4
            cl_len = 50
        else:
            cl_num = 4
        accs_dict['class number'].append(cl_num)
        accs_dict['class length'].append(cl_len)
        accs_dict['ngram_type'].append('word')
        # Raw string: '\w' in a normal string literal is an invalid escape sequence.
        accs_dict['ngram_size'].append(re.search(r'(?<=word_)\w+', key).group())
        accs_dict['accuracy_score'].append(accuracy)

In [0]:
accs_df = pd.DataFrame(accs_dict, columns=list(accs_dict.keys()))

In [90]:
accs_df

Unnamed: 0,train/test mode,class number,class length,ngram_type,ngram_size,accuracy_score
0,train,9,50,word,unigrams,0.397222
1,train,9,50,word,bigrams,0.558333
2,train,9,50,word,trigrams,0.516667
3,train,4,50,word,unigrams,0.6125
4,train,4,50,word,bigrams,0.69375
5,train,4,50,word,trigrams,0.60625
6,train,4,100,word,unigrams,0.646875
7,train,4,75,word,unigrams,0.554167
8,train,4,100,word,bigrams,0.6375
9,train,4,75,word,bigrams,0.654167
