In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# TODO: Refactor this into something cleaner (instead of ~1000 lines of near-identical cells)
# TODO: Increase N

In [0]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

In [0]:
# Load the corpus fetched by the wget cell above.
# NOTE(review): '/content/...' is an absolute path (looks like the Colab
# working directory — confirm); it will not resolve in other environments.
df = get_dataset('/content/original_texts.csv')
# Derive the two datasets used in the rest of the notebook (the names suggest
# a 90-text and a 400-text variant — confirm in modules.preprocessing).
df_90texts, df_400texts = balanced_datasets(df)
print(len(df_90texts), len(df_400texts))

In [0]:
# Number of rows kept per class ('FL' / 'HL') when balanced_df truncates a dataset.
N = 30

In [0]:
def clean_tqdm():
    """
    Pass every currently registered tqdm instance to ``tqdm._decr_instances``.

    A snapshot of ``tqdm._instances`` is taken before looping, presumably
    because ``_decr_instances`` removes entries from that registry. Both
    names are underscore-prefixed (private) tqdm attributes, so they may
    change between tqdm releases.
    """
    registered_bars = tuple(tqdm._instances)
    for bar in registered_bars:
        tqdm._decr_instances(bar)

def balanced_df(df, size):
    """
    Build a class-balanced subset of df, capped at `size` rows per class.

    The first rows of each 'language_background' class ('FL', then 'HL') are
    kept. Both classes contribute the same number of rows: the smaller of
    `size` and the row count of the rarer class.

    :param df: DataFrame with a 'language_background' column
    :param size: maximum number of rows to keep per class; None means no cap
    :return: new DataFrame with the 'FL' rows followed by the 'HL' rows
    """
    groups = [df.loc[df['language_background'] == label] for label in ('FL', 'HL')]
    # Cap at the rarer class so the result stays balanced even when one class
    # has fewer than `size` rows (plain slicing would silently keep fewer rows
    # for that class only).
    per_class = min(len(group) for group in groups)
    if size is not None:
        per_class = min(per_class, size)
    return pd.concat([group[:per_class] for group in groups])

In [0]:
# 80/20 train/test split of each N-per-class subset, with a fixed seed so the
# split is reproducible.
# NOTE(review): no `stratify=` is passed, so the class balance produced by
# balanced_df is not guaranteed to hold inside the train and test parts.
train_90texts, test_90texts = train_test_split(balanced_df(df_90texts, N), train_size=0.8,  random_state=42)
train_400texts, test_400texts = train_test_split(balanced_df(df_400texts,N), train_size=0.8, random_state=42)

In [151]:
# Sanity check of the split sizes, in order: train_90, train_400, test_90, test_400.
len(train_90texts), len(train_400texts), len(test_90texts), len(test_400texts)

(48, 48, 12, 12)

In [0]:
def run_model(df, df_name, hyperparam):
    """
    Run EnsambleClassifier on the given DataFrame with the given profiles
    and print the resulting accuracy.

    :param df: DataFrame with a 'language_background' column (used as the
        labels) and a column named by `hyperparam`
    :param df_name: name of the dataset, used only in the printed message
    :param hyperparam: name of the df column whose values are passed to the
        classifier as profiles (e.g. 'word_unigrams')
    :return: None; the accuracy is printed, not returned
    """
    classes = list(df['language_background'].unique())
    y_true = list(df['language_background'])
    # presumably the profile lengths the ensemble iterates over — confirm in
    # modules.ensamble_classifier
    p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]

    # Clear tqdm's registered bars before a new run starts.
    clean_tqdm()

    profs = list(df[hyperparam])
    # NOTE(review): the same labels are handed to the classifier here and used
    # for scoring below — confirm how EnsambleClassifier uses them.
    ec = EnsambleClassifier([profs], y_true, p_lengths, classes=classes)
    y_pred = ec.forward_ensamble(confidence=False)

    print()
    acc = accuracy_score(y_true, y_pred)
    print('Accuracy_score = {} for dataset {} and hyperparam {}'.format(acc, df_name, hyperparam))

# 90 texts

In [153]:
# word unigrams
run_model(train_90texts, 'train_90texts', 'word_unigrams')

100%|██████████| 7/7 [00:28<00:00,  4.03s/it]


Accuracy_score = 0.5416666666666666 for dataset train_90texts and hyperparam word_unigrams





In [154]:
# word bigrams
run_model(train_90texts, 'train_90texts', 'word_bigrams')

100%|██████████| 7/7 [00:33<00:00,  4.72s/it]


Accuracy_score = 0.375 for dataset train_90texts and hyperparam word_bigrams





In [155]:
# word trigrams
run_model(train_90texts, 'train_90texts', 'word_trigrams')

100%|██████████| 7/7 [00:33<00:00,  4.75s/it]


Accuracy_score = 0.4791666666666667 for dataset train_90texts and hyperparam word_trigrams





In [156]:
# character 3-grams
run_model(train_90texts, 'train_90texts', 'character 3-grams')

100%|██████████| 7/7 [03:05<00:00, 26.52s/it]


Accuracy_score = 0.4166666666666667 for dataset train_90texts and hyperparam character 3-grams





In [157]:
# Character 4-grams
run_model(train_90texts, 'train_90texts', 'character 4-grams')

100%|██████████| 7/7 [03:41<00:00, 31.66s/it]


Accuracy_score = 0.625 for dataset train_90texts and hyperparam character 4-grams





In [158]:
# Character 5-grams
run_model(train_90texts, 'train_90texts', 'character 5-grams')

100%|██████████| 7/7 [03:54<00:00, 33.43s/it]


Accuracy_score = 0.5 for dataset train_90texts and hyperparam character 5-grams





In [159]:
# Character 6-grams
run_model(train_90texts, 'train_90texts', 'character 6-grams')

100%|██████████| 7/7 [04:00<00:00, 34.36s/it]


Accuracy_score = 0.5 for dataset train_90texts and hyperparam character 6-grams





In [160]:
# Character 7-grams
run_model(train_90texts, 'train_90texts', 'character 7-grams')

100%|██████████| 7/7 [04:05<00:00, 35.13s/it]


Accuracy_score = 0.5416666666666666 for dataset train_90texts and hyperparam character 7-grams





In [161]:
# Character 8-grams
run_model(train_90texts, 'train_90texts', 'character 8-grams')

100%|██████████| 7/7 [04:12<00:00, 36.05s/it]


Accuracy_score = 0.5208333333333334 for dataset train_90texts and hyperparam character 8-grams





In [162]:
# Character 9-grams
run_model(train_90texts, 'train_90texts', 'character 9-grams')

100%|██████████| 7/7 [04:13<00:00, 36.22s/it]


Accuracy_score = 0.5833333333333334 for dataset train_90texts and hyperparam character 9-grams





In [163]:
# Character 10-grams
run_model(train_90texts, 'train_90texts', 'character 10-grams')

100%|██████████| 7/7 [04:16<00:00, 36.66s/it]


Accuracy_score = 0.6666666666666666 for dataset train_90texts and hyperparam character 10-grams





# 400 texts

In [126]:
# word_unigrams
run_model(train_400texts, 'train_400texts', 'word_unigrams')

100%|██████████| 7/7 [00:08<00:00,  1.19s/it]


Accuracy_score = 0.375 for dataset train_400texts and hyperparam word_unigrams





In [127]:
# word_bigrams
run_model(train_400texts, 'train_400texts', 'word_bigrams')

100%|██████████| 7/7 [00:09<00:00,  1.39s/it]


Accuracy_score = 0.1875 for dataset train_400texts and hyperparam word_bigrams





In [128]:
# word_trigrams
run_model(train_400texts, 'train_400texts', 'word_trigrams')

100%|██████████| 7/7 [00:09<00:00,  1.40s/it]


Accuracy_score = 0.4375 for dataset train_400texts and hyperparam word_trigrams





In [129]:
# character 3-grams
run_model(train_400texts, 'train_400texts', 'character 3-grams')

100%|██████████| 7/7 [00:57<00:00,  8.22s/it]


Accuracy_score = 0.53125 for dataset train_400texts and hyperparam character 3-grams





In [130]:
# character 4-grams
run_model(train_400texts, 'train_400texts', 'character 4-grams')

100%|██████████| 7/7 [01:08<00:00,  9.84s/it]


Accuracy_score = 0.53125 for dataset train_400texts and hyperparam character 4-grams





In [131]:
# character 5-grams
run_model(train_400texts, 'train_400texts', 'character 5-grams')

100%|██████████| 7/7 [01:11<00:00, 10.22s/it]


Accuracy_score = 0.5 for dataset train_400texts and hyperparam character 5-grams





In [132]:
# character 6-grams
run_model(train_400texts, 'train_400texts', 'character 6-grams')

100%|██████████| 7/7 [01:14<00:00, 10.68s/it]


Accuracy_score = 0.5625 for dataset train_400texts and hyperparam character 6-grams





In [133]:
# character 7-grams
run_model(train_400texts, 'train_400texts', 'character 7-grams')

100%|██████████| 7/7 [01:14<00:00, 10.66s/it]


Accuracy_score = 0.46875 for dataset train_400texts and hyperparam character 7-grams





In [134]:
# Character 8-grams
run_model(train_400texts, 'train_400texts', 'character 8-grams')

100%|██████████| 7/7 [01:15<00:00, 10.85s/it]


Accuracy_score = 0.40625 for dataset train_400texts and hyperparam character 8-grams





In [135]:
# character 9-grams
run_model(train_400texts, 'train_400texts', 'character 9-grams')

100%|██████████| 7/7 [01:15<00:00, 10.84s/it]


Accuracy_score = 0.4375 for dataset train_400texts and hyperparam character 9-grams





In [136]:
# character 10-grams
run_model(train_400texts, 'train_400texts', 'character 10-grams')

100%|██████████| 7/7 [01:15<00:00, 10.81s/it]


Accuracy_score = 0.4375 for dataset train_400texts and hyperparam character 10-grams





# Running best models on test data

In [165]:
# 90text: Character 4-grams
run_model(test_90texts, 'test_90texts', 'character 4-grams')

100%|██████████| 7/7 [00:03<00:00,  1.92it/s]


Accuracy_score = 0.5833333333333334 for dataset test_90texts and hyperparam character 4-grams





In [168]:
# 90text: Character 10-grams
run_model(test_90texts, 'test_90texts', 'character 10-grams')

100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


Accuracy_score = 0.5 for dataset test_90texts and hyperparam character 10-grams



