In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# TODO: Refactor this into something tidy (instead of 1000 near-identical lines)
# TODO: Increase N

In [0]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

In [0]:
# Read the CSV that the wget cell saved into the working directory.
# A relative path is used so the notebook also runs outside Colab
# (the previous '/content/...' path only exists there).
df = get_dataset('original_texts.csv')
# balanced_datasets hands back two frames; their sizes are printed as a quick check.
df_90texts, df_400texts = balanced_datasets(df)
print(len(df_90texts), len(df_400texts))

In [0]:
# Number of rows kept per class when balanced_df truncates each dataset
# before the train/test split (see the TODO above about increasing it).
N = 20

In [0]:
def clean_tqdm():
    """
    Calls tqdm._decr_instances on every progress bar currently tracked in
    tqdm._instances.

    Presumably used to get rid of bars left over from earlier cells so that
    new bars render cleanly -- confirm against the tqdm version in use.
    """
    # Snapshot first: the registry is not iterated directly while
    # _decr_instances is being called on its members.
    leftover_bars = list(tqdm._instances)
    while leftover_bars:
        tqdm._decr_instances(leftover_bars.pop(0))

def balanced_df(df, size):
    """
    Build a class-balanced subset of df with at most `size` rows per class.

    The two classes are the rows whose 'language_background' is 'FL' and 'HL'.
    Rows are taken in their existing order (the first rows of each class, no
    shuffling) and keep their original index labels; the 'FL' rows come first
    in the result, followed by the 'HL' rows.

    If either class has fewer than `size` rows, both classes are cut to the
    smaller count so that the result is always balanced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'language_background' column.
    size : int or None
        Non-negative maximum number of rows to keep per class.
        None keeps as many rows per class as the smaller class allows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of 'FL' and 'HL' rows.
    """
    fl_rows = df.loc[df['language_background'] == 'FL']
    hl_rows = df.loc[df['language_background'] == 'HL']

    # Never take more rows from one class than the other class can supply.
    smallest_class = min(len(fl_rows), len(hl_rows))
    per_class = smallest_class if size is None else min(size, smallest_class)

    return pd.concat([fl_rows.iloc[:per_class], hl_rows.iloc[:per_class]])

In [0]:
# Cut each dataset to N rows per class with balanced_df, then split it
# 80% train / 20% test. random_state is fixed so the split is repeatable.
train_90texts, test_90texts = train_test_split(balanced_df(df_90texts, N), train_size=0.8,  random_state=42)
train_400texts, test_400texts = train_test_split(balanced_df(df_400texts,N), train_size=0.8, random_state=42)

In [102]:
# Sanity check: row counts of the train and test splits for both datasets.
len(train_90texts), len(train_400texts), len(test_90texts), len(test_400texts)

(32, 32, 8, 8)

In [0]:
def run_model(df, df_name, hyperparam, p_lengths=None):
    """
    Runs EnsambleClassifier on one profile column of df and reports accuracy.

    The labels passed to the classifier are the same ones its predictions are
    scored against, i.e. the accuracy is measured on the rows of df itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'language_background' column (the labels) and the
        column named by `hyperparam`.
    df_name : str
        Name of the dataset; only used in the printed report.
    hyperparam : str
        Name of the df column whose values are given to the classifier as
        profiles (e.g. 'word_unigrams', 'character 6-grams').
    p_lengths : sequence of int, optional
        Profile lengths handed to EnsambleClassifier.
        Defaults to [200, 500, 1000, 1500, 2000, 2500, 3000].

    Returns
    -------
    float
        The accuracy computed by accuracy_score (it is also printed), so that
        callers can collect results instead of reading them off the output.
    """
    if p_lengths is None:
        p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]

    classes = list(df['language_background'].unique())
    y_true = list(df['language_background'])

    # Clear progress bars left by earlier runs before new ones are drawn.
    clean_tqdm()

    profs = list(df[hyperparam])
    # A single profile type is used per run, hence the one-element list.
    ec = EnsambleClassifier([profs], y_true, list(p_lengths), classes=classes)
    y_pred = ec.forward_ensamble(confidence=False)

    print()
    acc = accuracy_score(y_true, y_pred)
    print('Accuracy_score = {} for dataset {} and hyperparam {}'.format(acc, df_name, hyperparam))
    return acc

# 90 texts

In [104]:
# 90 texts, training split: word unigram profiles
run_model(train_90texts, 'train_90texts', 'word_unigrams')

100%|██████████| 7/7 [00:08<00:00,  1.24s/it]


Accuracy_score = 0.375 for dataset train_90texts and hyperparam word_unigrams





In [105]:
# 90 texts, training split: word bigram profiles
run_model(train_90texts, 'train_90texts', 'word_bigrams')

100%|██████████| 7/7 [00:10<00:00,  1.45s/it]


Accuracy_score = 0.1875 for dataset train_90texts and hyperparam word_bigrams





In [106]:
# 90 texts, training split: word trigram profiles
run_model(train_90texts, 'train_90texts', 'word_trigrams')

100%|██████████| 7/7 [00:09<00:00,  1.41s/it]


Accuracy_score = 0.4375 for dataset train_90texts and hyperparam word_trigrams





In [107]:
# 90 texts, training split: character 3-gram profiles
run_model(train_90texts, 'train_90texts', 'character 3-grams')

100%|██████████| 7/7 [00:56<00:00,  8.07s/it]


Accuracy_score = 0.53125 for dataset train_90texts and hyperparam character 3-grams





In [109]:
# 90 texts, training split: character 4-gram profiles
run_model(train_90texts, 'train_90texts', 'character 4-grams')

100%|██████████| 7/7 [01:08<00:00,  9.72s/it]


Accuracy_score = 0.53125 for dataset train_90texts and hyperparam character 4-grams





In [110]:
# 90 texts, training split: character 5-gram profiles
run_model(train_90texts, 'train_90texts', 'character 5-grams')

100%|██████████| 7/7 [01:11<00:00, 10.28s/it]


Accuracy_score = 0.5 for dataset train_90texts and hyperparam character 5-grams





In [111]:
# 90 texts, training split: character 6-gram profiles
run_model(train_90texts, 'train_90texts', 'character 6-grams')

100%|██████████| 7/7 [01:14<00:00, 10.61s/it]


Accuracy_score = 0.5625 for dataset train_90texts and hyperparam character 6-grams





In [112]:
# 90 texts, training split: character 7-gram profiles
run_model(train_90texts, 'train_90texts', 'character 7-grams')

100%|██████████| 7/7 [01:15<00:00, 10.82s/it]


Accuracy_score = 0.46875 for dataset train_90texts and hyperparam character 7-grams





In [113]:
# 90 texts, training split: character 8-gram profiles
run_model(train_90texts, 'train_90texts', 'character 8-grams')

100%|██████████| 7/7 [01:16<00:00, 10.95s/it]


Accuracy_score = 0.40625 for dataset train_90texts and hyperparam character 8-grams





In [114]:
# 90 texts, training split: character 9-gram profiles
run_model(train_90texts, 'train_90texts', 'character 9-grams')

100%|██████████| 7/7 [01:17<00:00, 11.14s/it]


Accuracy_score = 0.4375 for dataset train_90texts and hyperparam character 9-grams





In [115]:
# 90 texts, training split: character 10-gram profiles
run_model(train_90texts, 'train_90texts', 'character 10-grams')

100%|██████████| 7/7 [01:19<00:00, 11.31s/it]


Accuracy_score = 0.4375 for dataset train_90texts and hyperparam character 10-grams





# 400 texts

In [126]:
# 400 texts, training split: word unigram profiles
# NOTE(review): every accuracy recorded for train_400texts in this section is
# identical to the corresponding train_90texts result -- check whether taking
# the first N rows per class makes both subsets contain the same texts.
run_model(train_400texts, 'train_400texts', 'word_unigrams')

100%|██████████| 7/7 [00:08<00:00,  1.19s/it]


Accuracy_score = 0.375 for dataset train_400texts and hyperparam word_unigrams





In [127]:
# 400 texts, training split: word bigram profiles
run_model(train_400texts, 'train_400texts', 'word_bigrams')

100%|██████████| 7/7 [00:09<00:00,  1.39s/it]


Accuracy_score = 0.1875 for dataset train_400texts and hyperparam word_bigrams





In [128]:
# 400 texts, training split: word trigram profiles
run_model(train_400texts, 'train_400texts', 'word_trigrams')

100%|██████████| 7/7 [00:09<00:00,  1.40s/it]


Accuracy_score = 0.4375 for dataset train_400texts and hyperparam word_trigrams





In [129]:
# 400 texts, training split: character 3-gram profiles
run_model(train_400texts, 'train_400texts', 'character 3-grams')

100%|██████████| 7/7 [00:57<00:00,  8.22s/it]


Accuracy_score = 0.53125 for dataset train_400texts and hyperparam character 3-grams





In [130]:
# 400 texts, training split: character 4-gram profiles
run_model(train_400texts, 'train_400texts', 'character 4-grams')

100%|██████████| 7/7 [01:08<00:00,  9.84s/it]


Accuracy_score = 0.53125 for dataset train_400texts and hyperparam character 4-grams





In [131]:
# 400 texts, training split: character 5-gram profiles
run_model(train_400texts, 'train_400texts', 'character 5-grams')

100%|██████████| 7/7 [01:11<00:00, 10.22s/it]


Accuracy_score = 0.5 for dataset train_400texts and hyperparam character 5-grams





In [132]:
# 400 texts, training split: character 6-gram profiles
run_model(train_400texts, 'train_400texts', 'character 6-grams')

100%|██████████| 7/7 [01:14<00:00, 10.68s/it]


Accuracy_score = 0.5625 for dataset train_400texts and hyperparam character 6-grams





In [133]:
# 400 texts, training split: character 7-gram profiles
run_model(train_400texts, 'train_400texts', 'character 7-grams')

100%|██████████| 7/7 [01:14<00:00, 10.66s/it]


Accuracy_score = 0.46875 for dataset train_400texts and hyperparam character 7-grams





In [134]:
# 400 texts, training split: character 8-gram profiles
run_model(train_400texts, 'train_400texts', 'character 8-grams')

100%|██████████| 7/7 [01:15<00:00, 10.85s/it]


Accuracy_score = 0.40625 for dataset train_400texts and hyperparam character 8-grams





In [135]:
# 400 texts, training split: character 9-gram profiles
run_model(train_400texts, 'train_400texts', 'character 9-grams')

100%|██████████| 7/7 [01:15<00:00, 10.84s/it]


Accuracy_score = 0.4375 for dataset train_400texts and hyperparam character 9-grams





In [136]:
# 400 texts, training split: character 10-gram profiles
run_model(train_400texts, 'train_400texts', 'character 10-grams')

100%|██████████| 7/7 [01:15<00:00, 10.81s/it]


Accuracy_score = 0.4375 for dataset train_400texts and hyperparam character 10-grams





# Running best models on test data

In [137]:
# 90 texts, character 6-gram profiles (highest accuracy recorded on the
# training split above): run on the training split and then on the test split.
run_model(train_90texts, 'train_90texts', 'character 6-grams')
run_model(test_90texts, 'test_90texts', 'character 6-grams')

100%|██████████| 7/7 [01:13<00:00, 10.53s/it]
  0%|          | 0/7 [00:00<?, ?it/s]


Accuracy_score = 0.5625 for dataset train_90texts and hyperparam character 6-grams


100%|██████████| 7/7 [00:01<00:00,  6.23it/s]


Accuracy_score = 0.25 for dataset test_90texts and hyperparam character 6-grams





In [138]:
# 90 texts, word bigram profiles: training split, then test split.
# NOTE(review): word bigrams had the lowest recorded training accuracy (0.1875);
# confirm this profile is meant to be part of the "best models" section.
run_model(train_90texts, 'train_90texts', 'word_bigrams')
run_model(test_90texts, 'test_90texts', 'word_bigrams')

100%|██████████| 7/7 [00:10<00:00,  1.44s/it]
100%|██████████| 7/7 [00:00<00:00, 41.03it/s]


Accuracy_score = 0.1875 for dataset train_90texts and hyperparam word_bigrams

Accuracy_score = 0.25 for dataset test_90texts and hyperparam word_bigrams





In [141]:
# 90 texts, word trigram profiles: test split only.
run_model(test_90texts, 'test_90texts', 'word_trigrams')

100%|██████████| 7/7 [00:00<00:00, 39.92it/s]


Accuracy_score = 0.625 for dataset test_90texts and hyperparam word_trigrams



