In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

In [4]:
df = get_dataset('/content/original_texts.csv')
df_90texts, df_400texts = balanced_datasets(df)
print(len(df_90texts), len(df_400texts))

100%|██████████| 8/8 [00:49<00:00,  6.19s/it]


810 1600


In [0]:
N = 50

In [0]:
def clean_tqdm():
    for instance in list(tqdm._instances): 
        tqdm._decr_instances(instance)

def balanced_df(df, size):
    """
    balances df and truncates it to size parameter
    """
    df_1 = df.loc[df['language_background'] == 'FL'][:size]
    df_2 = df.loc[df['language_background'] == 'HL'][:size]
    df = pd.concat([df_1, df_2])
    return df

In [0]:
df_90 = balanced_df(df_90texts, N)
df_400 = balanced_df(df_400texts, N)

In [0]:
train_90texts, test_90texts = train_test_split(df_90, train_size=0.8,  random_state=42, stratify=df_90['language_background'])
train_400texts, test_400texts = train_test_split(df_400, train_size=0.8, random_state=42, stratify=df_400['language_background'])

In [9]:
len(train_90texts), len(train_400texts), len(test_90texts), len(test_400texts)

(80, 80, 20, 20)

In [0]:
def run_model(df, df_name,  hyperparam):
    """
    runs EnsambleClassifier on  given DataFrame (df) with give profiles (hyperparam)
    """
    classes = list(df['language_background'].unique())
    y_true = list(df['language_background'])
    p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]

    clean_tqdm()

    profs = list(df[hyperparam])
    ec = EnsambleClassifier([profs], y_true, p_lengths, classes=classes)
    classifiers = ec.classifiers
    y_pred = ec.forward_ensamble(confidence=False)

    print()
    acc = accuracy_score(y_true, y_pred)
    print('Accuracy_score = {} for dataset {} and hyperparam {}'.format(acc, df_name, hyperparam)) 

# 90 texts

In [0]:
# word unigrams
run_model(test_90texts, 'train_90texts', 'word_unigrams')

100%|██████████| 7/7 [00:02<00:00,  2.54it/s]


Accuracy_score = 0.5 for dataset train_90texts and hyperparam word_unigrams





In [0]:
# word bigrams
run_model(train_90texts, 'train_90texts', 'word_bigrams')

100%|██████████| 7/7 [03:12<00:00, 27.50s/it]


Accuracy_score = 0.675 for dataset train_90texts and hyperparam word_bigrams





In [0]:
# word trigrams
run_model(train_90texts, 'train_90texts', 'word_trigrams')

100%|██████████| 7/7 [03:09<00:00, 27.11s/it]


Accuracy_score = 0.6 for dataset train_90texts and hyperparam word_trigrams





In [0]:
# Character 4-grams
run_model(train_90texts, 'train_90texts', 'character 4-grams')

100%|██████████| 7/7 [21:45<00:00, 186.57s/it]


Accuracy_score = 0.5875 for dataset train_90texts and hyperparam character 4-grams





In [0]:
# Character 5-grams
run_model(train_90texts, 'train_90texts', 'character 5-grams')

100%|██████████| 7/7 [22:59<00:00, 197.02s/it]


Accuracy_score = 0.5625 for dataset train_90texts and hyperparam character 5-grams





In [0]:
# Character 6-grams
run_model(train_90texts, 'train_90texts', 'character 6-grams')

100%|██████████| 7/7 [23:52<00:00, 204.67s/it]


Accuracy_score = 0.5625 for dataset train_90texts and hyperparam character 6-grams





In [0]:
# Character 7-grams
run_model(train_90texts, 'train_90texts', 'character 7-grams')

100%|██████████| 7/7 [24:20<00:00, 208.70s/it]


Accuracy_score = 0.5875 for dataset train_90texts and hyperparam character 7-grams





In [0]:
# Character 8-grams
run_model(train_90texts, 'train_90texts', 'character 8-grams')

100%|██████████| 7/7 [24:56<00:00, 213.77s/it]


Accuracy_score = 0.65 for dataset train_90texts and hyperparam character 8-grams





In [0]:
# Character 9-grams
run_model(train_90texts, 'train_90texts', 'character 9-grams')

100%|██████████| 7/7 [25:06<00:00, 215.27s/it]


Accuracy_score = 0.65 for dataset train_90texts and hyperparam character 9-grams





In [0]:
# Character 10-grams
run_model(train_90texts, 'train_90texts', 'character 10-grams')

 86%|████████▌ | 6/7 [2:35:28<25:54, 1554.78s/it]
  0%|          | 0/7 [2:30:21<?, ?it/s]
100%|██████████| 7/7 [25:08<00:00, 215.45s/it]


Accuracy_score = 0.675 for dataset train_90texts and hyperparam character 10-grams





# 400 texts

In [0]:
# word_unigrams
run_model(train_400texts, 'train_400texts', 'word_unigrams')

100%|██████████| 7/7 [02:41<00:00, 23.13s/it]


Accuracy_score = 0.4875 for dataset train_400texts and hyperparam word_unigrams





In [0]:
# word_bigrams
run_model(train_400texts, 'train_400texts', 'word_bigrams')

100%|██████████| 7/7 [03:11<00:00, 27.30s/it]


Accuracy_score = 0.6625 for dataset train_400texts and hyperparam word_bigrams





In [0]:
# word_trigrams
run_model(train_400texts, 'train_400texts', 'word_trigrams')

100%|██████████| 7/7 [03:12<00:00, 27.45s/it]


Accuracy_score = 0.575 for dataset train_400texts and hyperparam word_trigrams





In [0]:
# character 3-grams
run_model(train_400texts, 'train_400texts', 'character 3-grams')

100%|██████████| 7/7 [18:00<00:00, 154.30s/it]


Accuracy_score = 0.5 for dataset train_400texts and hyperparam character 3-grams





In [0]:
# character 4-grams
run_model(train_400texts, 'train_400texts', 'character 4-grams')

100%|██████████| 7/7 [21:41<00:00, 185.89s/it]


Accuracy_score = 0.5 for dataset train_400texts and hyperparam character 4-grams





In [0]:
# character 5-grams
run_model(train_400texts, 'train_400texts', 'character 5-grams')

100%|██████████| 7/7 [22:57<00:00, 196.75s/it]


Accuracy_score = 0.525 for dataset train_400texts and hyperparam character 5-grams





In [11]:
# character 6-grams
run_model(train_400texts, 'train_400texts', 'character 6-grams')

100%|██████████| 7/7 [18:10<00:00, 155.79s/it]


Accuracy_score = 0.525 for dataset train_400texts and hyperparam character 6-grams





In [12]:
# character 7-grams
run_model(train_400texts, 'train_400texts', 'character 7-grams')

100%|██████████| 7/7 [18:32<00:00, 158.94s/it]


Accuracy_score = 0.625 for dataset train_400texts and hyperparam character 7-grams





In [13]:
# Character 8-grams
run_model(train_400texts, 'train_400texts', 'character 8-grams')

100%|██████████| 7/7 [18:44<00:00, 160.62s/it]


Accuracy_score = 0.65 for dataset train_400texts and hyperparam character 8-grams





In [14]:
# character 9-grams
run_model(train_400texts, 'train_400texts', 'character 9-grams')

100%|██████████| 7/7 [18:43<00:00, 160.55s/it]


Accuracy_score = 0.675 for dataset train_400texts and hyperparam character 9-grams





In [15]:
# character 10-grams
run_model(train_400texts, 'train_400texts', 'character 10-grams')

100%|██████████| 7/7 [18:53<00:00, 161.90s/it]


Accuracy_score = 0.6375 for dataset train_400texts and hyperparam character 10-grams





# Running best models on test data

## N = 50, 0.7 on test_90text with word_bigrams

In [16]:
# word bigrams
run_model(train_90texts, 'train_90texts', 'word_bigrams')
run_model(test_90texts, 'test_90texts', 'word_bigrams')

100%|██████████| 7/7 [02:21<00:00, 20.26s/it]
  0%|          | 0/7 [00:00<?, ?it/s]


Accuracy_score = 0.675 for dataset train_90texts and hyperparam word_bigrams


100%|██████████| 7/7 [00:02<00:00,  3.11it/s]


Accuracy_score = 0.7 for dataset test_90texts and hyperparam word_bigrams





In [18]:
# word_bigrams
# run_model(train_400texts, 'train_400texts', 'word_bigrams')
run_model(test_400texts, 'test_400texts', 'word_bigrams')

100%|██████████| 7/7 [00:02<00:00,  3.07it/s]


Accuracy_score = 0.6 for dataset test_400texts and hyperparam word_bigrams



