In [0]:
from modules.preprocessing import get_dataset, balanced_datasets
from modules.distances import distance, radius, radius_distance
from modules.single_classifier import SingleClassifier
from modules.ensamble_classifier import EnsambleClassifier
from modules.ulits import softmax

import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [0]:
!wget https://github.com/ovbystrova/Interference/raw/master/data/original_texts.csv

In [0]:
# Load the corpus CSV, let `balanced_datasets` split it into the two corpora
# used below, and print the row count of each.
df = get_dataset('/content/original_texts.csv')
df_90texts, df_400texts = balanced_datasets(df)
print(*(len(corpus) for corpus in (df_90texts, df_400texts)))

100%|██████████| 8/8 [00:59<00:00,  7.46s/it]


810 1600


In [0]:
N = 10  # passed as `size` when the corpora are truncated: rows kept per native language

In [0]:
def clean_tqdm():
    """
    unregisters every progress bar tqdm currently lists in `tqdm._instances`
    """
    # Iterate over a snapshot so the loop is unaffected if the registry
    # changes while bars are being unregistered.
    leftover_bars = list(tqdm._instances)
    for bar in leftover_bars:
        tqdm._decr_instances(bar)

def balanced_df90(df, size,
                  languages=('swe', 'fr', 'ita', 'eng', 'ger', 'jap', 'kor', 'kaz', 'fin')):
    """
    balances df and truncates it to size parameter

    Keeps at most `size` rows for every language code in `languages`
    (matched against the 'native' column) and stacks the per-language slices
    in the order the codes are listed. Rows whose 'native' value is not
    listed are dropped. Within one language the original row order and index
    labels are preserved. The result is only truly balanced when every
    listed language has at least `size` rows in df.

    :param df: DataFrame with a 'native' column
    :param size: maximum number of rows kept per language
    :param languages: language codes to keep, in output order; defaults to
        the nine codes of the 90-texts corpus
    :return: a new DataFrame; df itself is not modified
    """
    native = df['native']
    # One positional slice per language; iloc keeps the truncation
    # independent of the index labels.
    per_language = [df.loc[native == code].iloc[:size] for code in languages]
    if not per_language:
        # pd.concat raises on an empty list; return an empty frame with the
        # same columns instead.
        return df.iloc[0:0]
    return pd.concat(per_language)

def balanced_df400(df, size, languages=('eng', 'jap', 'kaz', 'fin')):
    """
    balances df and truncates it to size parameter

    Keeps at most `size` rows for every language code in `languages`
    (matched against the 'native' column) and stacks the per-language slices
    in the order the codes are listed. Rows whose 'native' value is not
    listed are dropped. Within one language the original row order and index
    labels are preserved. The result is only truly balanced when every
    listed language has at least `size` rows in df.

    :param df: DataFrame with a 'native' column
    :param size: maximum number of rows kept per language
    :param languages: language codes to keep, in output order; defaults to
        the four codes of the 400-texts corpus
    :return: a new DataFrame; df itself is not modified
    """
    native = df['native']
    # One positional slice per language; iloc keeps the truncation
    # independent of the index labels.
    per_language = [df.loc[native == code].iloc[:size] for code in languages]
    if not per_language:
        # pd.concat raises on an empty list; return an empty frame with the
        # same columns instead.
        return df.iloc[0:0]
    return pd.concat(per_language)

In [0]:
# Truncate each corpus to N texts per native language. The two corpora cover
# different language sets, so each goes through its own helper
# (`balanced_df90` / `balanced_df400`); no plain `balanced_df` exists in this
# notebook or its imports.
df_90texts = balanced_df90(df_90texts, N)
df_400texts = balanced_df400(df_400texts, N)

In [0]:
# 80/20 split of each truncated corpus, stratified on the native-language
# label and seeded so the partition is the same on every run.
train_90texts, test_90texts = train_test_split(
    df_90texts,
    stratify=df_90texts['native'],
    train_size=0.8,
    random_state=42,
)
train_400texts, test_400texts = train_test_split(
    df_400texts,
    stratify=df_400texts['native'],
    train_size=0.8,
    random_state=42,
)

In [0]:
# Row counts in the order: train 90, train 400, test 90, test 400.
tuple(len(part) for part in (train_90texts, train_400texts, test_90texts, test_400texts))

(72, 32, 18, 8)

In [0]:
def run_model(df, df_name, hyperparam):
    """
    runs EnsambleClassifier on the given DataFrame (df) with the given profiles (hyperparam)

    The labels in df['native'] are passed to the classifier constructor and
    are also the reference the predictions are scored against.
    NOTE(review): that means the score is computed on the same rows that
    were handed to the classifier -- confirm this is the intended protocol.

    :param df: DataFrame with a 'native' label column and a `hyperparam` column
    :param df_name: name of df, used only in the printed report
    :param hyperparam: name of the df column that holds the profiles
    :return: micro-averaged F1 score of the ensemble predictions (the same
        value that is printed)
    """
    y_true = list(df['native'])
    classes = list(df['native'].unique())
    p_lengths = [200, 500, 1000, 1500, 2000, 2500, 3000]

    # Drop progress bars left over from earlier cells before the classifier
    # is built (presumably it draws its own bar -- see the cell outputs).
    clean_tqdm()

    profs = list(df[hyperparam])
    ec = EnsambleClassifier([profs], y_true, p_lengths, classes=classes)
    y_pred = ec.forward_ensamble(confidence=False)

    print()  # blank line between the progress bar and the report
    score = f1_score(y_true, y_pred, average='micro')
    print('F1_score = {} for dataset {} and hyperparam {}'.format(score, df_name, hyperparam))
    return score

# 90 texts

In [0]:
# word unigrams: score the ensemble on the 90-texts training split using the 'word_unigrams' column
run_model(train_90texts, 'train_90texts', 'word_unigrams')

100%|██████████| 7/7 [00:25<00:00,  3.61s/it]


F1_score = 0.3472222222222222 for dataset train_90texts and hyperparam word_unigrams





In [0]:
# word bigrams: score the ensemble on the 90-texts training split using the 'word_bigrams' column
run_model(train_90texts, 'train_90texts', 'word_bigrams')

100%|██████████| 7/7 [00:30<00:00,  4.35s/it]


F1_score = 0.3611111111111111 for dataset train_90texts and hyperparam word_bigrams





In [0]:
# word trigrams: score the ensemble on the 90-texts training split using the 'word_trigrams' column
run_model(train_90texts, 'train_90texts', 'word_trigrams')

100%|██████████| 7/7 [00:30<00:00,  4.42s/it]


F1_score = 0.2638888888888889 for dataset train_90texts and hyperparam word_trigrams





In [0]:
# character 3-grams: score the ensemble on the 90-texts training split.
# Note that this column name contains a space and a hyphen, unlike the word_* columns.
run_model(train_90texts, 'train_90texts', 'character 3-grams')

100%|██████████| 7/7 [02:47<00:00, 23.98s/it]


F1_score = 0.20833333333333334 for dataset train_90texts and hyperparam character 3-grams





# 400 texts

# On test data

In [0]:
# word bigrams: score the ensemble on the held-out 90-texts test split using the 'word_bigrams' column
run_model(test_90texts, 'test_90texts', 'word_bigrams')

  return(distance(di, u)/np.max([distance(di, ai) for ai in A]))
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 7/7 [00:00<00:00, 10.01it/s]


F1_score = 0.05555555555555555 for dataset test_90texts and hyperparam word_bigrams



