In [1]:
import torch
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
print("DEVICE NAME:", torch.cuda.get_device_name(0))
print("DEVICE COUNT:", torch.cuda.device_count())
print("CURRENT DEVICE:", torch.cuda.current_device())
import warnings
warnings.filterwarnings('ignore')

DEVICE NAME: GeForce GTX 1080
DEVICE COUNT: 1
CURRENT DEVICE: 0


In [2]:
from fastai import *
from fastai.text import *
from fastai.core import *
from pathlib import Path
import pandas as pd
import numpy as np
%cd ..
from ulmfit.pretrain_lm import *
from fastai.callbacks import CSVLogger, SaveModelCallback


/home/mkreso/master_thesis/ulmfit-multilingual


In [3]:
wiki_data_path = Path('data/wiki/hr-100/')

# PERFORMANCE OF ULMFiT EMBEDDINGS ON WSD TASK¶

## 1. train-test split (400 in train, 100 in test)

In [4]:
def train_lm(name):    
    data_lm = TextLMDataBunch.from_csv(wiki_data_path, name + '_ulmfit.csv', text_cols=0, bs=30)
    itos, stoi, data_path = data_lm.vocab.itos, data_lm.vocab.stoi, data_lm.path
    pretrained_fnames = ['hr-100-best', 'itos']
    learner = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=pretrained_fnames, drop_mult=0.9, 
                             model_dir='./models')
    learner.freeze()
    learner.opt_func = partial(optim.Adam, betas=(0.8, 0.99))
    learner.lr_find()
    #learner.recorder.plot(skip_end=15)
    learner.fit_one_cycle(1, 1e-2)
    learner.unfreeze()
    learner.fit_one_cycle(5, 1e-3, moms=(0.8,0.7))
    #learner.save('lm_fine_tuned_wsd')
    learner.save_encoder('ft_enc_wsd_' + name)
    return data_lm

In [5]:
def train_classifier(name, data_lm):
    df_class = pd.read_csv('data/wiki/hr-100/' + name + '_ulmfit.csv')
    df_train, df_test = train_test_split(df_class, test_size=0.2, random_state=0)
    df_train, df_valid = train_test_split(df_train, test_size=0.1, random_state=0)
    data_class = TextClasDataBunch.from_df(path=wiki_data_path,
                                          train_df=df_train,
                                          valid_df=df_valid,
                                           test_df=df_test,
                                          vocab=data_lm.vocab, text_cols='sentence', label_cols='sense_id', bs=30)
    learner = text_classifier_learner(data_class, AWD_LSTM, drop_mult=0.5)
    learner.load_encoder('ft_enc_wsd_' + name)
    learner.freeze()

    learner.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
    learner.freeze_to(-2)
    learner.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))
    learner.freeze_to(-3)
    learner.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))
    learner.unfreeze()
    learner.fit_one_cycle(5, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))
    
    p = learner.get_preds(DatasetType.Test, ordered=True)[0].numpy()
    predictions = np.argmax(p, axis=1)
    
    df_test['label'] = 0
    for i in range(1, len(data_class.train_ds.classes)):
        df_test.loc[df_test.sense_id == data_class.train_ds.classes[i], 'label' ] = i
    print("ACCURACY -", name + ':', accuracy_score(df_test.label.values, predictions))

In [182]:
data_lm = train_lm('okvir')
train_classifier('okvir', data_lm)

ACCURACY - okvir: 0.94


In [183]:
data_lm = train_lm('prljav')
train_classifier('prljav', data_lm)

ACCURACY - prljav: 0.73


In [184]:
data_lm = train_lm('lak')
train_classifier('lak', data_lm)

ACCURACY - lak: 0.78


In [185]:
data_lm = train_lm('vatra')
train_classifier('vatra', data_lm)

ACCURACY - vatra: 0.84


In [186]:
data_lm = train_lm('brusiti')
train_classifier('brusiti', data_lm)

ACCURACY - brusiti: 0.76


In [187]:
data_lm = train_lm('odlikovati')
train_classifier('odlikovati', data_lm)

ACCURACY - odlikovati: 0.99


In [6]:
def train_lm_v2(name):    
    data_lm = TextLMDataBunch.from_csv(wiki_data_path, name + '_ulmfit.csv', text_cols=0, bs=30)
    itos, stoi, data_path = data_lm.vocab.itos, data_lm.vocab.stoi, data_lm.path
    pretrained_fnames = ['hr-100-best', 'itos']
    learner = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=pretrained_fnames, drop_mult=0.9, 
                             model_dir='./models')
    
    learner.save_encoder('ft_enc_wsd_' + name)
    return data_lm

In [7]:
data_lm = train_lm_v2('okvir')
train_classifier('okvir', data_lm)

ACCURACY - okvir: 0.87


In [8]:
data_lm = train_lm_v2('prljav')
train_classifier('prljav', data_lm)

ACCURACY - prljav: 0.67


In [9]:
data_lm = train_lm_v2('lak')
train_classifier('lak', data_lm)

ACCURACY - lak: 0.77


In [10]:
data_lm = train_lm_v2('vatra')
train_classifier('vatra', data_lm)

ACCURACY - vatra: 0.85


In [11]:
data_lm = train_lm_v2('brusiti')
train_classifier('brusiti', data_lm)

ACCURACY - brusiti: 0.76


In [12]:
data_lm = train_lm_v2('odlikovati')
train_classifier('odlikovati', data_lm)

ACCURACY - odlikovati: 0.99


## 2. 5 fold cross validation

In [15]:
def cv_ulmfit(name, data_lm):
    df = pd.read_csv('data/wiki/hr-100/' + name + '_ulmfit.csv')
    kf = KFold(n_splits=5, random_state=0, shuffle=True)
    
    features = df.sentence.values
    labels = df.sense_id.values
    accuracies = []
    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        df_train = pd.DataFrame({'sentence': X_train, 'sense_id': y_train})
        df_test = pd.DataFrame({'sentence': X_test, 'sense_id': y_test})
        
        df_train, df_valid = train_test_split(df_train, test_size=0.1, random_state=0)
        data_class = TextClasDataBunch.from_df(path=wiki_data_path,
                                          train_df=df_train,
                                          valid_df=df_valid,
                                           test_df=df_test,
                                          vocab=data_lm.vocab, text_cols='sentence', label_cols='sense_id', bs=30)
        learner = text_classifier_learner(data_class, AWD_LSTM, drop_mult=0.5)
        learner.load_encoder('ft_enc_wsd_' + name)
        learner.freeze()

        learner.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
        learner.freeze_to(-2)
        learner.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))
        learner.freeze_to(-3)
        learner.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))
        learner.unfreeze()
        learner.fit_one_cycle(5, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))

        p = learner.get_preds(DatasetType.Test, ordered=True)[0].numpy()
        predictions = np.argmax(p, axis=1)

        df_test['label'] = 0
        for i in range(1, len(data_class.train_ds.classes)):
            df_test.loc[df_test.sense_id == data_class.train_ds.classes[i], 'label' ] = i
        accuracies.append(accuracy_score(df_test.label.values, predictions))
    print('ACCURACY:', sum(accuracies) / len(accuracies))
    return accuracies

        
                                
                               

In [7]:
data_lm = train_lm('okvir')
cv_ulmfit('okvir', data_lm)

ACCURACY: 0.9019999999999999


[0.84, 0.88, 0.96, 0.89, 0.94]

In [8]:
data_lm = train_lm('prljav')
cv_ulmfit('prljav', data_lm)

ACCURACY: 0.758


[0.76, 0.75, 0.71, 0.81, 0.76]

In [9]:
data_lm = train_lm('lak')
cv_ulmfit('lak', data_lm)

ACCURACY: 0.7999999999999999


[0.77, 0.74, 0.79, 0.86, 0.84]

In [10]:
data_lm = train_lm('vatra')
cv_ulmfit('vatra', data_lm)

ACCURACY: 0.86


[0.86, 0.9, 0.86, 0.82, 0.86]

In [11]:
data_lm = train_lm('brusiti')
cv_ulmfit('brusiti', data_lm)

ACCURACY: 0.792


[0.74, 0.8, 0.79, 0.78, 0.85]

In [12]:
data_lm = train_lm('odlikovati')
cv_ulmfit('odlikovati', data_lm)

ACCURACY: 0.984


[1.0, 0.99, 0.98, 0.98, 0.97]

In [16]:
data_lm = train_lm_v2('okvir')
cv_ulmfit('okvir', data_lm)

ACCURACY: 0.906


[0.86, 0.91, 0.91, 0.92, 0.93]

In [17]:
data_lm = train_lm_v2('prljav')
cv_ulmfit('prljav', data_lm)

ACCURACY: 0.724


[0.69, 0.7, 0.7, 0.78, 0.75]

In [18]:
data_lm = train_lm_v2('lak')
cv_ulmfit('lak', data_lm)

ACCURACY: 0.788


[0.74, 0.76, 0.78, 0.84, 0.82]

In [19]:
data_lm = train_lm_v2('vatra')
cv_ulmfit('vatra', data_lm)

ACCURACY: 0.8280000000000001


[0.81, 0.87, 0.84, 0.81, 0.81]

In [20]:
data_lm = train_lm_v2('brusiti')
cv_ulmfit('brusiti', data_lm)

ACCURACY: 0.784


[0.77, 0.77, 0.78, 0.77, 0.83]

In [21]:
data_lm = train_lm_v2('odlikovati')
cv_ulmfit('odlikovati', data_lm)

ACCURACY: 0.976


[1.0, 0.99, 0.98, 0.95, 0.96]

In [227]:
a = np.array(a)

In [228]:
stats.shapiro(a)

(0.8345866799354553, 0.15050582587718964)

In [229]:
stats.kstest(a,'norm')

KstestResult(statistic=0.7389137003071384, pvalue=0.0025292220467791346)

In [224]:
result = stats.anderson(a, dist='norm')
for i in range(len(result.critical_values)):
    sl, cv = result.significance_level[i], result.critical_values[i]
    if result.statistic < result.critical_values[i]:
        print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
    else:
        print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))


15.000: 0.720, data looks normal (fail to reject H0)
10.000: 0.820, data looks normal (fail to reject H0)
5.000: 0.984, data looks normal (fail to reject H0)
2.500: 1.148, data looks normal (fail to reject H0)
1.000: 1.365, data looks normal (fail to reject H0)
