In [1]:
import torch
# Report the CUDA device this kernel will use. The queries are guarded because
# torch.cuda.get_device_name(0) raises when no CUDA device is usable, which
# would abort the very first cell with an unhelpful traceback.
if torch.cuda.is_available():
    print("DEVICE NAME:", torch.cuda.get_device_name(0))
    print("DEVICE COUNT:", torch.cuda.device_count())
    print("CURRENT DEVICE:", torch.cuda.current_device())
else:
    print("CUDA is not available: no GPU device to report.")

DEVICE NAME: GeForce GTX 1080
DEVICE COUNT: 1
CURRENT DEVICE: 0


In [1]:
from fastai import *
from fastai.text import *
from fastai.core import *
from pathlib import Path
import pandas as pd
import numpy as np
%cd ..
from ulmfit.pretrain_lm import *
from fastai.callbacks import CSVLogger, SaveModelCallback


/home/mkreso/master_thesis/ulmfit-multilingual


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
import warnings
# Install a catch-all "ignore" filter: no category, message or module is given,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Working folder handed to fastai as `path`: the saved language-model data is
# loaded from here and the classifier data bunches are created with it.
# The path is relative, so it depends on the current working directory.
wiki_data_path = Path('data/wiki/hr-100/')

In [5]:
# Load the labelled dataset for inspection. The preview below shows three
# columns: `Unnamed: 0`, `text` and `target`.
# NOTE(review): `Unnamed: 0` looks like a row index that was written into the
# CSV — confirm, and consider `index_col=0` if nothing relies on that column.
# `cv_ulmfit` reads this same CSV on its own, so in the cells shown here
# `df_class` is only used for the preview.
df_class = pd.read_csv('experiments/labeled_distant.csv')

In [6]:
# Sanity check: display the first rows of the loaded frame.
df_class.head()

Unnamed: 0,text,target
0,@diidlina njamiiiiii :(\n,Neg
1,"Cura: sretan ti rođendan,izvoli poklon Dečko:...",Neg
2,Objavi sliku i u opis stavi “Nije nešto“....Pa...,Neg
3,najjace su mi ove turske serije likovi voze bj...,Neg
4,jebem ti fb nemogu ništa ni lajkat -.-\n,Neg


In [7]:
# Load the saved 'lm_finetuned' data object from `wiki_data_path` with batch
# size 30. `cv_ulmfit` only reads `data_lm.vocab` from it, so the classifier
# data is built with the same vocabulary.
data_lm = load_data(wiki_data_path, 'lm_finetuned', bs=30)

In [12]:
def _fit_classifier(data_class, encoder_name, drop_mult):
    """Build an AWD_LSTM text classifier on `data_class` and run the staged fine-tuning schedule."""
    learner = text_classifier_learner(data_class, AWD_LSTM, drop_mult=drop_mult)
    learner.load_encoder(encoder_name)

    # Staged schedule: freeze() -> freeze_to(-2) -> freeze_to(-3) -> unfreeze(),
    # one cycle per stage and five cycles once everything is unfrozen. The
    # slice(lr / 2.6**4, lr) arguments spread the learning rate across layer groups.
    learner.freeze()
    learner.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
    learner.freeze_to(-2)
    learner.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2), moms=(0.8, 0.7))
    learner.freeze_to(-3)
    learner.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3), moms=(0.8, 0.7))
    learner.unfreeze()
    learner.fit_one_cycle(5, slice(1e-3/(2.6**4), 1e-3), moms=(0.8, 0.7))
    return learner


def _encode_labels(targets, classes):
    """Map each target to its index in `classes`; targets missing from `classes` become -1."""
    class_to_idx = {cls: idx for idx, cls in enumerate(classes)}
    # -1 can never equal an argmax prediction, so an unknown label is always
    # scored as an error instead of being silently treated as class 0.
    return targets.map(class_to_idx).fillna(-1).astype(int).values


def cv_ulmfit(data_lm, csv_path='experiments/labeled_distant.csv', path=None,
              n_splits=5, seed=0, bs=30, encoder_name='ft_enc_c4', drop_mult=0.5):
    """Cross-validate the ULMFiT text classifier on a labelled CSV.

    For every fold the held-out part is the test set, 10% of the remaining
    rows are split off as validation data, a classifier is trained from the
    saved encoder, and accuracy is measured on the held-out part.

    Parameters
    ----------
    data_lm : object with a `vocab` attribute
        Language-model data; only `data_lm.vocab` is used, so the classifier
        shares the language model's vocabulary.
    csv_path : str or Path
        CSV file with a `text` and a `target` column.
    path : str or Path, optional
        fastai working directory for the data bunch. Defaults to the notebook
        global `wiki_data_path` when omitted.
    n_splits : int
        Number of cross-validation folds.
    seed : int
        Random state for both the fold split and the train/validation split.
    bs : int
        Batch size of the classifier data bunch.
    encoder_name : str
        Name of the saved encoder passed to `learner.load_encoder`.
    drop_mult : float
        Dropout multiplier passed to `text_classifier_learner`.

    Returns
    -------
    list of float
        Accuracy on the held-out part of each fold, in fold order. The mean is
        also printed.
    """
    # Fall back to the notebook-level path so existing `cv_ulmfit(data_lm)` calls keep working.
    path = wiki_data_path if path is None else Path(path)

    df = pd.read_csv(csv_path)
    kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

    features = df.text.values
    labels = df.target.values
    accuracies = []
    for train_index, test_index in kf.split(features):
        df_train = pd.DataFrame({'text': features[train_index], 'target': labels[train_index]})
        df_test = pd.DataFrame({'text': features[test_index], 'target': labels[test_index]})

        # Hold out 10% of the training fold as the validation set used during fitting.
        df_train, df_valid = train_test_split(df_train, test_size=0.1, random_state=seed)
        data_class = TextClasDataBunch.from_df(path=path,
                                               train_df=df_train,
                                               valid_df=df_valid,
                                               test_df=df_test,
                                               vocab=data_lm.vocab,
                                               text_cols='text', label_cols='target', bs=bs)
        learner = _fit_classifier(data_class, encoder_name, drop_mult)

        # ordered=True asks for predictions in the original row order of `df_test`.
        probabilities = learner.get_preds(DatasetType.Test, ordered=True)[0].numpy()
        predictions = np.argmax(probabilities, axis=1)

        # Encode the true labels with the training class order so they are
        # comparable to the argmax indices.
        y_true = _encode_labels(df_test.target, data_class.train_ds.classes)
        accuracies.append(accuracy_score(y_true, predictions))
    print('ACCURACY:', sum(accuracies) / len(accuracies))
    return accuracies

        
                                
                               

In [13]:
# Run the 5-fold cross-validation. The call prints the mean accuracy and
# returns the per-fold accuracies, which are shown as the cell output.
# NOTE(review): the preview rows of the dataset still contain emoticons such as
# ":(" and "-.-"; if the distant labels were derived from such markers, the
# ~0.989 accuracy below may largely reflect label leakage — confirm.
cv_ulmfit(data_lm)

ACCURACY: 0.9887860104242879


[0.9891089108910891,
 0.9887362297314024,
 0.988983785121921,
 0.9884886743408837,
 0.9886124520361431]