### Dataset
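Download the data from: https://github.com/ngxbac/aivivn_phanloaisacthaibinhluan and place `train.csv` and `test.csv` under `./data/sentiment analysis/`, which is where the cells below read them from.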

In [1]:
from denver.data.data_source import DenverDataSource
from denver.trainers.language_model_trainer import LanguageModelTrainer
from denver.models.ulmfit_cls import ULMFITClassifier
from denver.trainers.trainer import ModelTrainer

from denver.utils.utils import split_data

2020-08-21 09:40:43,017 	DENVER!


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd

# Column names shared by the split and by the data-source construction below.
TEXT_COL = 'comment'
LABEL_COL = 'label'

# Labelled CSV; both the train and the test frame are carved out of this file.
data_path = './data/sentiment analysis/train.csv'
df = pd.read_csv(data_path, encoding='utf-8')

# pct=0.15 is forwarded to split_data as the split fraction
# (presumably the share held out as test_df -- confirm against the denver docs).
train_df, test_df = split_data(
    data=df,
    pct=0.15,
    text_cols=TEXT_COL,
    label_cols=LABEL_COL,
)

# Wrap both frames in the data-source object consumed by the trainers below.
data_source = DenverDataSource.from_df(
    train_df=train_df,
    test_df=test_df,
    text_cols=TEXT_COL,
    label_cols=LABEL_COL,
    is_normalize=True,
)

In [4]:
## Fine-tune the language model on the training split

# Hyper-parameters gathered in one place so they are easy to tweak.
lm_hparams = dict(
    batch_size=128,
    num_epochs=10,
    learning_rate=1e-3,
    moms=[0.8, 0.7],
    drop_mult=0.5,
)

# pretrain='wiki' selects the starting weights
# (presumably a Wikipedia-pretrained language model -- confirm in the denver docs).
lm_trainer = LanguageModelTrainer(pretrain='wiki')
lm_trainer.fine_tuning_from_df(data_df=data_source.train.data, **lm_hparams)

2020-08-21 09:41:04,368 	Fine-tuning...


epoch,train_loss,valid_loss,accuracy,time
0,5.239437,4.303212,0.288051,00:07
1,4.539239,4.122909,0.300486,00:07


epoch,train_loss,valid_loss,accuracy,time
0,4.233505,4.025914,0.309753,00:08
1,4.122124,3.863956,0.32437,00:08
2,3.99055,3.716242,0.337661,00:08
3,3.873825,3.591134,0.349035,00:08
4,3.762586,3.498703,0.358039,00:08
5,3.677375,3.442542,0.363436,00:08
6,3.607378,3.409115,0.366688,00:08
7,3.582685,3.404857,0.36701,00:08


In [5]:
# Directory and file name passed to the trainer for saving the best model.
model_dir = 'models/'
save_file = 'denver-cls_sentiment.pkl'

# Classifier in training mode, built on the data source prepared earlier.
model = ULMFITClassifier(data_source=data_source, mode='training')
trainer = ModelTrainer(types='class', model=model)

# Training settings kept together in one mapping and unpacked into train().
train_kwargs = dict(
    model_dir=model_dir,
    save_best_model=save_file,
    learning_rate=2e-2,
    batch_size=128,
    num_epochs=14,
)
trainer.train(**train_kwargs)

2020-08-21 09:43:04,752 	Training...


epoch,train_loss,valid_loss,accuracy,f_beta,precision,recall,time
0,0.368378,0.348543,0.865033,0.865953,0.877758,0.865033,00:07
1,0.354093,0.28002,0.888076,0.888482,0.890063,0.888076,00:07
2,0.302748,0.276439,0.886613,0.887247,0.891776,0.886613,00:07


epoch,train_loss,valid_loss,accuracy,f_beta,precision,recall,time
0,0.325,0.306652,0.872348,0.872477,0.872675,0.872348,00:07
1,0.332467,0.286793,0.875274,0.874352,0.875733,0.875274,00:07
2,0.309183,0.276581,0.889173,0.889842,0.895656,0.889173,00:07


epoch,train_loss,valid_loss,accuracy,f_beta,precision,recall,time
0,0.317142,0.256787,0.897952,0.898335,0.900009,0.897952,00:06


epoch,train_loss,valid_loss,accuracy,f_beta,precision,recall,time
0,0.279005,0.243287,0.901975,0.902363,0.904231,0.901975,00:07


epoch,train_loss,valid_loss,accuracy,f_beta,precision,recall,time
0,0.216033,0.246415,0.901609,0.901995,0.903822,0.901609,00:07
1,0.201051,0.251228,0.901609,0.901937,0.903232,0.901609,00:08
2,0.175887,0.283814,0.900878,0.901304,0.903567,0.900878,00:07
3,0.136547,0.311552,0.895026,0.895347,0.896454,0.895026,00:08
4,0.105959,0.332744,0.893197,0.893529,0.894673,0.893197,00:07
5,0.098058,0.332971,0.89466,0.894997,0.896204,0.89466,00:07


2020-08-21 09:45:00,130 	Save the model !
2020-08-21 09:45:08,121 	Path to saved model: /workspace/nlp_training/phucpx/tutorial/denver/models/denver-cls_sentiment.pkl


2020-08-21 09:45:08,723 	EVALUATE VALID:
+----------------------+----------------------+----------------------+----------------------+
| Accuracy             |  F1-score            |  Precission          |  Recall              |
|----------------------+----------------------+----------------------+----------------------|
| 0.8942940831184387   | 0.8946366310119629   | 0.8958784341812134   | 0.8942941427230835   |




### Test

- 0 - Tích cực (positive)
- 1 - Tiêu cực (negative)

In [9]:
from pprint import pprint

# One raw review, emojis included, used as the smoke-test input.
text = 'Tệ😡 Sản phẩm đứt chỉ tùm lum😡 Rách quá trời chỗ  hàng lũng😡 Shop phục vụ quá tệ😡 Lần cuối mua shop😡'

# Predicted label together with the per-class confidences.
pred = model.process(sample=text)
pprint(pred)

# Uncertainty score for the same sample; n_passes is forwarded as n_times.
n_passes = 10
uncertainty_score = model.get_uncertainty_score(sample=text, n_times=n_passes)
pprint(uncertainty_score)

{'confidence': [{'confidence': 0.03254649043083191, 'name': 0},
                {'confidence': 0.9674534797668457, 'name': 1}],
 'intent': {'confidence': 0.9674534797668457, 'name': 1}}
{'intent': 1,
 'method': 'entropy',
 'text': 'tệ sản phẩm đứt chỉ tùm lum rách quá trời chỗ hàng lũng shop phục vụ '
         'quá tệ lần cuối mua shop ',
 'uncertainty_score': 0.33033305406570435}


### Get predictions for TEST SET

In [10]:
# Unlabelled competition file to run batch prediction on.
test_csv = './data/sentiment analysis/test.csv'

data_df = model.predict_batch_on_df(
    data=test_csv,
    text_cols='comment',
    is_normalize=True,
)

# Persist the returned frame without the index column.
data_df.to_csv('test_preds.csv', encoding='utf-8', index=False)

2020-08-21 09:45:30,370 	Get-prediction...
