# **Install ktrain and tensorflow gpu**

In [None]:
!pip install ktrain

In [None]:
!pip install tensorflow-gpu

# **Import packages**

In [19]:
import pandas as pd
import ktrain
from ktrain import text
from sklearn.model_selection import train_test_split


In [4]:
import tensorflow as tf
from tensorflow import keras


# **Read dataset as dataframe**

In [5]:
# Load the cleaned training CSV into a DataFrame.
# The path lives under /content/drive (presumably a mounted Google Drive — confirm).
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/goemotions_aug_dairai_train_cleaned.csv',
)

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,anger,augmented,cleaned_processed,datasource,fear,joy,sadness
0,206509,0.0,0,i feel that some korea guy are handsome and so...,dairai,0.0,1.0,0.0
1,375242,0.0,0,i put my pen to paper and made a list of thing...,dairai,0.0,0.0,0.0
2,166570,1.0,0,i wish i only had to feel the pain of the pett...,dairai,0.0,0.0,0.0
3,200580,0.0,0,i feel passionate about this journey and stand...,dairai,0.0,1.0,0.0
4,300766,0.0,0,i feel like i have convinced myself of these f...,dairai,0.0,1.0,0.0


In [7]:
# Names of the emotion columns; used below both to select the target columns
# and as the class names handed to the model.
label = [
    'anger',
    'fear',
    'joy',
    'sadness',
]

In [8]:
# Expose the cleaned text under the shorter column name 'text'.
df = df.rename({'cleaned_processed': 'text'}, axis='columns')

# **Split the dataset into training and validation sets**

In [9]:
# Hold out 33% of the rows; the fixed random_state keeps the split repeatable.
train, test = train_test_split(
    df,
    random_state=42,
    test_size=0.33,
)

Plot the label distribution of each split; this reveals a problem (the label counts below are noticeably imbalanced).

In [10]:
from matplotlib import pyplot as plt
import seaborn as sns

In [11]:
# Per-label totals for each split: column-wise sums over the label columns.
cnt_trn = train.loc[:, label].sum(axis='index')
cnt_val = test.loc[:, label].sum(axis='index')

In [12]:
# Display the per-label totals for the training split.
cnt_trn

anger      41946.0
fear       27858.0
joy        97480.0
sadness    74704.0
dtype: float64

In [13]:
# Display the per-label totals for the held-out split.
cnt_val

anger      20630.0
fear       13822.0
joy        48163.0
sadness    36534.0
dtype: float64

In [14]:
# Model inputs for training: the text column of the training split.
train_x = train.loc[:, 'text']

In [15]:
# Training targets: one column per emotion label.
train_y = train.loc[:, label]

In [16]:
# Model inputs for validation: the text column of the held-out split.
test_x = test.loc[:, 'text']

In [17]:
# Validation targets: one column per emotion label.
test_y = test.loc[:, label]

# **get the model: albert-base-v2**


1.   Use `text.Transformer` to create the transformer preprocessor instance.
2.   Preprocess the train and test datasets; this automatically encodes the text and detects whether the task is multi-label. The resulting `trn` and `val` are formatted to fit the model you specified.
3.   Get the classifier (the actual model).
4.   Wrap it in a ktrain learner.
5.   Use the ktrain learner to train the model.



In [None]:
# Name of the pretrained checkpoint passed to text.Transformer.
MODEL_NAME = "albert-base-v2"

In [None]:
# Create the ktrain Transformer preprocessor for the chosen checkpoint, with
# the emotion labels as class names. maxlen=100 sits above the 99th-percentile
# sequence length (52) that preprocessing reports for this data.
t = text.Transformer(
    MODEL_NAME,
    class_names=label,
    maxlen=100,
)

In [None]:
# Encode the training texts and label matrix (passed as NumPy arrays) into
# the dataset the learner trains on.
trn = t.preprocess_train(train_x.to_numpy(), train_y.to_numpy())

preprocessing train...
language: en
train sequence lengths:
	mean : 19
	95percentile : 39
	99percentile : 52


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




Is Multi-Label? True


In [None]:
# Encode the held-out texts and labels with the same preprocessor; the result
# is used as validation data.
val = t.preprocess_test(test_x.to_numpy(), test_y.to_numpy())

preprocessing test...
language: en
test sequence lengths:
	mean : 19
	95percentile : 39
	99percentile : 52


In [None]:
# Obtain the classification model from the Transformer preprocessor.
model = t.get_classifier()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=63048440.0, style=ProgressStyle(descrip…




In [None]:
# TensorBoard callback logging to ./logs/, with graph and image writing enabled.
tbCallBack = keras.callbacks.TensorBoard(
    log_dir='./logs/',
    write_images=True,
    write_graph=True,
)

In [None]:
# Bundle the model with its training and validation data into a ktrain learner.
learner = ktrain.get_learner(
    model,
    batch_size=64,
    val_data=val,
    train_data=trn,
)

In [None]:
# Folder passed to fit_onecycle as its checkpoint_folder.
checkpoint_path = '/content/drive/My Drive/checkpoint'

In [None]:
# Train with the one-cycle policy: max learning rate 3e-5 for 5 epochs, with
# the TensorBoard callback attached and checkpoint_folder set to checkpoint_path.
learner.fit_onecycle(
    3e-5,
    5,
    callbacks=[tbCallBack],
    checkpoint_folder=checkpoint_path,
)



begin training using onecycle policy with max lr of 3e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8e54775ba8>

In [None]:
# Print the layer-by-layer summary and parameter counts of the trained model.
model.summary()

Model: "tf_albert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
albert (TFAlbertMainLayer)   multiple                  11683584  
_________________________________________________________________
dropout_9 (Dropout)          multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  3076      
Total params: 11,686,660
Trainable params: 11,686,660
Non-trainable params: 0
_________________________________________________________________


# **Save Model as a predictor**

In [None]:
# Pair the trained model with its preprocessor so raw strings can be scored.
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
# Smoke-test the new predictor on a short sample; the result lists one score per label.
predictor.predict('love surface love lavender')

[('anger', 0.0005517614),
 ('fear', 0.00035501752),
 ('joy', 0.9985045),
 ('sadness', 0.00051341474)]

In [None]:
# Persist the predictor to this folder so it can be reloaded with ktrain.load_predictor.
predictor.save('/content/drive/My Drive/albert_model_on_full_goemotions')

# **load model and predict**
If you want to train it again, use `predictor.model` to get the model, wrap it in a ktrain learner, and continue the training process.

In [None]:
# Reload the predictor from the folder it was saved to above.
p = ktrain.load_predictor('/content/drive/My Drive/albert_model_on_full_goemotions')

In [None]:
# Query the reloaded predictor; the output lists a score for each of the four labels.
p.predict('how are you BERT, why are you so training so slow')

[('anger', 0.18881772),
 ('fear', 0.00010721698),
 ('joy', 0.0005183956),
 ('sadness', 0.015007874)]

In [None]:
# Second sample sentence scored by the reloaded predictor.
p.predict('bert is so slow, it annoys me a lot')

[('anger', 0.99906665),
 ('fear', 0.00021670577),
 ('joy', 0.0006373792),
 ('sadness', 0.0020875263)]

In [None]:
# Third sample sentence scored by the reloaded predictor.
p.predict('you are always late, never do that again')

[('anger', 0.8747331),
 ('fear', 0.0018969218),
 ('joy', 0.005476443),
 ('sadness', 0.07453093)]

In [None]:
# Fourth sample sentence scored by the reloaded predictor.
p.predict('I will be to school tomorrow')

[('anger', 0.0058799037),
 ('fear', 0.0008445064),
 ('joy', 0.6001978),
 ('sadness', 0.0008360504)]

In [None]:
# Fifth sample: a longer, formal sentence scored by the reloaded predictor.
p.predict('Unfortunately, the hiring team has decided not to move forward with your candidacy at this time.')

[('anger', 0.1102086),
 ('fear', 0.00011944358),
 ('joy', 0.005409254),
 ('sadness', 0.018487522)]

In [None]:
# Keep the prediction in a variable so its type can be inspected in the next cell.
res = p.predict('amazon is holding its prime day! lets go shopping')

In [None]:
# Check the Python type of the value returned by predict.
type(res)

list

# **load the model and continue the training process**

In [21]:
# Reload the saved predictor; its model (p.model) is the starting point for further training.
p = ktrain.load_predictor('/content/drive/My Drive/albert_model_on_full_goemotions')

In [22]:
# Rebuild the preprocessor and the encoded training set for the resumed run.
MODEL_NAME = "albert-base-v2"
t = text.Transformer(MODEL_NAME, class_names=label, maxlen=100)
trn = t.preprocess_train(train_x.to_numpy(), train_y.to_numpy())
# Wrap the reloaded model in a learner; no validation data is supplied here.
learner = ktrain.get_learner(p.model, batch_size=32, train_data=trn)
# Checkpoints for the resumed run go to a separate folder.
checkpoint_path = '/content/drive/My Drive/checkpoint_albert_10_27'


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…


preprocessing train...
language: en
train sequence lengths:
	mean : 19
	95percentile : 39
	99percentile : 52


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




Is Multi-Label? True


In [31]:
# Continue training from the predictor reloaded from disk (`p`). The previous
# code wrapped `predictor.model`, but `predictor` is not defined anywhere in
# this "load and continue" section; it exists only if the original training
# cells were run in the same kernel session, which makes this cell depend on
# hidden state.
learner = ktrain.get_learner(p.model, train_data=trn, batch_size=64)

In [32]:
# One further one-cycle epoch at max learning rate 3e-5, with checkpoint_folder
# set to checkpoint_path.
learner.fit_onecycle(3e-5, 1, checkpoint_folder=checkpoint_path)



begin training using onecycle policy with max lr of 3e-05...


<tensorflow.python.keras.callbacks.History at 0x7f217071e400>

In [33]:
# Wrap the further-trained model with the preprocessor in a new predictor and
# save it to a separate folder (suffix _10_27), leaving the earlier save untouched.
predictor = ktrain.get_predictor(learner.model, preproc=t)
predictor.save('/content/drive/My Drive/albert_model_on_full_goemotions_10_27')

In [25]:
# Re-inspect the frame; the text column now appears under its renamed header 'text'.
df.head()

Unnamed: 0.1,Unnamed: 0,anger,augmented,text,datasource,fear,joy,sadness
0,206509,0.0,0,i feel that some korea guy are handsome and so...,dairai,0.0,1.0,0.0
1,375242,0.0,0,i put my pen to paper and made a list of thing...,dairai,0.0,0.0,0.0
2,166570,1.0,0,i wish i only had to feel the pain of the pett...,dairai,0.0,0.0,0.0
3,200580,0.0,0,i feel passionate about this journey and stand...,dairai,0.0,1.0,0.0
4,300766,0.0,0,i feel like i have convinced myself of these f...,dairai,0.0,1.0,0.0


In [28]:
# Summary statistics for the numeric columns of the full dataset.
df.describe()

Unnamed: 0.1,Unnamed: 0,anger,augmented,fear,joy,sadness
count,460551.0,460551.0,460551.0,460551.0,460551.0,460551.0
mean,177455.757028,0.135872,0.177524,0.0905,0.316236,0.241532
std,131310.713725,0.342653,0.382112,0.286897,0.465007,0.428013
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,44764.0,0.0,0.0,0.0,0.0,0.0
50%,165070.0,0.0,0.0,0.0,0.0,0.0
75%,293040.5,0.0,0.0,0.0,1.0,0.0
max,420993.0,1.0,1.0,1.0,1.0,1.0


In [30]:
# The same summary for the training split, for comparison with the full dataset.
train.describe()

Unnamed: 0.1,Unnamed: 0,anger,augmented,fear,joy,sadness
count,308569.0,308569.0,308569.0,308569.0,308569.0,308569.0
mean,177484.467782,0.135937,0.176826,0.090281,0.31591,0.242098
std,131233.173939,0.342722,0.381522,0.286585,0.464878,0.428354
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,44817.0,0.0,0.0,0.0,0.0,0.0
50%,165245.0,0.0,0.0,0.0,0.0,0.0
75%,292711.0,0.0,0.0,0.0,1.0,0.0
max,420992.0,1.0,1.0,1.0,1.0,1.0
