In [1]:
pip install ktrain

Collecting ktrain
  Downloading ktrain-0.41.3.tar.gz (25.3 MB)
     ---------------------------------------- 25.3/25.3 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting fastprogress>=0.1.21
  Downloading fastprogress-1.0.3-py3-none-any.whl (12 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     -------------------------------------- 981.5/981.5 kB 4.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
     ---------------------------------------- 19.2/19.2 MB 4.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting syntok>1.3.3
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py): starte

In [2]:
import time
from pathlib import Path

import numpy as np
import pandas as pd

import ktrain
from ktrain import text

In [4]:
# Folder holding the train/test CSV files. This is the only machine-specific value
# in this cell: change it here instead of inside every read_csv call.
DATA_DIR = Path(r"C:\Users\mbk02\Documents\Projects\nlp-text-emotion-master\data")

# Load both labelled splits; later cells read their `Emotion` and `Text` columns.
data_train = pd.read_csv(DATA_DIR / "data_train.csv", encoding='utf-8')
data_test = pd.read_csv(DATA_DIR / "data_test.csv", encoding='utf-8')


In [6]:
# Show the first five rows of the training split as a quick sanity check.
data_train.head(n=5)

Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...


In [7]:
# Sentences of each split as plain Python lists.
X_train = data_train["Text"].tolist()
X_test = data_test["Text"].tolist()

In [8]:
# String emotion label of every row, in the same row order as X_train / X_test.
y_train = data_train["Emotion"].tolist()
y_test = data_test["Emotion"].tolist()

In [17]:
# Stack train and test into one frame, only to inspect the overall label balance.
# pd.concat is the public API; DataFrame._append is a private pandas method that may
# change or disappear without notice.
data = pd.concat([data_train, data_test], ignore_index=True)

# Label names in the order of their integer ids (see the `encoding` cell); reused by
# the preprocessing and validation cells.
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']

print('size of training set: %s' % len(data_train))
print('size of validation set: %s' % len(data_test))
print(data.Emotion.value_counts())

data.head(10)

size of training set: 7934
size of validation set: 3393
Emotion
joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: count, dtype: int64


Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...
5,sadness,When my family heard that my Mother's cousin w...
6,joy,Finding out I am chosen to collect norms for C...
7,anger,A spokesperson said : ` Glen is furious that t...
8,neutral,Yes .
9,sadness,"When I see people with burns I feel sad, actua..."


In [18]:
# Integer id assigned to each emotion label.
encoding = {
    'joy': 0,
    'sadness': 1,
    'fear': 2,
    'anger': 3,
    'neutral': 4
}

# `class_names` is passed alongside these ids to the preprocessing and validation
# cells, so position i in class_names must be the label whose id is i.
assert [encoding[name] for name in class_names] == list(range(len(class_names))), \
    "encoding ids must follow the order of class_names"

# Integer values for each class.
# The labels are read from the DataFrames instead of from y_train / y_test so the
# cell can be re-run safely: encoding the already-encoded lists a second time would
# raise KeyError.
y_train = [encoding[label] for label in data_train.Emotion]
y_test = [encoding[label] for label in data_test.Emotion]

In [19]:
# Preprocess both splits in BERT mode. The call returns the processed train pair,
# the processed test pair, and the preprocessor reused by the model and predictor.
# NOTE: y_train / y_test are rebound here from the integer lists to the values
# returned by ktrain.
train_split, test_split, preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
x_train, y_train = train_split
x_test, y_test = test_split

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [21]:
# Create the BERT text classifier from the preprocessed training data and the
# preprocessor produced alongside it.
model = text.text_classifier(
    'bert',
    train_data=(x_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 350
done.


In [28]:
# Wrap the model together with its training data; the test split doubles as the
# validation data during training.
BATCH_SIZE = 6
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=BATCH_SIZE,
)

In [30]:
# Train with the one-cycle policy: maximum learning rate 2e-5, two epochs.
MAX_LR = 2e-5
EPOCHS = 2
learner.fit_onecycle(MAX_LR, EPOCHS)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x169147bb370>

In [31]:
# Print the per-class precision / recall / F1 report for the test split and return
# its confusion matrix (rows and columns follow the order of class_names).
learner.validate(
    val_data=(x_test, y_test),
    class_names=class_names,
)

              precision    recall  f1-score   support

         joy       0.85      0.86      0.85       707
     sadness       0.80      0.82      0.81       676
        fear       0.86      0.85      0.86       679
       anger       0.79      0.78      0.79       693
     neutral       0.82      0.81      0.81       638

    accuracy                           0.82      3393
   macro avg       0.82      0.82      0.82      3393
weighted avg       0.82      0.82      0.82      3393



array([[607,  16,  16,  13,  55],
       [ 14, 556,  32,  55,  19],
       [ 17,  30, 578,  41,  13],
       [ 25,  64,  33, 541,  30],
       [ 52,  27,  10,  33, 516]], dtype=int64)

In [32]:
# Pair the trained model with its preprocessor so raw text can be classified, then
# list the class labels the predictor knows.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc)
predictor.get_classes()

['joy', 'sadness', 'fear', 'anger', 'neutral']

In [53]:
message = 'Awesome sauce'

# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
prediction = predictor.predict(message)
elapsed = time.perf_counter() - start_time

# The number in parentheses is the prediction time in seconds, not a confidence.
print('predicted: {} ({:.2f})'.format(prediction, elapsed))

predicted: neutral (0.34)


In [37]:
# Save the predictor to disk for later use.
# NOTE(review): this path goes through "Documents\11 Projects" while the CSV files are
# read from "Documents\Projects" - confirm which root folder is the intended one.
predictor_dir = r"C:\Users\mbk02\Documents\11 Projects\nlp-text-emotion-master\models\bert1"
predictor.save(predictor_dir)