In [1]:
import ktrain
from ktrain import text
import pandas as pd
import random
import numpy as np
import math

In [2]:
csv_file = '../../data/merged_ktrain_google_en_four.csv'
data = pd.read_csv(csv_file).values
print(len(data))

21589


In [3]:
epochs = 4
learning_rate = 5e-5
batch_size = 32
max_length = 21
max_words = 25000

In [4]:
def split_test_data(data, split=0.1, random_seed=42):
    np.random.seed(random_seed)
    np.random.shuffle(data)
    split_item = math.floor(split * len(data))
    print('split at: ', split_item)
    x_test, y_test = data[:split_item, 0], data[:split_item, 1:]
    x_train, y_train = data[split_item:, 0], data[split_item:, 1:]
    return x_train, y_train, x_test, y_test

In [5]:
x_train, y_train, x_val, y_val = split_test_data(data, split=0.05, random_seed=4242)
print(len(x_train), len(y_train), len(x_val), len(y_val))

split at:  1079
20510 20510 1079 1079


In [6]:
from sklearn.utils import class_weight
def generate_balanced_weights(y_train):
    y_labels = [y.argmax() for y in y_train]
    class_weights = class_weight.compute_class_weight('balanced', np.unique(y_labels), y_labels)
    weight_dict = {}
    for key in range(len(class_weights)):
        weight_dict[key] = class_weights[key]
    return weight_dict

class_weight_dict = generate_balanced_weights(y_train)
print(class_weight_dict)

{0: 2.1765891966465034, 1: 0.49869915140905, 2: 1.8679417122040072}


In [7]:
MODEL = 'distilbert-base-uncased'
transformer = text.Transformer(MODEL, maxlen=max_length, class_names=['less', 'equal', 'more'])
train_data = transformer.preprocess_train(x_train, y_train)
val_data = transformer.preprocess_test(x_val, y_val)

preprocessing train...
language: en
train sequence lengths:
	mean : 9
	95percentile : 15
	99percentile : 18
Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 9
	95percentile : 15
	99percentile : 19


In [8]:
model = transformer.get_classifier()

In [9]:
learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data, batch_size=batch_size)

In [12]:
learner.lr_find(show_plot=True, max_epochs=2)

simulating training for different learning rates... this may take a few moments...
Epoch 1/2
103/640 [===>..........................] - ETA: 17:09 - loss: 1.0913 - accuracy: 0.3796

In [10]:
learner.fit_onecycle(5e-5, epochs=1, class_weight=class_weight_dict)



begin training using onecycle policy with max lr of 5e-05...


<tensorflow.python.keras.callbacks.History at 0x7feabd246280>

In [13]:
learner.view_top_losses(n=10, preproc=transformer)

----------
id:382 | loss:1.23 | true:equal | pred:more)

----------
id:454 | loss:1.23 | true:equal | pred:more)

----------
id:109 | loss:1.22 | true:equal | pred:more)

----------
id:608 | loss:1.22 | true:equal | pred:more)

----------
id:68 | loss:1.22 | true:equal | pred:more)

----------
id:720 | loss:1.22 | true:equal | pred:more)

----------
id:385 | loss:1.22 | true:equal | pred:more)

----------
id:520 | loss:1.22 | true:equal | pred:more)

----------
id:512 | loss:1.22 | true:equal | pred:more)

----------
id:761 | loss:1.21 | true:equal | pred:more)



In [14]:
predictor = ktrain.get_predictor(learner.model, preproc=transformer)

In [15]:
predictor.explain(x_train[741])



In [16]:
confusion = learner.evaluate()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       156
           1       0.68      0.44      0.53       727
           2       0.18      0.57      0.28       196

    accuracy                           0.40      1079
   macro avg       0.29      0.34      0.27      1079
weighted avg       0.49      0.40      0.41      1079



  precision = _prf_divide(tp_sum, pred_sum,


In [None]:
# print confusion matrix
import matplotlib.pyplot as plt
import seaborn as sn
labels = ['less', 'equal', 'more']
cm_df = pd.DataFrame(confusion, labels, labels)
sn.set(font_scale=1.1, font='Arial')
ax = sn.heatmap(cm_df, cmap="Blues", annot=True, annot_kws={"size": 11}, cbar=False)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Confusion Matrix")
plt.show()