### Load libraries

In [1]:
import os
import numpy as np
import tensorflow as tf
import ktrain
from ktrain import text
import pandas as pd

### Import data from pickles

In [2]:
# Relative locations of the pickled train/test DataFrames
# (resolved against the notebook's working directory).
TRAIN_PICKLE = 'pickles/train_df.pkl'
TEST_PICKLE = 'pickles/test_df.pkl'

# NOTE: unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
train_df = pd.read_pickle(TRAIN_PICKLE)
test_df = pd.read_pickle(TEST_PICKLE)

### Preprocess train and test sets

In [3]:
# Preprocess both DataFrames for BERT in a single call.
#   - text is read from the 'corpus' column, labels from the 'category' column
#   - test_df is supplied as the validation set
#   - maxlen is set to 500 (units presumably tokens -- confirm against the ktrain docs)
# The returned `preproc` object is reused below to build the model and to
# recover the class names at evaluation time.
bert_preprocessing = {
    'text_column': 'corpus',
    'label_columns': ['category'],
    'maxlen': 500,
    'preprocess_mode': 'bert',
}

(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=train_df,
    val_df=test_df,
    **bert_preprocessing
)

['Automotive', 'Computers', 'Politics', 'Religion', 'Science', 'Sports']
   Automotive  Computers  Politics  Religion  Science  Sports
0         1.0        0.0       0.0       0.0      0.0     0.0
1         1.0        0.0       0.0       0.0      0.0     0.0
2         1.0        0.0       0.0       0.0      0.0     0.0
3         1.0        0.0       0.0       0.0      0.0     0.0
4         1.0        0.0       0.0       0.0      0.0     0.0
['Automotive', 'Computers', 'Politics', 'Religion', 'Science', 'Sports']
   Automotive  Computers  Politics  Religion  Science  Sports
0         1.0        0.0       0.0       0.0      0.0     0.0
1         1.0        0.0       0.0       0.0      0.0     0.0
2         1.0        0.0       0.0       0.0      0.0     0.0
3         1.0        0.0       0.0       0.0      0.0     0.0
4         1.0        0.0       0.0       0.0      0.0     0.0
preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


### Create model

In [4]:
# Build a BERT text classifier from the preprocessed training data; the same
# `preproc` instance returned by the preprocessing step is passed along.
training_pair = (X_train, y_train)
model = text.text_classifier(
    name='bert',
    train_data=training_pair,
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 500
done.


### Train model

In [5]:
# Wrap the model together with its training and validation data in a ktrain learner.
# NOTE(review): batch_size=2 is very small -- presumably chosen so BERT fits in
# memory at maxlen=500; confirm before changing.
fit_data = (X_train, y_train)
holdout_data = (X_test, y_test)
learner = ktrain.get_learner(
    model=model,
    train_data=fit_data,
    val_data=holdout_data,
    batch_size=2,
)

In [6]:
# Train with the one-cycle learning-rate policy.
max_learning_rate = 2e-5  # peak learning rate handed to the one-cycle schedule
num_epochs = 5

# Kept as the last expression so the returned History object is shown as cell output.
learner.fit_onecycle(lr=max_learning_rate, epochs=num_epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x13edf77c760>

### Evaluate model

In [7]:
# Score the trained model on the held-out set, labelling the report with the
# class names recorded by the preprocessor.
# Kept as the last expression so the returned confusion matrix is displayed.
class_labels = preproc.get_classes()
learner.evaluate((X_test, y_test), class_names=class_labels)

              precision    recall  f1-score   support

  Automotive       0.97      0.89      0.93       794
   Computers       0.94      0.94      0.94      1955
    Politics       0.89      0.87      0.88      1050
    Religion       0.88      0.89      0.88       968
     Science       0.86      0.89      0.87      1579
      Sports       0.97      0.97      0.97       796

    accuracy                           0.91      7142
   macro avg       0.92      0.91      0.91      7142
weighted avg       0.91      0.91      0.91      7142



array([[ 705,   11,   15,    4,   53,    6],
       [   0, 1844,    7,    5,   97,    2],
       [   6,    8,  911,   81,   41,    3],
       [   2,    6,   51,  862,   39,    8],
       [   7,   95,   36,   27, 1406,    8],
       [   5,    6,    2,    2,    6,  775]], dtype=int64)