**Please install ktrain on Google Colab**:
`pip install ktrain`

In [None]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.39.0.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from ktrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting k

In [None]:
#we import pandas as well as ktrain
import pandas as pd
import numpy as np

import ktrain
from ktrain import text

In [None]:
#we also have to prepare train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Re-load the preprocessed dataframe.
# The first CSV column appears to be the row index that was written out when the
# file was saved; read it back as the index so it does not show up as a spurious
# "Unnamed: 0" data column.
df = pd.read_csv("pp_df.csv", index_col=0)

In [None]:
# Report the row count before and after discarding rows with missing values.
rows_before = len(df)
df = df.dropna()
print(rows_before)
print(len(df))

2284
2284


In [None]:
# Peek at the first five rows to sanity-check the columns.
df.head(5)

Unnamed: 0,label,sentence
0,NAME,new piecegoods bazar co limited bombay v commi...
1,CITATION,equivalent citation 1950 all india reporter 16...
2,CITATION,appeal number 66 1949
3,STATUTE,appeal high court judicature bombay reference ...
4,COUNSEL,km munshi np nathvani appel lant mc setalvad a...


In [None]:
# Re-create the model inputs: the sentences are the features (X) and the
# labels are the target (Y); both are handed to train_test_split next.
X, Y = df["sentence"], df["label"]

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Convert the train/test split objects into plain Python lists (of strings).
# list(...) accepts both a pandas Series and an existing list, so this cell can
# be re-run safely; `.values.tolist()` raised AttributeError on a second run
# because the names had already been rebound to lists, which have no `.values`.
X_train = list(X_train)
X_test = list(X_test)

y_train = list(y_train)
y_test = list(y_test)

# Show how many sentences each label has in the full dataset.
print("classes to predict")
print(Y.value_counts())

classes to predict
STATUTE      240
COUNSEL      240
RLC          240
ARG          240
FACTS        240
REASONING    240
PRECEDENT    240
RPC          238
JUDGE        128
CITATION     120
NAME         118
Name: label, dtype: int64


In [None]:
# Label -> integer id mapping, recalled from the previous notebook.
# A label's position in the list below is its integer id.
encoding = {
    label: label_id
    for label_id, label in enumerate([
        'REASONING',
        'FACTS',
        'PRECEDENT',
        'RPC',
        'STATUTE',
        'RLC',
        'COUNSEL',
        'ARG',
        'JUDGE',
        'CITATION',
        'NAME',
    ])
}

# Replace every string label with its integer id.
# A label missing from `encoding` raises KeyError.
y_train = list(map(encoding.__getitem__, y_train))
y_test = list(map(encoding.__getitem__, y_test))

### BERT Specific Preprocessing
* The text must be preprocessed in a specific way for use with BERT. This is accomplished by setting `preprocess_mode` to `'bert'`. The pre-trained BERT model and vocabulary will be downloaded automatically.

* BERT can handle a maximum sequence length of 512 tokens, but let's use less (`maxlen=400`) to reduce memory use and improve speed.

In [None]:
# Derive the class names from `encoding`, ordered by integer id, instead of
# repeating the labels by hand, so the list cannot drift out of step with the
# ids used to encode y_train / y_test.
class_names = sorted(encoding, key=encoding.get)

# Index i of class_names must be the label that is encoded as integer i.
assert [encoding[name] for name in class_names] == list(range(len(class_names)))

In [None]:
# Run ktrain's BERT preprocessing over the raw texts and integer labels.
# Note: y_train / y_test are rebound here to the preprocessed outputs, and
# `preproc` is kept for building the model and the predictor later on.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=400,
    max_features=100000,
)

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


### Training and Validation of BERT on Legal Judgment Sentences

In [None]:
# Build the BERT text classifier from the preprocessed training data.
model = text.text_classifier(
    'bert',
    train_data=(x_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 400




done.


In [None]:
# Wrap the model together with its training data in a ktrain learner.
# The held-out test split doubles as the validation data.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=12,
)

In [None]:
# #This is like GridSearch but for BERT. Using this, we can tune the learning rate.
# learner.lr_find()

In [None]:
# #This in turn lets us plot a chart of loss vs learning rate - and we essentially look for the region where the loss is still falling, just before the bottom of the curve.
# #This is similar to "elbowing"
# learner.lr_plot()

In [None]:
# Train with the one-cycle policy: maximum learning rate 2e-5, 5 epochs.
learner.fit_onecycle(lr=2e-5, epochs=5)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78556008d480>

## Validating the Model and Predicting an Unseen Sentence

Let's first evaluate BERT on the held-out test set, and then test drive it on an unseen sentence (a case title) taken from elsewhere to see if it correctly predicts it as `NAME`.

In [None]:
# Per-class precision/recall/F1 report and confusion matrix on the held-out split.
learner.validate(
    val_data=(x_test, y_test),
    class_names=class_names,
)

              precision    recall  f1-score   support

   REASONING       0.32      0.30      0.31        87
       FACTS       0.55      0.61      0.58        70
   PRECEDENT       0.88      0.57      0.69        75
         RPC       0.53      0.65      0.58        65
     STATUTE       0.79      0.78      0.79        82
         RLC       0.62      0.70      0.66        74
     COUNSEL       0.92      0.98      0.95        84
         ARG       0.64      0.63      0.63        86
       JUDGE       0.98      0.98      0.98        46
    CITATION       1.00      0.87      0.93        46
        NAME       0.95      1.00      0.97        39

    accuracy                           0.70       754
   macro avg       0.74      0.73      0.73       754
weighted avg       0.71      0.70      0.70       754



array([[26, 11,  4, 12,  7,  7,  2, 17,  1,  0,  0],
       [ 7, 43,  0,  5,  2, 10,  0,  3,  0,  0,  0],
       [10,  5, 43,  6,  3,  2,  3,  3,  0,  0,  0],
       [ 8,  2,  0, 42,  1,  5,  2,  5,  0,  0,  0],
       [ 8,  3,  0,  1, 64,  3,  0,  3,  0,  0,  0],
       [ 8,  6,  0,  8,  0, 52,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  1,  0,  0, 82,  0,  0,  0,  0],
       [15,  5,  1,  3,  4,  4,  0, 54,  0,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  0, 45,  0,  0],
       [ 0,  3,  0,  0,  0,  1,  0,  0,  0, 40,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 39]])

In [None]:
# Bundle the trained model with its preprocessor into a predictor.
predictor = ktrain.get_predictor(
    learner.model,
    preproc,
)
# Show the class labels the predictor can return.
predictor.get_classes()

['REASONING',
 'FACTS',
 'PRECEDENT',
 'RPC',
 'STATUTE',
 'RLC',
 'COUNSEL',
 'ARG',
 'JUDGE',
 'CITATION',
 'NAME']

In [None]:
# A case title that is not taken from the loaded dataset, used as a smoke test.
unseen_clause = (
    "Mangan Lal Deoshi vs Mohammad Moinul Haoque & Others "
    "on 1 December, 1950"
)
unseen_clause

'Mangan Lal Deoshi vs Mohammad Moinul Haoque & Others on 1 December, 1950'

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# intervals; time.time() is wall-clock time and can jump if the system clock is
# adjusted while the prediction runs.
start_time = time.perf_counter()
prediction = predictor.predict(unseen_clause)
elapsed = time.perf_counter() - start_time  # stop the clock before formatting/printing

# Prints the predicted label followed by the elapsed seconds in parentheses.
print('predicted: {} ({:.2f})'.format(prediction, elapsed))

predicted: NAME (0.16)
