# This notebook shows how to fine-tune a Transformer model on the training data and run predictions on the validation data. It assumes that requirements.txt and the EULA training and validation sets are in the same folder as the notebook.

The first step is to install the required packages listed in requirements.txt.


In [3]:
# Install the packages listed in requirements.txt (the log below shows version-pinned entries such as boto3==1.14.31).
!pip install -r requirements.txt

Collecting boto3==1.14.31
[?25l  Downloading https://files.pythonhosted.org/packages/bd/83/22bc643490012047408bfeec8422c79ba54ecc089e70c946cf1686e15084/boto3-1.14.31-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 1.4MB/s 
[?25hCollecting botocore==1.17.31
[?25l  Downloading https://files.pythonhosted.org/packages/ef/09/ad453cb97d14ba9434a863dbd12243e891fb13e22259fa9d30a904093fab/botocore-1.17.31-py2.py3-none-any.whl (6.4MB)
[K     |████████████████████████████████| 6.5MB 3.6MB/s 
Collecting cchardet==2.1.5
[?25l  Downloading https://files.pythonhosted.org/packages/fa/4e/847feebfc3e71c773b23ee06c74687b8c50a5a6d6aaff452a0a4f4eb9a32/cchardet-2.1.5-cp36-cp36m-manylinux1_x86_64.whl (241kB)
[K     |████████████████████████████████| 245kB 20.8MB/s 
Collecting en-core-web-sm-mirror==2.2.5
[?25l  Downloading https://files.pythonhosted.org/packages/26/25/7a59d00a31ecf02a81601b7022cfacee9dbe74ac7307c20ef5de6b58c575/en_core_web_sm_mirror-2.2.5-py3-none-any.whl

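Then import the pandas library, required to read the CSV, and scikit-learn's `train_test_split`, used to split the data into training and test sets (the scikit-learn metrics used to measure performance are imported further down).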

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

Read the CSV 

In [5]:
# Read the EULA training-set CSV into a DataFrame.
training_df = pd.read_csv(filepath_or_buffer='EULA_Training_Data_Set_1_v1.csv')

In [6]:
# Clause text is the model input; the classification column is the target.
x = training_df['Clause Text'].values
y = training_df['Classification'].values

Split into a training and test set

In [7]:
# Hold out 20% of the labelled clauses; the fixed seed makes the split repeatable.
test_ratio = 0.2
random_state = 1234
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_ratio,
    random_state=random_state,
)

In [8]:
# Collect the distinct labels found in y. sorted() yields a stable ascending
# order, whereas iterating a bare set gives no ordering guarantee.
classes = sorted(set(y))
classes

[0, 1]

Use the ktrain library with DistilRoBERTa (`distilroberta-base`), one of many available transformer architectures and the one we empirically found to perform best for this task.

In [9]:
import ktrain
from ktrain import text

In [10]:
import ktrain
from ktrain import text
# Name of the pretrained checkpoint to fine-tune.
MODEL_NAME = 'distilroberta-base'
# ktrain Transformer wrapper for the checkpoint, created with maxlen=500
# (the logged 99th-percentile sequence lengths are 466 for train and 512 for test).
t = text.Transformer(MODEL_NAME, maxlen=500)
# NOTE(review): the recorded output of this cell contains a ktrain warning ending in
# "If this is incorrect, supply class_names argument." -- presumably ktrain infers the
# task type from the integer labels because no class_names were passed; confirm that
# this is the intended setup.
# Preprocess the training split and the held-out split (test_ratio is 0.2).
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
# Build the model and wrap it, with both datasets, in a ktrain learner (batch size 6).
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
# One-cycle training with a maximum learning rate of 3e-5; the second argument (1)
# is presumably the number of epochs -- TODO confirm against the ktrain API.
learner.fit_onecycle(3e-5, 1)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…


preprocessing train...
language: en
train sequence lengths:
	mean : 80
	95percentile : 262
	99percentile : 466


  'If this is incorrect, supply class_names argument.')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




preprocessing test...
language: en
test sequence lengths:
	mean : 81
	95percentile : 265
	99percentile : 512




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=487203636.0, style=ProgressStyle(descri…




begin training using onecycle policy with max lr of 3e-05...
Train for 1051 steps, validate for 50 steps


<tensorflow.python.keras.callbacks.History at 0x7f55b82e8ef0>

After training the model on 80% of the training data, we now measure the F1 score and the Brier score on the remaining 20%, which was held out as the test set. We also cap the predicted probabilities so that they lie between 0 and 1.

In [11]:
# Pair the trained model with its preprocessor and score the held-out clauses.
predictor = ktrain.get_predictor(learner.model, preproc=t)
raw_preds = predictor.predict(x_test)

# Clamp every score into the [0, 1] range.
raw_preds_filtered = [
    max(0, min(score, 1.0))
    for score in raw_preds
]

# A clause is labelled 1 when its raw score is strictly above the threshold, else 0.
threshold = 0.5
class_preds = [
    1 if score > threshold else 0
    for score in raw_preds
]


In [12]:
from sklearn.metrics import f1_score, brier_score_loss


In [13]:
# F1 score of the thresholded predictions against the held-out labels.
f1_score(y_true=y_test, y_pred=class_preds)

0.7412353923205343

In [14]:
# Clamp the raw scores into [0, 1] before they are passed to the Brier score.
raw_preds_filtered = [
    max(0, min(score, 1.0))
    for score in raw_preds
]

In [15]:
# Brier score of the clamped predictions against the held-out labels.
brier_score_loss(y_test, raw_preds_filtered)

0.0692921583346274

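Now we proceed to train our model on the full training dataset and run predictions on the validation dataset. Note that the cells shown below reuse the `predictor` built above (fitted on the 80% split); no cell that retrains on the full training dataset appears in this notebook.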

In [104]:
# Read the validation-set CSV into a DataFrame.
val_df = pd.read_csv(filepath_or_buffer='val_2.csv')

In [105]:
# Keep the clause identifiers alongside the clause text that will be scored.
id_val = val_df['Clause ID'].values
x_val = val_df['Clause Text'].values

In [106]:
# Display the validation clause texts.
x_val

array(["\tthe Customer does not make any admissions (save where required by court order or governmental regulations, and where the Customer is required under the terms of such order or regulations not to first consult with the Company) which may be prejudicial to the defense or settlement of any Claim without the Company's approval (not to be unreasonably withheld or delayed).",
       'Requests. Company will notify Customer before Customer exceeds the Tile Request Use Limit indicated on the Order Form. If Customer exceeds its Tile Request Use Limits during the License Term, Company will invoice Customer for Overages on written notice (which may be by email). If, after 30 days from the date of that written notice, Customer continues to exceed its Tile Request Use Limit, Company may stop providing the Service to the Customerinitiate a claim with the Contracting Officer under the Contract Disputes Act.',
       'We sometimes release beta versions of our website and related services, whic

In [107]:
def cap(x):
  """Clamp a predicted score to the closed interval [0.0, 1.0].

  Args:
    x: Numeric score to clamp.

  Returns:
    1.0 if x is greater than 1.0, 0.0 if x is less than 0.0, otherwise x
    itself, unchanged. A value for which both comparisons are false
    (e.g. NaN) is returned as-is.
  """
  if x > 1.0:
    return 1.0
  if x < 0.0:
    # Return a float here too, so both bounds produce the same type.
    return 0.0
  return x

In [108]:
# Score the validation clauses with the predictor built earlier in the notebook.
unacceptability_val_probability_preds = predictor.predict(x_val)
# Convert each score to a plain Python float and clamp it into [0, 1] in a single pass.
# (Doing the float conversion and the clamping as two separate assignments from the
# same source list would discard the conversion.)
unacceptability_val_probability_preds_filtered = [
    cap(float(score))
    for score in unacceptability_val_probability_preds
]
# Scores strictly above the threshold become 1 in the class predictions, otherwise 0.
threshold = 0.5
unacceptability_val_class_preds = [
    1 if score > threshold else 0
    for score in unacceptability_val_probability_preds_filtered
]
# Express the complement of each clamped score as a percentage.
acceptability_val_probability_preds = [
    100.0 * (1 - score)
    for score in unacceptability_val_probability_preds_filtered
]

In [109]:
# Attach both prediction columns to the validation frame, in this order.
new_columns = {
    'Prediction': unacceptability_val_class_preds,
    'Probability acceptable (percent)': acceptability_val_probability_preds,
}
for column_name, column_values in new_columns.items():
    val_df[column_name] = column_values


In [110]:
# Convert the clause IDs to plain Python ints. Only None entries are skipped:
# a truthiness test (`if i:`) would also discard a legitimate ID of 0 and leave
# this list shorter than the prediction lists it is later combined with.
id_val_copy = [int(clause_id) for clause_id in id_val if clause_id is not None]

In [111]:
# Number of clause IDs collected in id_val_copy.
len(id_val_copy)

1391

In [112]:
# Copy the class predictions as plain ints. The filter tests against None
# explicitly: a bare truthiness check (`if i:`) would silently drop every
# prediction equal to 0.
unacceptability_val_class_preds_copy = [
    int(pred)
    for pred in unacceptability_val_class_preds
    if pred is not None
]

In [113]:
# Rebuild the list of class predictions as plain ints, skipping only None entries.
unacceptability_val_class_preds_copy = [
    int(label)
    for label in unacceptability_val_class_preds
    if label is not None
]


In [114]:
# Copy the acceptability percentages as plain Python floats, skipping only None entries.
acceptability_val_probability_preds_copy = [
    float(percent)
    for percent in acceptability_val_probability_preds
    if percent is not None
]

In [124]:
import numpy as np
# Assemble the result table in one step, one column per output list.
df = pd.DataFrame({
    'Clause ID': np.array(id_val_copy),
    'Prediction': np.array(unacceptability_val_class_preds_copy),
    'Probability acceptable (percent)': np.array(acceptability_val_probability_preds_copy),
})


In [131]:
# Write the clause IDs to clause_id.txt, one per line.
with open('clause_id.txt', 'w') as out_file:
    out_file.writelines('%s\n' % clause_id for clause_id in id_val_copy)

In [132]:
# Write the class predictions to prediction.txt, one per line.
with open('prediction.txt', 'w') as out_file:
    out_file.writelines('%s\n' % label for label in unacceptability_val_class_preds_copy)

In [133]:
# Write the acceptability percentages to probability.txt, one per line.
with open('probability.txt', 'w') as out_file:
    out_file.writelines('%s\n' % percent for percent in acceptability_val_probability_preds_copy)