In [2]:
import torch
from transformers import BertTokenizer
from transformers import BertModel, BertConfig, BertForSequenceClassification
from transformers import AdamW
from tqdm.autonotebook import tqdm
import pandas as pd
import numpy as np
import boto3
from sagemaker import get_execution_role
tqdm.pandas()

In [6]:
# Training frame stored as a zipped pickle; the 'summary' and 'issueArea'
# columns are read from it further down.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
SEMI_TRAINING_PATH = 'semi_super_training_data_with_use_vectors_80_with_summary.zip'
semi_training_df = pd.read_pickle(SEMI_TRAINING_PATH, compression='zip')

In [7]:
# Labeled records (zipped pickle); they are split into a training half and a
# held-out test half in the next cell.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
LABELED_DATA_PATH = 'training_data_with_use_vectors_with_summary.zip'
labeled_test_df = pd.read_pickle(LABELED_DATA_PATH, compression='zip')
# Row count, as a quick sanity check on the load.
print(labeled_test_df.shape[0])

1571


In [8]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition -- and therefore the held-out test
# pickle written from `test_labeled` -- is identical on every
# Restart & Run All.
SPLIT_SEED = 42

# Split the fuzzy matched records in half: one half is mixed in with the
# training data, the other half is kept exclusively for testing.
train_labeled, test_labeled = train_test_split(
    labeled_test_df,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [9]:
# Training inputs/labels: every row of the semi-supervised frame followed by
# the training half of the labeled frame (same order for text and labels).
train_text_list = [
    *semi_training_df['summary'].tolist(),
    *train_labeled['summary'].tolist(),
]
train_labels = np.array([
    *semi_training_df['issueArea'].tolist(),
    *train_labeled['issueArea'].tolist(),
])

# Held-out inputs/labels: the other half of the labeled frame only.
test_text_list = test_labeled['summary'].tolist()
test_labels = np.array(test_labeled['issueArea'].tolist())

In [10]:
# Pair each held-out summary with its label and persist the pairs so the
# exact unseen test set can be reloaded later.
TEST_COLUMNS = ['summary', 'label']
test_data = [(text, label) for text, label in zip(test_text_list, test_labels)]
test_df = pd.DataFrame(test_data, columns=TEST_COLUMNS)
test_df.to_pickle('test_data_unseen_bert.zip', compression='zip')
test_df.head()

Unnamed: 0,summary,label
0,[{'summary_text': 'Petitioner might have been ...,9.0
1,[{'summary_text': 'Petitioner was convicted of...,1.0
2,[{'summary_text': 'United States Supreme Court...,1.0
3,"[{'summary_text': 'Petitioner, a Negro with a ...",1.0
4,[{'summary_text': 'The Telecommunications Act ...,8.0


In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

In [12]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
BERT_MODEL = 'bert-base-cased'
# Size of the classification head's output layer.
num_labels = 14

# Casing is preserved (do_lower_case=False) to match the cased checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    do_lower_case=False,
)
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=num_labels,
)

In [13]:
# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU
# so the notebook still runs (slowly) instead of raising on `model.to`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [14]:
# Maximum number of tokens kept per example, counting the '[CLS]' and '[SEP]'
# markers added by pad_special_tokens. This is a choice made for this
# notebook, not a model default.
MAX_SEQ_LENGTH = 256


def pad_sequences(pad_token, seq_list, max_length):
    """Return a new list: ``seq_list`` right-padded with ``pad_token``.

    Padding is added until the result has ``max_length`` items. A list that
    is already ``max_length`` items or longer comes back unpadded and is NOT
    truncated.
    """
    return seq_list + [pad_token] * (max_length - len(seq_list))


def pad_special_tokens(tokenized_text_sent):
    """Wrap a token list in '[CLS]' ... '[SEP]', truncating to fit.

    At most ``MAX_SEQ_LENGTH - 2`` of the leading tokens are kept so that the
    result never exceeds ``MAX_SEQ_LENGTH`` items.

    A new list is always returned and the argument is left untouched.
    Previously the markers were inserted into the caller's list in place
    whenever no truncation was needed, so a second call on the same list
    produced duplicated markers.
    """
    kept_tokens = tokenized_text_sent[:MAX_SEQ_LENGTH - 2]
    return ['[CLS]', *kept_tokens, '[SEP]']

def tokenize_sentence(summary_text):
    """Tokenize one summary record and add the '[CLS]'/'[SEP]' markers.

    ``summary_text`` is indexed as ``summary_text[0]['summary_text']``: its
    first element must be a mapping that holds the raw text under the
    'summary_text' key.
    """
    raw_text = summary_text[0]['summary_text']
    word_pieces = tokenizer.tokenize(raw_text)
    return pad_special_tokens(word_pieces)

# Tokenize every training summary; show the first one as a spot check.
tokenized_seq = [tokenize_sentence(summary) for summary in train_text_list]
print(tokenized_seq[0])

['[CLS]', 'West', '##ing', '##house', 'Electric', '&', 'Manufacturing', 'Company', 'owned', 'Jones', "'", 'Patent', 'No', '.', '1', ',', '65', '##1', ',', '70', '##9', '.', 'The', 'invention', 'claimed', 'was', 'a', 'bra', '##zing', "'", 'sold', '##er', 'comprising', 'copper', 'and', 'p', '##hos', '##ph', '##orous', 'as', 'the', 'main', 'and', 'essential', 'constituents', "'", 'West', '##ing', '##house', 'sued', 'Mac', '##G', '##re', '##gor', 'for', 'infringement', '.', 'The', 'litigation', 'was', 'settled', ',', 'and', 'Mac', 'Gregor', 'took', 'a', 'license', 'from', 'West', '##ing', 'house', 'author', '##izing', 'Mac', '##G', '##re', '##gor', '##s', 'to', 'make', ',', 'use', ',', 'and', 'sell', 'sold', '##er', 'containing', 'the', 'constituents', 'described', 'in', 'the', 'patent', 'claim', '.', 'Mac', 'Gregor', '##s', 'paid', 'royal', '##ties', 'on', 'sold', '##er', 'he', 'made', 'and', 'sold', 'which', 'contained', 'only', 'p', '##hos', '##ph', '##orous', 'and', 'copper', '.', 'Lat

In [15]:
def convert_tokens_to_tensor(tokenized_sentence_a):
    """Turn one token list into three padded integer lists.

    Returns ``[input_ids, token_type_ids, input_mask]``. Each list is padded
    with 0 up to ``MAX_SEQ_LENGTH``. Despite the function's name, plain
    Python lists are returned, not tensors.
    """
    tokens = list(tokenized_sentence_a)

    # Vocabulary ids of the tokens.
    input_ids = pad_sequences(
        0, tokenizer.convert_tokens_to_ids(tokens), MAX_SEQ_LENGTH)

    # Single-sentence input: every position gets segment id 0.
    segment_ids = pad_sequences(
        0, [0] * len(tokenized_sentence_a), MAX_SEQ_LENGTH)

    # Mask is 1 over the real tokens and 0 over the padding.
    attention_mask = pad_sequences(0, [1] * len(tokens), MAX_SEQ_LENGTH)

    return [input_ids, segment_ids, attention_mask]

# One [input_ids, token_type_ids, input_mask] triple per training example.
converted_tensors = [convert_tokens_to_tensor(tokens) for tokens in tokenized_seq]
print(converted_tensors[0])

[[101, 1537, 1158, 3255, 6763, 111, 17023, 1881, 2205, 2690, 112, 16653, 1302, 119, 122, 117, 2625, 1475, 117, 3102, 1580, 119, 1109, 11918, 2694, 1108, 170, 12418, 6185, 112, 1962, 1200, 9472, 7335, 1105, 185, 15342, 7880, 18133, 1112, 1103, 1514, 1105, 6818, 27808, 112, 1537, 1158, 3255, 13187, 6603, 2349, 1874, 18791, 1111, 23040, 119, 1109, 18184, 1108, 3035, 117, 1105, 6603, 18123, 1261, 170, 5941, 1121, 1537, 1158, 1402, 2351, 4404, 6603, 2349, 1874, 18791, 1116, 1106, 1294, 117, 1329, 117, 1105, 4582, 1962, 1200, 4051, 1103, 27808, 1758, 1107, 1103, 8581, 3548, 119, 6603, 18123, 1116, 3004, 4276, 4338, 1113, 1962, 1200, 1119, 1189, 1105, 1962, 1134, 4049, 1178, 185, 15342, 7880, 18133, 1105, 7335, 119, 2611, 1119, 1310, 1106, 1294, 1105, 4582, 1962, 1468, 2766, 1104, 185, 15342, 7880, 18133, 117, 7335, 117, 1105, 14086, 119, 1124, 1173, 3666, 1111, 1105, 3836, 15674, 1113, 1292, 1160, 2985, 1962, 1468, 1134, 1896, 14086, 1105, 2878, 3569, 1106, 1103, 185, 15342, 7880, 18133, 118

In [16]:

# Each entry of converted_tensors is [input_ids, token_type_ids, input_mask];
# stack each component across all examples into one tensor.
all_input_ids = torch.tensor([ids for ids, _, _ in converted_tensors])
all_input_mask = torch.tensor([mask for _, _, mask in converted_tensors])
all_segment_ids = torch.tensor([segments for _, segments, _ in converted_tensors])
# NOTE(review): labels are cast to int64 and used directly as class indices;
# confirm every value lies in [0, num_labels - 1] and none is NaN.
all_labels = torch.tensor(train_labels, dtype=torch.long)


In [17]:
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)

In [18]:
import math

# Optimisation hyper-parameters.
LEARNING_RATE = 2e-5
MAX_GRAD_NORM = 1.0      # gradient-norm clipping threshold
NUM_TRAIN_EPOCHS = 20
TRAIN_BATCH_SIZE = 10
num_warmup_steps = 100   # scheduler warm-up steps

# Total optimizer steps = batches per epoch (last partial batch included)
# times the number of epochs.
train_examples_len = len(all_input_ids)
steps_per_epoch = math.ceil(train_examples_len / TRAIN_BATCH_SIZE)
num_train_optimization_steps = steps_per_epoch * NUM_TRAIN_EPOCHS

In [19]:
# Tensor order here (ids, mask, segments, labels) is the order in which the
# training loop unpacks each batch.
train_data = TensorDataset(
    all_input_ids,
    all_input_mask,
    all_segment_ids,
    all_labels,
)
# Examples are drawn in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=TRAIN_BATCH_SIZE,
    sampler=train_sampler,
)

In [20]:
# Total number of optimizer steps the scheduler is configured for.
print(num_train_optimization_steps)

9200


In [21]:
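# NOTE(review): this per-parameter weight-decay grouping is commented out and
# unused; the optimizer is built from model.parameters() directly.
# param_optimizer = list(model.named_parameters())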
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
#     ]

In [22]:
from transformers import get_linear_schedule_with_warmup

# A single parameter group: every model parameter shares the same settings.
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    correct_bias=False,
)
# Learning-rate schedule spanning the full planned number of optimizer steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_optimization_steps,
)

In [23]:
# Summary of the run that is about to start.
print("***** Running training *****")
print(f"Num examples = {train_examples_len}")
print(f"Batch size = {TRAIN_BATCH_SIZE}")
print(f"Num steps = {num_train_optimization_steps}")

***** Running training *****
Num examples = 4600
Batch size = 10
Num steps = 9200


In [24]:
# Running counters for the training loop: optimizer steps taken overall,
# steps in the current epoch, and the summed loss.
global_step, nb_tr_steps, tr_loss = 0, 0, 0

In [25]:
# `tqdm_notebook` and `CrossEntropyLoss` are no longer used in this cell; the
# imports are kept in case other cells rely on these names.
from tqdm import tqdm_notebook, trange
from tqdm.auto import tqdm
from torch.nn import CrossEntropyLoss

# Fine-tune the classifier: one optimizer + scheduler step per batch.
model.train()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    # Per-epoch accumulators (reset at the start of every epoch).
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # tqdm.auto replaces the deprecated tqdm_notebook wrapper.
    for batch in tqdm(train_dataloader, desc="Iteration"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # With `labels` supplied, the first element of the model output is
        # the cross-entropy loss, so it is used directly rather than being
        # recomputed from the logits. Integer indexing works for both tuple
        # and dict-like model outputs.
        outputs = model(
            input_ids,
            token_type_ids=segment_ids,
            attention_mask=input_mask,
            labels=label_ids,
        )
        loss = outputs[0]

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

        # Convert to a Python float once; reuse for display and the sum.
        step_loss = loss.item()
        print("\r%f" % step_loss, end='')
        tr_loss += step_loss
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        # No gradient accumulation: update after every batch.
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
        global_step += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

1.215272

Epoch:   5%|▌         | 1/20 [07:01<2:13:37, 422.00s/it]

1.749528


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

1.200958

Epoch:  10%|█         | 2/20 [14:06<2:06:47, 422.66s/it]

0.427441


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

1.657792

Epoch:  15%|█▌        | 3/20 [21:10<1:59:52, 423.12s/it]

0.973304


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.326275

Epoch:  20%|██        | 4/20 [28:14<1:52:54, 423.41s/it]

1.098479


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.233220

Epoch:  25%|██▌       | 5/20 [35:18<1:45:53, 423.57s/it]

0.242839


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.012923

Epoch:  30%|███       | 6/20 [42:22<1:38:51, 423.70s/it]

0.298591


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.727802

Epoch:  35%|███▌      | 7/20 [49:25<1:31:47, 423.63s/it]

0.095708


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.032180

Epoch:  40%|████      | 8/20 [56:28<1:24:40, 423.41s/it]

0.577277


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.162220

Epoch:  45%|████▌     | 9/20 [1:03:31<1:17:34, 423.12s/it]

0.016323


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000992

Epoch:  50%|█████     | 10/20 [1:10:33<1:10:28, 422.81s/it]

0.005413


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000306

Epoch:  55%|█████▌    | 11/20 [1:17:34<1:03:21, 422.44s/it]

0.001605


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000775

Epoch:  60%|██████    | 12/20 [1:24:36<56:16, 422.11s/it]  

0.000523


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000666

Epoch:  65%|██████▌   | 13/20 [1:31:37<49:13, 421.88s/it]

0.000335


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000190

Epoch:  70%|███████   | 14/20 [1:38:39<42:10, 421.76s/it]

0.000441


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000251

Epoch:  75%|███████▌  | 15/20 [1:45:40<35:08, 421.65s/it]

0.002726


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000674

Epoch:  80%|████████  | 16/20 [1:52:41<28:06, 421.54s/it]

0.000262


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000127

Epoch:  85%|████████▌ | 17/20 [1:59:42<21:04, 421.42s/it]

0.000810


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.001008

Epoch:  90%|█████████ | 18/20 [2:06:43<14:02, 421.23s/it]

0.000245


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000508

Epoch:  95%|█████████▌| 19/20 [2:13:44<07:01, 421.08s/it]

0.000162


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=460.0, style=ProgressStyle(description_wi…

0.000455

Epoch: 100%|██████████| 20/20 [2:20:45<00:00, 422.25s/it]

0.000406





In [26]:
import os
from transformers import WEIGHTS_NAME, CONFIG_NAME

# Save the trained model, its config and the tokenizer vocabulary.
model_out = 'scdb_train_tuned_model'
# Create the output directory if needed; without it torch.save below fails
# on a fresh checkout. exist_ok makes the cell safe to re-run.
os.makedirs(model_out, exist_ok=True)

# Unwrap a parallel wrapper if present, so only the model itself is saved.
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(model_out, WEIGHTS_NAME)
output_config_file = os.path.join(model_out, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
vocab = tokenizer.save_vocabulary(model_out)