In [1]:
import torch
import pandas
import spacy
import json
import datetime
import dateutil.parser
from transformers import BertModel, BertConfig, AdamW, BertTokenizer, BertForSequenceClassification
nlp = spacy.load('en_core_web_lg')

device = torch.device("cuda:0")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name()

'GeForce GTX 1050 Ti'

In [2]:
def rawTmdbMovies(filename):
    return json.load(open(filename))

def indexableMovies(filename='tmdb.json'):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions """
    tmdbMovies = rawTmdbMovies(filename)
    for movieId, tmdbMovie in tmdbMovies.items():
        title = ''
        overview = ''
        vote_average = -1
        if 'vote_average' in tmdbMovie.keys():
            vote_average = tmdbMovie['vote_average']        
        if 'overview' in tmdbMovie.keys() and isinstance(tmdbMovie['overview'], str):
            overview = tmdbMovie['overview']        
        yield movieId,vote_average,overview.strip()

In [3]:
def getData(filename,goodness=6.1):
    sources = []
    labels = []
    notes = []
    sentences = []
    for movieid,vote_average,overview in indexableMovies(filename):
        if(len(overview) and vote_average>0):
            isgood=0
            isnote="*"
            if vote_average>=goodness:
                isgood=1
                isnote=None
            
            doc = nlp(overview)
            passage = "[CLS] "
            for sent in doc.sents:
                passage += sent.text + " [SEP]"
            
            sources.append('tmdb')
            labels.append(isgood)
            notes.append(isnote)
            sentences.append(passage)
            
    df = pandas.DataFrame({
            'source': sources,
            'label': labels,
            'label_notes': notes,
            'sentence': sentences
        })
    return df

In [4]:
df = getData('../tmdb.json',goodness=6.1)

In [5]:
df

Unnamed: 0,source,label,label_notes,sentence
0,tmdb,1,,[CLS] This feature-length special consists of ...
1,tmdb,1,,"[CLS] Raj is a rich, carefree, happy-go-lucky ..."
2,tmdb,1,,[CLS] Framed in the 1940s for the double murde...
3,tmdb,1,,[CLS] High schoolers Mitsuha and Taki are comp...
4,tmdb,1,,"[CLS] Spanning the years 1945 to 1955, a chron..."
5,tmdb,1,,[CLS] Dangal is an extraordinary true story ba...
6,tmdb,1,,[CLS] Under the direction of a ruthless instru...
7,tmdb,1,,[CLS] The true story of how businessman Oskar ...
8,tmdb,1,,[CLS] Spirited Away is an Oscar winning Japane...
9,tmdb,1,,[CLS] In the continuing saga of the Corleone c...


In [6]:
df.to_csv('tmdb-goodness.csv')

In [7]:
sentences = df.sentence.values
labels = df.label.values

In [8]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'this', 'feature', '-', 'length', 'special', 'consists', 'of', 'three', 'inter', '##wo', '##ven', 'stories', '.', '[SEP]', 'in', 'a', 'mysterious', 'and', 'remote', 'snowy', 'outpost', ',', 'matt', 'and', 'potter', 'share', 'a', 'christmas', 'meal', ',', 'swap', '##ping', 'creepy', 'tales', 'of', 'their', 'earlier', 'lives', 'in', 'the', 'outside', 'world', '.', '[SEP]', 'matt', 'is', 'a', 'charismatic', 'american', 'trying', 'to', 'bring', 'the', 'reserved', ',', 'secretive', 'potter', 'out', 'of', 'his', 'shell', '.', '[SEP]', 'but', 'are', 'both', 'men', 'who', 'they', 'appear', 'to', 'be', '?', '[SEP]', 'a', 'woman', 'gets', 'thrust', 'into', 'a', 'night', '##mar', '##ish', 'world', 'of', "'", 'smart', "'", 'ga', '##dget', '##ry', '.', '[SEP]', 'plus', 'a', 'look', 'at', 'what', 'would', 'happen', 'if', 'you', 'could', "'", 'block', "'", 'people', 'in', 'real', 'life', '.', '[SEP]']


In [9]:
maxlen = max(map(lambda x: len(x),sentences))
maxlen

1086

In [10]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [11]:
# Pad our input tokens
from keras_preprocessing.sequence import pad_sequences
input_ids = pad_sequences(input_ids, maxlen=maxlen, dtype="long", truncating="post", padding="post")

In [12]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [13]:
# Use train_test_split to split our data into train and validation sets for training
from sklearn.model_selection import train_test_split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [14]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [16]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 4

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [17]:
model = BertForSequenceClassification.from_pretrained(pretrained_weights, num_labels=2)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [18]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [19]:
from transformers import WarmupLinearSchedule
lr=2e-5
max_grad_norm = 1.0
num_total_steps = 1000
num_warmup_steps = 100

# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler

In [20]:
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
import io
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    scheduler.step()

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss, logits = outputs[:2]
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


RuntimeError: cublas runtime error : library not initialized at /tmp/pip-req-build-58y_cjjl/aten/src/THC/THCGeneral.cpp:216

In [18]:
print("Train loss: {}".format(tr_loss/nb_tr_steps))

Train loss: 0.416932628107627


In [2]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertForSequenceClassification.from_pretrained(pretrained_weights)

In [3]:
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

pred_1 = model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
pred_2 = model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()

print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")

sentence_1 is not a paraphrase of sentence_0
sentence_2 is not a paraphrase of sentence_0


In [4]:
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

In [5]:
loss

tensor(0.4957, grad_fn=<NllLossBackward>)

In [6]:
logits

tensor([[-0.1769,  0.2668]], grad_fn=<AddmmBackward>)