In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
sys.path.append("/content/drive/MyDrive/CS224N_Reverse_Dictionary-main/src")

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m109.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [4]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import functional as F
from tqdm import tqdm
import torch
import numpy as np
import time
import pandas as pd

from torch.utils.data import random_split, DataLoader, RandomSampler, SequentialSampler
from data import CustomDataset, dataset_tokenized, get_wordvecs
from models.model_2 import Model2

In [5]:
path = "/content/drive/MyDrive/CS224N_Reverse_Dictionary-main/toyset10000.csv"
df = pd.read_csv(path, dtype="string")
df['Definition'] = df['Definition'].astype(str)
df = df[['Word', 'Definition']]
df.shape

(34828, 2)

In [6]:
df[~df['Word'].str.contains('_')]
df['Word'] = df['Word'].apply(lambda x: x.lower())
df['Definition'] = df['Definition'].apply(str.lower)
df[~df['Word'].str.contains('\"')]
df = df.applymap(lambda x: x.replace('\"', ''))
df['Definition'] = df['Definition'].apply(lambda x : x.split(';'))
df = df.explode('Definition')
df.reset_index()
df

Unnamed: 0,Word,Definition
0,pagurus,type genus of the family paguridae
1,vermicular,decorated with wormlike tracery or markings
2,swamp_blackberry,of eastern north america
3,genus_ephestia,small moths whose larvae spin silken tunnels a...
4,tweedle,to handle lightly
...,...,...
34824,hymnal,a collection of hymns
34824,hymnal,a hymn book.
34825,ick,an exclamation of disgust
34826,camarasaurus,a genus of gigantic american jurassic dinosaur...


In [7]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [8]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# All 400000 embeddings, each lengh 100
import gensim.downloader as api

wv_from_bin = api.load("glove-wiki-gigaword-100")



In [10]:
contains_glove = []
for _, word in df['Word'].items():
  if word in wv_from_bin:
    contains_glove += [True]
  else:
    contains_glove += [False]
df = df[contains_glove]
df.shape

(32872, 2)

In [11]:
max_len = 0

# For every sentence...
# for sent in combined_data['definition']:

#     # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
#     input_ids = tokenizer.encode(sent, add_special_tokens=True)

#     # Update the maximum sentence length.
#     max_len = max(max_len, len(input_ids))
max_len = 200 # Just set it to 291 since there is no need to run this anymore
print('Max sentence length: ', max_len)

Max sentence length:  200


In [12]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in df['Definition']:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 200,           # Pad & truncate all sentences. Max length is 291 but most are not that long
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [14]:
torch.save(input_ids, 'input_ids_cleaned')
torch.save(attention_masks, 'attention_masks_cleaned')

In [15]:
import numpy as np
labels = []
count = 0
for word in df['Word']:
  try:
    label = wv_from_bin[word]
  except:
    count += 1
    label = [np.random.rand() * 2 - 1 for i in range(100)] # Random vector if word does not exist
  labels.append(label)
labels = np.array(labels)
labels = torch.tensor(labels)
print(count)

0


In [16]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 80-20 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(224))

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

26,297 training samples
6,575 validation samples


In [17]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 16

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset, generator=torch.Generator().manual_seed(224)), # Select batches pseudo randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [18]:
model = Model2()
model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model2(
  (bert_backend): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [19]:
from transformers import AdamW, BertConfig

# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [20]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
epochs = 10

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [21]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [23]:
loss_fn = torch.nn.MSELoss(reduction='sum')

In [24]:
import random
import numpy as np

seed_val = 224

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            torch.save(model.state_dict(), "model.params.f")
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()        

        output = model(b_input_ids, b_input_mask)
        batch_loss = loss_fn(output, b_labels.float())
        total_train_loss += batch_loss.item()
        batch_loss.double().backward()

        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader.dataset)            
    
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    top_count = 0
    top_10_count = 0
    top_100_count = 0
        
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad(): 
            outputs = model(b_input_ids, b_input_mask)
            total_eval_loss += loss_fn(outputs, b_labels.float())
        for i in range(outputs.size(0)): 
            top_100 = wv_from_bin.most_similar(positive=[outputs[i].cpu().numpy().astype(np.float32)], topn=100)
            top_10 = wv_from_bin.most_similar(positive=[outputs[i].cpu().numpy().astype(np.float32)], topn=10)[:10]
            # top_10 = top_100[:10]
            actual = wv_from_bin.most_similar(positive=[b_labels[i].cpu().numpy().astype(np.float32)], topn=1)[0][0]
            
            top_count += int(actual == top_10[0][0])
            top_10_count += int(actual in [pairs[0] for pairs in top_10])
            top_100_count += int(actual in [pairs[0] for pairs in top_100])

    avg_val_accuracy = top_count / len(validation_dataloader.dataset)
    print("  Top 1 Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_10_accuracy = top_10_count / len(validation_dataloader.dataset)
    print("  Top 10 Accuracy: {0:.2f}".format(avg_val_10_accuracy))
    avg_val_100_accuracy = top_100_count / len(validation_dataloader.dataset)
    print("  Top 100 Accuracy: {0:.2f}".format(avg_val_100_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader.dataset)

    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Top 1 Accur.': avg_val_accuracy,
            'Valid. Top 10 Accur.': avg_val_10_accuracy,
            'Valid. Top 100 Accur.': avg_val_100_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    50  of  1,644.    Elapsed: 0:00:26.
  Batch   100  of  1,644.    Elapsed: 0:00:54.
  Batch   150  of  1,644.    Elapsed: 0:01:23.
  Batch   200  of  1,644.    Elapsed: 0:01:54.
  Batch   250  of  1,644.    Elapsed: 0:02:25.
  Batch   300  of  1,644.    Elapsed: 0:02:55.
  Batch   350  of  1,644.    Elapsed: 0:03:26.
  Batch   400  of  1,644.    Elapsed: 0:03:57.
  Batch   450  of  1,644.    Elapsed: 0:04:28.
  Batch   500  of  1,644.    Elapsed: 0:04:58.
  Batch   550  of  1,644.    Elapsed: 0:05:29.
  Batch   600  of  1,644.    Elapsed: 0:06:00.
  Batch   650  of  1,644.    Elapsed: 0:06:31.
  Batch   700  of  1,644.    Elapsed: 0:07:01.
  Batch   750  of  1,644.    Elapsed: 0:07:33.
  Batch   800  of  1,644.    Elapsed: 0:08:03.
  Batch   850  of  1,644.    Elapsed: 0:08:34.
  Batch   900  of  1,644.    Elapsed: 0:09:05.
  Batch   950  of  1,644.    Elapsed: 0:09:35.
  Batch 1,000  of  1,644.    Elapsed: 0:10:06.
  Batch 1,050  of  1,644.    Elapsed: 0:10:37.


In [None]:
# Just running evaluation
print("")
print("Running Validation...")
top_count = 0
top_10_count = 0
top_100_count = 0
t0 = time.time()

# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

# Tracking variables 
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0

# Evaluate data for one epoch
for batch in validation_dataloader:
    
    # Unpack this training batch from our dataloader. 
    #
    # As we unpack the batch, we'll also copy each tensor to the GPU using 
    # the `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids 
    #   [1]: attention masks
    #   [2]: labels 
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    
    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        outputs = model(b_input_ids, b_input_mask)
        total_eval_loss += loss_fn(output, b_labels.float())

    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    for i in range(len(outputs)): # batch_size
      top_100 = wv_from_bin.most_similar(positive=[outputs[i].cpu().numpy().astype(np.float32)], topn=100)
      top_10 = wv_from_bin.most_similar(positive=[outputs[i].cpu().numpy().astype(np.float32)], topn=10)[:10]
      actual = wv_from_bin.most_similar(positive=[b_labels[i].cpu().numpy().astype(np.float32)], topn=1)[0][0]
      top_count += int(actual == top_10[0][0])
      top_10_count += int(actual in [pairs[0] for pairs in top_10])
      top_100_count += int(actual in [pairs[0] for pairs in top_100])
    print(top_count, top_10_count, top_100_count)
    

# Report the final accuracy for this validation run.
avg_val_accuracy = top_count / len(validation_dataloader.dataset)
print("  Top 1 Accuracy: {0:.2f}".format(avg_val_accuracy))
avg_val_10_accuracy = top_10_count / len(validation_dataloader.dataset)
print("  Top 10 Accuracy: {0:.2f}".format(avg_val_10_accuracy))
avg_val_100_accuracy = top_100_count / len(validation_dataloader.dataset)
print("  Top 100 Accuracy: {0:.2f}".format(avg_val_100_accuracy))

# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(validation_dataloader)

# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)

print("  Validation Loss: {0:.2f}".format(avg_val_loss))
print("  Validation took: {:}".format(validation_time))

In [None]:
avg_val_accuracy = top_count / len(validation_dataloader.dataset)
print("  Top 1 Accuracy: {0:.2f}".format(avg_val_accuracy))
avg_val_10_accuracy = top_10_count / len(validation_dataloader.dataset)
print("  Top 10 Accuracy: {0:.2f}".format(avg_val_10_accuracy))
avg_val_100_accuracy = top_100_count / len(validation_dataloader.dataset)
print("  Top 100 Accuracy: {0:.2f}".format(avg_val_100_accuracy))

# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(validation_dataloader)

# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)

print("  Validation Loss: {0:.2f}".format(avg_val_loss))
print("  Validation took: {:}".format(validation_time))