# Imports and inits

In [2]:
'''Installations'''

!pip install evaluate
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.0.0
  Downloadi

In [3]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer 

In [4]:
'''For Google colab '''

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
path = '/content/gdrive/MyDrive/CSCI 544/toyset10000.csv'

# Data init

In [None]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [None]:
data

Unnamed: 0,Word,Definition
0,Pagurus,type genus of the family Paguridae
1,vermicular,decorated with wormlike tracery or markings
2,swamp_blackberry,of eastern North America
3,genus_Ephestia,small moths whose larvae spin silken tunnels a...
4,Tweedle,"""To handle lightly; -- said with reference to ..."
...,...,...
34823,nonprofit,not commercially motivated
34824,Hymnal,"""A collection of hymns; a hymn book."""
34825,ick,an exclamation of disgust
34826,Camarasaurus,"""A genus of gigantic American Jurassic dinosau..."


In [None]:
data['Word'] = data['Word'].astype(str)

data['Word'] = data['Word'].apply(lambda x: x.lower())
data['Definition'] = data['Definition'].apply(str.lower)

data[~data['Word'].str.contains('\"')]
data = data.applymap(lambda x: x.replace('\"', ''))

data = data.explode('Definition')

In [None]:
data['Word'].unique()

array(['pagurus', 'vermicular', 'swamp_blackberry', ..., 'ick',
       'camarasaurus', 'first_of_all'], dtype=object)

In [None]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)
df_test, df_eval = train_test_split(df_test[['Definition','Word']], test_size=0.5)

# Model Init


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

In [8]:
max_input = 512
max_target = 1
batch_size = 3

# More data processing

In [None]:
# prefix = "summarize: "
prefix=''
def preprocess_data(data_to_process):
  #get all the definitions
  inputs = [prefix + dialogue for dialogue in data_to_process['Definition']]
  #tokenize the definitions
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  #tokenize the words
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['Word'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [None]:
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict

In [None]:
df_dict = Dataset.from_dict(df_train)

In [None]:
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [None]:
df_dict = DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [None]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 27862
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 3483
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 3483
    })
})

In [None]:
tokenize_data = df_dict.map(preprocess_data, batched=True)

Map:   0%|          | 0/802 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
args = Seq2SeqTrainingArguments(
    '/content/gdrive/MyDrive/CSCI 544/bart', #save directory
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    fp16=True #available only with CUDA
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,1.409677
2,No log,1.466841
2,1.331300,1.648482
4,1.331300,1.684494
4,0.221600,1.827444
6,0.221600,1.852194
6,0.221600,1.844785
8,0.036400,1.821427
8,0.036400,1.783263
9,0.006000,1.762667


TrainOutput(global_step=2000, training_loss=0.3988218041658401, metrics={'train_runtime': 496.1835, 'train_samples_per_second': 16.163, 'train_steps_per_second': 4.031, 'total_flos': 2438945832960000.0, 'train_loss': 0.3988218041658401, 'epoch': 9.98})

In [None]:
import re

# Checking accuracy

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained('/content/gdrive/MyDrive/USC/CSCI544/Project/carc 2/checkpoint-34500')
tokenizer = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/USC/CSCI544/Project/carc 2/checkpoint-34500')

model.cuda()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [6]:
data = pd.read_csv('/content/gdrive/MyDrive/USC/CSCI544/Project/carc 2/test_100.csv', dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [9]:
correct = 0
for idx, row in data.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 1)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Top 1 accuracy: ' + str(correct/len(data)))



Top 1 accuracy: 0.23


In [11]:
correct = 0
for idx, row in data.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Top 10 accuracy: ' + str(correct/len(data)))

Top 10 accuracy: 0.39


In [10]:
correct = 0
for idx, row in data.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 100, num_beams=100, num_beam_groups=50, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Top 100 accuracy: ' + str(correct/len(data)))

Top 100 accuracy: 0.48
