# Imports and inits

In [1]:
'''Installations'''

!pip install evaluate
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer 

In [3]:
'''For Google colab '''

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [42]:
path = '/content/gdrive/MyDrive/CSCI 544/toyset300.csv'

# Data init

In [43]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [44]:
data

Unnamed: 0,Word,Definition
0,gamecock,a rooster trained for cockfighting
1,gamecock,a fighting cock a rooster used in cockfighting
2,gamecock,the male game fowl
3,gamecock,a cock bred from a fighting stock or strain a ...
4,gamecock,someone who is a very fierce fighter
...,...,...
998,Pentoic,"""Pertaining to or desingating an acid (called..."
999,low-lying,having a small elevation above the ground or h...
1000,Extruded,"""of Extrude"""
1001,high_life,excessive spending


In [45]:
data['Word'].unique()

<StringArray>
[         'gamecock',       'Nuttall_oak',  'neuroepithelioma',
     'Niggardliness',          'Tenpenny',          'Compress',
   'lesser_galangal',          'go_after',            'Esnecy',
         'Novelette',
 ...
         'euclidean',   'Physiologically',        'frightened',
           'outwear', 'false_dragon_head',           'Pentoic',
         'low-lying',          'Extruded',         'high_life',
      'dance_lesson']
Length: 300, dtype: string

In [46]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)
df_test, df_eval = train_test_split(df_test[['Definition','Word']], test_size=0.5)

# Model Init


In [47]:
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

In [48]:
max_input = 512
max_target = 1
batch_size = 3

# More data processing

In [50]:
# prefix = "summarize: "
prefix=''
def preprocess_data(data_to_process):
  #get all the definitions
  inputs = [prefix + dialogue for dialogue in data_to_process['Definition']]
  #tokenize the definitions
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  #tokenize the words
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['Word'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [51]:
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict

In [52]:
df_dict = Dataset.from_dict(df_train)

In [53]:
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [54]:
df_dict = DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [55]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 802
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 101
    })
})

In [56]:
tokenize_data = df_dict.map(preprocess_data, batched=True)

Map:   0%|          | 0/802 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [57]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [59]:
args = Seq2SeqTrainingArguments(
    '/content/gdrive/MyDrive/CSCI 544/bart', #save directory
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    fp16=True #available only with CUDA
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
)

In [60]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,1.409677
2,No log,1.466841
2,1.331300,1.648482
4,1.331300,1.684494
4,0.221600,1.827444
6,0.221600,1.852194
6,0.221600,1.844785
8,0.036400,1.821427
8,0.036400,1.783263
9,0.006000,1.762667


TrainOutput(global_step=2000, training_loss=0.3988218041658401, metrics={'train_runtime': 496.1835, 'train_samples_per_second': 16.163, 'train_steps_per_second': 4.031, 'total_flos': 2438945832960000.0, 'train_loss': 0.3988218041658401, 'epoch': 9.98})

In [68]:
import re

# Checking accuracy

In [85]:
# model = AutoModelForSeq2SeqLM.from_pretrained('/content/gdrive/MyDrive/CSCI 544/bart/checkpoint-1500')
# tokenizer = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/CSCI 544/bart/checkpoint-1500')

In [86]:
input_ids = tokenizer(df_test.iloc[0]['Definition'], return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)

In [88]:
[tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]

['Lethargical',
 'Zymology',
 'Trillachan',
 'Composer',
 'Butchering',
 'Transplant',
 'Mittimus',
 'Letharggin',
 ' laundering',
 'Trillessee']

In [82]:
correct = 0
for idx, row in df_train.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Train accuracy: ' + str(correct/len(df_train)))



Train accuracy: 0.9538653366583542


In [83]:
correct = 0
for idx, row in df_eval.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Validation accuracy: ' + str(correct/len(df_eval)))

Validation accuracy: 0.6831683168316832


In [84]:
correct = 0
for idx, row in df_test.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Test accuracy: ' + str(correct/len(df_test)))

Test accuracy: 0.73
