# Imports and inits

In [1]:
'''Installations'''

!pip install evaluate
!pip install transformers
!pip install sentencepiece
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, GenerationConfig, PegasusForConditionalGeneration, PegasusTokenizer 

In [3]:
'''For Google colab '''

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
path = '/content/gdrive/MyDrive/CSCI 544/toyset300.csv'

# Data init

In [7]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [8]:
data

Unnamed: 0,Word,Definition
0,gamecock,a rooster trained for cockfighting
1,gamecock,a fighting cock a rooster used in cockfighting
2,gamecock,the male game fowl
3,gamecock,a cock bred from a fighting stock or strain a ...
4,gamecock,someone who is a very fierce fighter
...,...,...
998,Pentoic,"""Pertaining to or desingating an acid (called..."
999,low-lying,having a small elevation above the ground or h...
1000,Extruded,"""of Extrude"""
1001,high_life,excessive spending


In [9]:
data['Word'].unique()

<StringArray>
[         'gamecock',       'Nuttall_oak',  'neuroepithelioma',
     'Niggardliness',          'Tenpenny',          'Compress',
   'lesser_galangal',          'go_after',            'Esnecy',
         'Novelette',
 ...
         'euclidean',   'Physiologically',        'frightened',
           'outwear', 'false_dragon_head',           'Pentoic',
         'low-lying',          'Extruded',         'high_life',
      'dance_lesson']
Length: 300, dtype: string

In [10]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)
df_test, df_eval = train_test_split(df_test[['Definition','Word']], test_size=0.5)

# Model Init


In [11]:
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')

In [12]:
max_input = 128
max_target = 1
batch_size = 4

# More data processing

In [13]:
# prefix = "summarize: "
prefix=''
def preprocess_data(data_to_process):
  #get all the definitions
  inputs = [prefix + dialogue for dialogue in data_to_process['Definition']]
  #tokenize the definitions
  model_inputs = tokenizer(inputs,  max_length=max_input, padding=True, truncation=True)
  #tokenize the words
  
  with tokenizer.as_target_tokenizer():
    targets = tokenizer([d for d in data_to_process['Word']], padding=True, truncation=True)
    
  #set labels
  targets["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in targets["input_ids"]
        ]
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [14]:
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict

In [15]:
df_dict = Dataset.from_dict(df_train)

In [16]:
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [17]:
df_dict = DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [18]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 802
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 101
    })
})

In [19]:
tokenize_data = df_dict.map(preprocess_data, batched=True,remove_columns=["Definition", "Word"])

Map:   0%|          | 0/802 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [20]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id= -100)

In [21]:
training_args = TrainingArguments(
      output_dir='pegasus',           # output directory
      num_train_epochs=10,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=500,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

trainer = Trainer(
  model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=tokenize_data['train'],         # training dataset
  eval_dataset=tokenize_data['validation'],            # evaluation dataset
  tokenizer=tokenizer
)

In [22]:
trainer.train()



Step,Training Loss,Validation Loss
500,2.4133,3.182317
1000,1.2182,2.272161
1500,0.7547,2.496849
2000,1.5802,2.198765
2500,1.3773,2.344773
3000,0.5194,2.537253
3500,0.7335,2.590166
4000,0.0008,2.735644
4500,0.1954,3.056968
5000,0.0017,3.151531


TrainOutput(global_step=8020, training_loss=0.7981399659986762, metrics={'train_runtime': 3524.0839, 'train_samples_per_second': 2.276, 'train_steps_per_second': 2.276, 'total_flos': 1991473051729920.0, 'train_loss': 0.7981399659986762, 'epoch': 10.0})

In [None]:
!nvidia-smi

Mon Apr 10 21:52:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 526.56       Driver Version: 526.56       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   56C    P8    13W /  N/A |   5858MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import re

# Checking accuracy

In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained('/content/gdrive/MyDrive/CSCI 544/bart/checkpoint-1500')
# tokenizer = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/CSCI 544/bart/checkpoint-1500')

In [None]:
# generation_config = GenerationConfig(max_length=2,num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3, )

In [23]:
df_test.iloc[3]['Definition'], df_test.iloc[3]['Word']

('an operation in which tissue or an organ is transplanted', 'transplant')

In [30]:
input_ids = tokenizer('large body of water',return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)

In [31]:
[tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]

['river_head',
 'large_body',
 'island',
 'undertide',
 'islandr',
 'noggin',
 'islet',
 'sea turtle',
 'islander',
 'bodyboarder']

In [32]:
correct = 0
for idx, row in df_train.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Train accuracy: ' + str(correct/len(df_train)))

Train accuracy: 0.9576059850374065


In [33]:
correct = 0
for idx, row in df_eval.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Validation accuracy: ' + str(correct/len(df_eval)))

Validation accuracy: 0.6237623762376238


In [34]:
correct = 0
for idx, row in df_test.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Test accuracy: ' + str(correct/len(df_test)))

Test accuracy: 0.7
