# Imports and inits

In [1]:
'''Installations'''

!pip install evaluate
!pip install transformers
!pip install sentencepiece
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m4.9 MB/s[0m eta [3

In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, GenerationConfig, PegasusForConditionalGeneration, PegasusTokenizer 

In [3]:
# For Google Colab: mount Google Drive under /content/gdrive so the CSV files
# and checkpoints referenced below are reachable. force_remount=True re-mounts
# even when the drive is already mounted.
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
# Location of the training CSV on the mounted Google Drive; read by the data-loading cell.
path = '/content/gdrive/MyDrive/CSCI 544/toyset10000.csv'

# Data init

In [5]:
# Read every column with pandas' nullable "string" dtype, then cast the
# 'Definition' column to plain Python str objects.
data = pd.read_csv(path, dtype="string").astype({'Definition': str})

In [6]:
# Show the loaded frame as the cell output (sanity check of columns and row count).
data

Unnamed: 0,Word,Definition
0,Pagurus,type genus of the family Paguridae
1,vermicular,decorated with wormlike tracery or markings
2,swamp_blackberry,of eastern North America
3,genus_Ephestia,small moths whose larvae spin silken tunnels a...
4,Tweedle,"""To handle lightly; -- said with reference to ..."
...,...,...
34823,nonprofit,not commercially motivated
34824,Hymnal,"""A collection of hymns; a hymn book."""
34825,ick,an exclamation of disgust
34826,Camarasaurus,"""A genus of gigantic American Jurassic dinosau..."


In [7]:
# Distinct values of the 'Word' column.
data['Word'].unique()

<StringArray>
[           'Pagurus',         'vermicular',   'swamp_blackberry',
     'genus_Ephestia',            'Tweedle',        'steelmaking',
        'on_the_spot',             'Contex',            'Everych',
 'biological_science',
 ...
       'perphenazine',           'Immantle',         'stubbiness',
    'Emeto-cathartic',           'Pin-eyed',          'nonprofit',
             'Hymnal',                'ick',       'Camarasaurus',
       'first_of_all']
Length: 10000, dtype: string

In [8]:
# Normalise the two text columns: plain str objects, lower-cased, with every
# double-quote character removed. Only 'Word' and 'Definition' are touched
# because they are the only columns selected for the train/test split.
# regex=False makes the replacement a literal character match.
for column in ('Word', 'Definition'):
    data[column] = (
        data[column]
        .astype(str)
        .str.lower()
        .str.replace('"', '', regex=False)
    )

In [9]:
# 80/10/10 split into train / test / validation.
# A fixed random_state makes the split reproducible across kernel restarts, so
# reported metrics always refer to the same held-out rows.
df_train, df_holdout = train_test_split(data[['Definition', 'Word']], test_size=0.2, random_state=42)
df_test, df_eval = train_test_split(df_holdout, test_size=0.5, random_state=42)

# Model Init


In [None]:
# Starting point for fine-tuning: tokenizer and model are both loaded from the
# same pretrained identifier, 'google/pegasus-large'.
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large')

In [18]:
# Maximum token length of a tokenized definition (passed as max_length with truncation).
max_input = 128
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
max_target = 1
# NOTE(review): not referenced by any visible cell; TrainingArguments uses a per-device batch size of 1.
batch_size = 4

# More data processing

In [None]:
# Text prepended to every definition before tokenization
# (alternative kept for reference: "summarize: ").
prefix = ''


def preprocess_data(data_to_process):
  """Tokenize a batch of (definition, word) pairs for seq2seq training.

  Args:
    data_to_process: mapping with 'Definition' and 'Word' entries, each an
      iterable of strings (the batch handed over by a batched `map`).

  Returns:
    The tokenizer output for the definitions (input_ids, attention_mask) with
    an added 'labels' entry holding the token ids of the words, where padding
    positions are replaced by -100.
  """
  # Definitions are truncated to max_input tokens and padded to the longest
  # sequence in this batch.
  definitions = [prefix + definition for definition in data_to_process['Definition']]
  model_inputs = tokenizer(definitions, max_length=max_input, padding=True, truncation=True)

  # `text_target=` tokenizes the words in target mode; it supersedes the
  # deprecated `tokenizer.as_target_tokenizer()` context manager.
  targets = tokenizer(text_target=list(data_to_process['Word']), padding=True, truncation=True)

  # Padding positions in the labels are marked with -100, the label value that
  # Hugging Face models exclude from the loss.
  pad_id = tokenizer.pad_token_id
  model_inputs['labels'] = [
      [(token_id if token_id != pad_id else -100) for token_id in label_ids]
      for label_ids in targets['input_ids']
  ]
  return model_inputs

In [10]:
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict

In [11]:
# Wrap the training split in a `datasets.Dataset`.
df_dict = Dataset.from_dict(df_train)

In [12]:
# Same conversion for the test and validation splits.
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [13]:
# Bundle the three splits under the keys "train", "test" and "validation".
# NOTE: this rebinds `df_dict`, which until this line held only the training
# Dataset — re-running this cell without re-running the previous ones nests a
# DatasetDict inside itself.
df_dict = DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [14]:
# Show the splits with their features and row counts.
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 27862
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 3483
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 3483
    })
})

In [None]:
# Tokenize every split in batches. The raw text columns are dropped so that
# only the entries returned by preprocess_data remain.
tokenize_data = df_dict.map(
    preprocess_data,
    batched=True,
    remove_columns=["Definition", "Word"],
)

Map:   0%|          | 0/802 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Fetch NLTK's "punkt" data, which sent_tokenize (used in postprocess_text) relies on.
nltk.download("punkt")

# Metric: ROUGE, loaded through the `evaluate` library.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A `(preds, labels)` pair of lists with the reformatted strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text))

    return (
        [_one_sentence_per_line(text) for text in stripped_preds],
        [_one_sentence_per_line(text) for text in stripped_labels],
    )

def compute_metrics(eval_preds):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may be a
            tuple, in which case its first element is used.

    Returns:
        dict with every ROUGE result multiplied by 100 and rounded to 4
        decimals, plus "gen_len": the mean number of non-padding tokens per
        prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded, so it is swapped for
    # the pad id in the predictions as well as in the labels.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing (strip + one sentence per line).
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id= -100)

In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir='pegasus',
    logging_dir='./logs',
    # schedule
    num_train_epochs=10,
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,
    # batch sizes per device; can be increased if memory allows
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # evaluate and checkpoint every 500 update steps, keeping at most 5 checkpoints
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
    save_total_limit=5,
    # log every 10 steps
    logging_steps=10,
)

# NOTE(review): no compute_metrics is passed here, so only the loss is reported
# during evaluation — confirm that the compute_metrics helper defined earlier
# is intentionally unused.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured above; the cell output is the returned TrainOutput.
trainer.train()



Step,Training Loss,Validation Loss
500,2.4133,3.182317
1000,1.2182,2.272161
1500,0.7547,2.496849
2000,1.5802,2.198765
2500,1.3773,2.344773
3000,0.5194,2.537253
3500,0.7335,2.590166
4000,0.0008,2.735644
4500,0.1954,3.056968
5000,0.0017,3.151531


TrainOutput(global_step=8020, training_loss=0.7981399659986762, metrics={'train_runtime': 3524.0839, 'train_samples_per_second': 2.276, 'train_steps_per_second': 2.276, 'total_flos': 1991473051729920.0, 'train_loss': 0.7981399659986762, 'epoch': 10.0})

In [None]:
# Shell out to nvidia-smi to inspect GPU memory and utilisation.
!nvidia-smi

Mon Apr 10 21:52:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 526.56       Driver Version: 526.56       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   56C    P8    13W /  N/A |   5858MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
# Replace the in-memory model and tokenizer with a fine-tuned checkpoint stored
# on Google Drive.
# NOTE(review): this checkpoint directory is not the output_dir used by the
# training cell — presumably it comes from a separate, longer run; confirm.
tokenizer = PegasusTokenizer.from_pretrained('/content/gdrive/MyDrive/CSCI 544/checkpoint-55500')
model = PegasusForConditionalGeneration.from_pretrained('/content/gdrive/MyDrive/CSCI 544/checkpoint-55500')

# Checking accuracy on test set

In [16]:
# Load the test CSV the same way as the training CSV: all columns as nullable
# "string", then 'Definition' cast to plain str.
# NOTE: this rebinds `data`, which previously held the training frame.
data = pd.read_csv('/content/gdrive/MyDrive/CSCI 544/testset.csv', dtype="string").astype({'Definition': str})

In [20]:
# Move the model to the GPU; the call returns the module, so its repr is the cell output.
model.cuda()

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

In [21]:
# Top-1 accuracy: one generated sequence per definition, counted as a hit when
# it equals the expected word exactly.
# NOTE(review): the training words were lower-cased and stripped of quotes,
# while the test words are compared as loaded — confirm the test CSV is
# already normalised.
correct = 0
for defn, word in zip(data['Definition'], data['Word']):
  encoded = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True)
  outputs = model.generate(encoded.input_ids.cuda(), num_return_sequences = 1)
  candidates = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  correct += int(word in candidates)

print('Top 1 accuracy: ' + str(correct/len(data)))



Top 1 accuracy: 0.33


In [22]:
# Top-10 accuracy: 10 sequences are returned per definition (10 beams split
# into 5 groups with a diversity penalty); a hit is counted when the expected
# word is among them.
correct = 0
for defn, word in zip(data['Definition'], data['Word']):
  encoded = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True)
  outputs = model.generate(
      encoded.input_ids.cuda(),
      num_return_sequences = 10,
      num_beams=10,
      num_beam_groups=5,
      diversity_penalty=10.0,
      no_repeat_ngram_size = 3,
  )
  candidates = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  correct += int(word in candidates)

print('Top 10 accuracy: ' + str(correct/len(data)))

Top 10 accuracy: 0.57


In [23]:
# Top-100 accuracy: 100 sequences are returned per definition (100 beams split
# into 50 groups with a diversity penalty); a hit is counted when the expected
# word is among them.
correct = 0
for defn, word in zip(data['Definition'], data['Word']):
  encoded = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True)
  outputs = model.generate(
      encoded.input_ids.cuda(),
      num_return_sequences = 100,
      num_beams=100,
      num_beam_groups=50,
      diversity_penalty=10.0,
      no_repeat_ngram_size = 3,
  )
  candidates = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  correct += int(word in candidates)

print('Top 100 accuracy: ' + str(correct/len(data)))

Top 100 accuracy: 0.73
