# Imports and inits

In [1]:
'''Installations'''

!pip install evaluate
!pip install transformers
!pip install sentencepiece
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer 

In [3]:
'''For Google colab '''

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
path = '/content/gdrive/MyDrive/CSCI 544/toyset1000.csv'

# Data init

In [5]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [6]:
data

Unnamed: 0,Word,Definition
0,Forelay,"""To lay down beforehand."""
1,Forelay,"""To waylay. See Forlay."""
2,levelheaded,characteristically self composed and sensible
3,levelheaded,alternative spelling of level headed
4,levelheaded,exercising or showing good judgment or common ...
...,...,...
4039,megalomania,an obsession with grandiose or extravagant thi...
4040,megalomania,a form of mental alienation in which the patie...
4041,megalomania,a form of insane delusion the subjects of whic...
4042,megalomania,a psychological state characterized by delusio...


In [7]:
data['Word'].unique()

<StringArray>
[     'Forelay',  'levelheaded',        'Culpe',      'Kuenlun',
         'Envy',     'Solecize',        'Smoky',      'Surtout',
      'psyllid',      'cow_oak',
 ...
      'Soundly',       'Mellon',    'Triopidae', 'bell_the_cat',
  'Megalopolis',  'unreceptive',     'Moderato',    'Evitation',
  'Hennotannic',  'megalomania']
Length: 1000, dtype: string

In [8]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)
df_test, df_eval = train_test_split(df_test[['Definition','Word']], test_size=0.5)

# Model Init


In [29]:
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

In [30]:
max_input = 128
max_target = 1
batch_size = 8

# More data processing

In [31]:
prefix = "summarize: "
# prefix=''
def preprocess_data(data_to_process):
  #get all the definitions
  inputs = [prefix + dialogue for dialogue in data_to_process['Definition']]
  #tokenize the definitions
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  #tokenize the words
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['Word'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  targets["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in targets["input_ids"]
        ]
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [32]:
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict

In [33]:
df_dict = Dataset.from_dict(df_train)

In [34]:
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [35]:
df_dict = DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [36]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 3235
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 404
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 405
    })
})

In [37]:
tokenize_data = df_dict.map(preprocess_data, batched=True)

Map:   0%|          | 0/3235 [00:00<?, ? examples/s]



Map:   0%|          | 0/404 [00:00<?, ? examples/s]

Map:   0%|          | 0/405 [00:00<?, ? examples/s]

In [38]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id= -100)

In [40]:
args = Seq2SeqTrainingArguments(
    '/content/gdrive/MyDrive/CSCI 544/t5', #save directory
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=False #available only with CUDA
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [41]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.0,0.0,0.0,0.0,0.0,1.0


KeyboardInterrupt: ignored

In [None]:
import re

# Checking accuracy

In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained('/content/gdrive/MyDrive/CSCI 544/bart/checkpoint-1500')
# tokenizer = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/CSCI 544/bart/checkpoint-1500')

In [86]:
input_ids = tokenizer(prefix + df_train.iloc[0]['Definition'], return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)



In [87]:
[tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]

['',
 '',
 '',
 '',
 '',
 'Resurrection of the slaying',
 'Resurrection of the king',
 'Ishmuel',
 'Resurrection of the Jews',
 'Ishmuzhiq']

In [None]:
correct = 0
for idx, row in df_train.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Train accuracy: ' + str(correct/len(df_train)))



Train accuracy: 0.9538653366583542


In [None]:
correct = 0
for idx, row in df_eval.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Validation accuracy: ' + str(correct/len(df_eval)))

Validation accuracy: 0.6831683168316832


In [None]:
correct = 0
for idx, row in df_test.iterrows():
  defn = row['Definition']
  input_ids = tokenizer(defn, return_tensors="pt", max_length=max_input, padding='max_length', truncation=True).input_ids
  outputs = model.generate(input_ids.cuda(), num_return_sequences = 10, num_beams=10, num_beam_groups=5, diversity_penalty=10.0, no_repeat_ngram_size = 3)
  if row['Word'] in [tokenizer.decode(outputs[i], skip_special_tokens=True) for i in range(len(outputs))]:
    correct += 1

print('Test accuracy: ' + str(correct/len(df_test)))

Test accuracy: 0.73
