# Imports and inits

In [1]:
'''Installations'''
!pip install evaluate
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39

In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer 
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration

In [3]:
'''For Google colab '''

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
path = 'datasets/toyset.csv'

In [5]:
mypath = '/content/gdrive/MyDrive/CSCI544/Project/'
path = mypath + path
path

'/content/gdrive/MyDrive/CSCI544/Project/datasets/toyset.csv'

# Data init

In [6]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,Word,Definition
0,732989,Geographical,"""Of or pertaining to geography."""
1,746894,Inextricableness,"""The state of being inextricable."""
2,776875,Papuars,"""The native black race of Papua or New Guinea ..."
3,845314,dark-coated,covered with dark hair
4,692652,Cesura,"""See Caesura."""


In [8]:
data['Word'].unique()

<StringArray>
[          'Geographical',       'Inextricableness',                'Papuars',
            'dark-coated',                 'Cesura',              'platelike',
               'Quinible',               'Pattered',     'Henrik_Johan_Ibsen',
      'black-tailed_deer',             'Pyrolaceae',       'basilar_membrane',
            'Latisternal',         'carpet_sweeper',      'collateral_damage',
              'Phytogeny',         'oyster_cracker',        'alkaline-loving',
   'family_Istiophoridae',      "Thirty_Years'_War",                 'Entame',
          'Philadelphian',          'genus_Crateva',              'mirroring',
                 'Alegge',             'irritating',                 'Cortef',
                  'betel',              'Widowhood',              'Supplyant',
            'making_love',       'genus_Ptychozoon',           'unmechanical',
         'bipinnate_leaf',              'Collyrium',         'hyperextension',
                'Saucing',            

In [9]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)
df_test, df_eval = train_test_split(df_test[['Definition','Word']], test_size=0.5)

# Model Init


In [10]:
tokenizer = AutoTokenizer.from_pretrained("HUPD/hupd-t5-small")
# model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("HUPD/hupd-t5-small")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/243M [00:00<?, ?B/s]

In [11]:
max_input = 512
max_target = 1
batch_size = 3

# More data processing

In [12]:
def preprocess_data(data_to_process):
  #get all the definitions
  inputs = [dialogue for dialogue in data_to_process['Definition']]
  #tokenize the definitions
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  #tokenize the words
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['Word'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [13]:
import datasets
from datasets import load_dataset, load_from_disk
from datasets import Dataset

In [14]:
df_dict = Dataset.from_dict(df_train)

In [15]:
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [16]:
df_dict = datasets.DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [17]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 190
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 24
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 24
    })
})

In [18]:
tokenize_data = df_dict.map(preprocess_data, batched=True)

Map:   0%|          | 0/190 [00:00<?, ? examples/s]



Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [19]:
# collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
# path = '/Users/marta/Documents/*USC/CSCI 544/Project/ReverseDictionary-main'

In [21]:
args = Seq2SeqTrainingArguments(
    mypath, #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=100,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    # bf16=True,
    fp16=False #available only with CUDA
    )


trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
)

In [22]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,1.895011
2,No log,0.211505
2,No log,0.003391
4,No log,0.000104
4,No log,2.7e-05
6,No log,1.2e-05
6,No log,7e-06
8,No log,4e-06
8,No log,2e-06
10,No log,2e-06


TrainOutput(global_step=4700, training_loss=0.05593195368951939, metrics={'train_runtime': 846.0506, 'train_samples_per_second': 22.457, 'train_steps_per_second': 5.555, 'total_flos': 2544425867673600.0, 'train_loss': 0.05593195368951939, 'epoch': 98.95})

In [23]:
import re

# Checking accuracy

In [24]:
  model_inputs = tokenizer('make repairs renovations revisions or adjustments to',  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])

In [25]:
tokenizer.decode(raw_pred[0])

'<pad></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [26]:
correct = 0
for idx, row in df_train.iterrows():
  defn = row['Definition']
  model_inputs = tokenizer(defn,  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])
  pred = re.search('<s>.*</s>',tokenizer.decode(raw_pred[0])).group()[3:-4]
  if pred == row['Word']:
    correct += 1

print('Train accuracy: ' + str(correct/len(df_train)))

AttributeError: ignored

In [None]:
correct = 0
for idx, row in df_eval.iterrows():
  defn = row['Definition']
  model_inputs = tokenizer(defn,  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])
  pred = re.search('<s>.*</s>',tokenizer.decode(raw_pred[0])).group()[3:-4]
  if pred == row['Word']:
    correct += 1

print('Validation accuracy: ' + str(correct/len(df_eval)))

Validation accuracy: 0.5416666666666666


In [None]:
correct = 0
for idx, row in df_test.iterrows():
  defn = row['Definition']
  model_inputs = tokenizer(defn,  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])
  pred = re.search('<s>.*</s>',tokenizer.decode(raw_pred[0])).group()[3:-4]
  if pred == row['Word']:
    correct += 1

print('Test accuracy: ' + str(correct/len(df_test)))

Test accuracy: 0.4583333333333333
