# Imports and inits

In [1]:
'''Installations'''

!pip install evaluate
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Down

In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer 

In [3]:
'''For Google colab '''

# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
path = 'datasets/toyset.csv'

# Data init

In [18]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [19]:
data

Unnamed: 0.1,Unnamed: 0,Word,Definition
0,732989,Geographical,"""Of or pertaining to geography."""
1,746894,Inextricableness,"""The state of being inextricable."""
2,776875,Papuars,"""The native black race of Papua or New Guinea ..."
3,845314,dark-coated,covered with dark hair
4,692652,Cesura,"""See Caesura."""
...,...,...,...
233,635325,olive,a tree of some other species of olea or of som...
234,635339,olive,evergreen tree cultivated in the mediterranean...
235,635304,olive,an evergreen tree olea europaea cultivated sin...
236,635313,olive,the tree has been cultivated for its fruit for...


In [69]:
data['Word'].unique()

<StringArray>
[          'Geographical',       'Inextricableness',                'Papuars',
            'dark-coated',                 'Cesura',              'platelike',
               'Quinible',               'Pattered',     'Henrik_Johan_Ibsen',
      'black-tailed_deer',             'Pyrolaceae',       'basilar_membrane',
            'Latisternal',         'carpet_sweeper',      'collateral_damage',
              'Phytogeny',         'oyster_cracker',        'alkaline-loving',
   'family_Istiophoridae',      "Thirty_Years'_War",                 'Entame',
          'Philadelphian',          'genus_Crateva',              'mirroring',
                 'Alegge',             'irritating',                 'Cortef',
                  'betel',              'Widowhood',              'Supplyant',
            'making_love',       'genus_Ptychozoon',           'unmechanical',
         'bipinnate_leaf',              'Collyrium',         'hyperextension',
                'Saucing',            

In [None]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)
df_test, df_eval = train_test_split(df_test[['Definition','Word']], test_size=0.5)

# Model Init


In [11]:
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
max_input = 512
max_target = 1
batch_size = 3

# More data processing

In [13]:
def preprocess_data(data_to_process):
  #get all the definitions
  inputs = [dialogue for dialogue in data_to_process['Definition']]
  #tokenize the definitions
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  #tokenize the words
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['Word'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [31]:
from datasets import load_dataset, load_from_disk
from datasets import Dataset

In [49]:
df_dict = Dataset.from_dict(df_train)

In [50]:
df_dict_test = Dataset.from_dict(df_test)
df_dict_eval = Dataset.from_dict(df_eval)

In [51]:
df_dict = datasets.DatasetDict({"train":df_dict,"test":df_dict_test, "validation": df_dict_eval})

In [52]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 190
    })
    test: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 24
    })
    validation: Dataset({
        features: ['Definition', 'Word'],
        num_rows: 24
    })
})

In [53]:
tokenize_data = df_dict.map(preprocess_data, batched=True)

Map:   0%|          | 0/190 [00:00<?, ? examples/s]



Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [54]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [89]:
args = Seq2SeqTrainingArguments(
    '/content/gdrive/MyDrive/CSCI 544/bart', #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=100,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    fp16=True #available only with CUDA
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
)

In [90]:
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,2.294132
2,No log,2.203245
2,No log,2.038478
4,No log,2.074744
4,No log,2.054013
6,No log,2.041033
6,No log,2.089122
8,No log,2.101217
8,No log,2.390281
10,No log,2.268765


TrainOutput(global_step=4700, training_loss=0.05484153521821854, metrics={'train_runtime': 1195.9395, 'train_samples_per_second': 15.887, 'train_steps_per_second': 3.93, 'total_flos': 5731522707456000.0, 'train_loss': 0.05484153521821854, 'epoch': 98.95})

In [145]:
import re

# Checking accuracy

In [156]:
correct = 0
for idx, row in df_train.iterrows():
  defn = row['Definition']
  model_inputs = tokenizer(defn,  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])
  pred = re.search('<s>.*</s>',tokenizer.decode(raw_pred[0])).group()[3:-4]
  if pred == row['Word']:
    correct += 1

print('Train accuracy: ' + str(correct/len(df_train)))

Train accuracy: 0.9894736842105263


In [157]:
correct = 0
for idx, row in df_eval.iterrows():
  defn = row['Definition']
  model_inputs = tokenizer(defn,  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])
  pred = re.search('<s>.*</s>',tokenizer.decode(raw_pred[0])).group()[3:-4]
  if pred == row['Word']:
    correct += 1

print('Validation accuracy: ' + str(correct/len(df_eval)))

Validation accuracy: 0.5416666666666666


In [159]:
correct = 0
for idx, row in df_test.iterrows():
  defn = row['Definition']
  model_inputs = tokenizer(defn,  max_length=max_input, padding='max_length', truncation=True)
  raw_pred, _, _ = trainer.predict([model_inputs])
  pred = re.search('<s>.*</s>',tokenizer.decode(raw_pred[0])).group()[3:-4]
  if pred == row['Word']:
    correct += 1

print('Test accuracy: ' + str(correct/len(df_test)))

Test accuracy: 0.4583333333333333
