<a href="https://colab.research.google.com/github/kevinscaria/InstructABSA/blob/main/ATE_Training_%26_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [1]:
import os
import torch  
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from InstructABSA.data_prep import DatasetLoader, xml_to_dataframe
from InstructABSA.utils import T5Generator, T5Classifier
from instructions import InstructionsHandler

try:
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    IN_COLAB = True
    if IN_COLAB:
        !pip install transformers
        !pip install datasets
        !pip install evaluate
        !pip install sentencepiece

        root_path = 'Enter drive path'
except:
    IN_COLAB = False
    root_path = 'D:\GitHub\Thesis\InstructABSA'

# `torch.has_mps` is deprecated and only reports whether the build includes MPS;
# `is_available()` also checks that an MPS device can actually be used.
# The flag is passed on as `use_mps_device` in the training arguments.
use_mps = torch.backends.mps.is_available()

# Work from the repository root so relative paths ('./Models', 'data/') resolve against it.
os.chdir(root_path)

# Experiment identifiers; together they determine where the model is stored.
task_name = 'joint_task'
experiment_name = 'lapt2014_iabsa1'
model_checkpoint = 'allenai/tk-instruct-base-def-pos'
print('Experiment Name: ', experiment_name)

# Output folder: ./Models/<task>/<checkpoint without '/'>-<experiment>
checkpoint_slug = model_checkpoint.replace('/', '')
model_out_path = os.path.join('./Models', task_name, f"{checkpoint_slug}-{experiment_name}")
print('Model output path: ', model_out_path)


Experiment Name:  lapt2014_iabsa1
Model output path:  ./Models\joint_task\allenaitk-instruct-base-def-pos-lapt2014_iabsa1


## Training

In [2]:
# Load the data
# Raw SemEval XML files, used only when the parquet cache below cannot be read.
id_train_file_path = r"D:\GitHub\Thesis\data\raw\ABSA16_Restaurants_Train_SB1_v2.xml"
id_test_file_path = r"D:\GitHub\Thesis\data\raw\EN_REST_SB1_TEST.xml.gold"

# Directory (relative to the working directory) holding the cached parquet files.
in_dir = "data/"

try:
    # Fast path: reuse the previously converted frames.
    trial_df = pd.read_parquet(os.path.join(in_dir, 'trial_data.parquet'))
    id_tr_df = pd.read_parquet(os.path.join(in_dir, 'train_data.parquet'))
    id_te_df = pd.read_parquet(os.path.join(in_dir, 'test_data.parquet'))
except Exception as err:
    # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit still
    # propagate; the reason for the fallback is reported instead of hidden.
    print(f'Parquet cache not usable ({err!r}); converting the XML sources instead.')
    # NOTE(review): unlike the train/test calls, the trial call passes no
    # output_file/in_dir -- confirm trial_data.parquet is ever written, otherwise
    # the cached branch above can never succeed.
    trial_df = xml_to_dataframe('https://alt.qcri.org/semeval2016/task5/data/uploads/trial-data/english-trial/restaurants_trial_english_sl.xml')
    id_tr_df = xml_to_dataframe(id_train_file_path, output_file = 'train_data', in_dir = in_dir)
    id_te_df = xml_to_dataframe(id_test_file_path, output_file = 'test_data', in_dir = in_dir)

# Build the instruction prompts that are wrapped around every input text.
instruct_handler = InstructionsHandler()

# instruction_set1 -> InstructABSA-1, instruction_set2 -> InstructABSA-2
instruct_handler.load_instruction_set1()

# bos_instruct1 targets lapt14 and bos_instruct2 targets rest14.
# For other datasets, modify the instructions.py file.
loader = DatasetLoader(id_tr_df, id_te_df)
for split_attr in ('train_df_id', 'test_df_id'):
    split_df = getattr(loader, split_attr)
    if split_df is None:
        continue
    # Replace the split on the loader with its joint-task formatted version.
    setattr(
        loader,
        split_attr,
        loader.create_data_in_joint_task_format(
            split_df,
            'term',
            'polarity',
            'raw_text',
            'aspectTerms',
            instruct_handler.joint['bos_instruct1'],
            instruct_handler.joint['eos_instruct'],
        ),
    )

In [3]:
# Wrap the base checkpoint in the project's T5 helper.
t5_exp = T5Generator(model_checkpoint)

# Tokenize the in-domain (id_*) and out-of-domain (ood_*) datasets with the model's tokenizer function.
(id_ds, id_tokenized_ds, ood_ds, ood_tokenized_ds) = loader.set_data_for_training_semeval(
    t5_exp.tokenize_function_inputs
)

# Keyword arguments forwarded to `t5_exp.train(...)` in the training cell.
training_args = dict(
    output_dir=model_out_path,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    lr_scheduler_type='cosine',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_strategy='no',
    load_best_model_at_end=False,
    push_to_hub=False,
    eval_accumulation_steps=1,
    predict_with_generate=True,
    use_mps_device=use_mps,
)

                                                               

In [4]:
# Train model
# Reuse an already fine-tuned checkpoint when one can be loaded from
# `model_out_path`; otherwise fine-tune the base model held in `t5_exp`.
try:
    # Model inference - Loading from Checkpoint
    t5_exp = T5Generator(model_out_path)
except Exception:
    # `except Exception` rather than a bare `except`: interrupting the load
    # (Ctrl-C) must not silently kick off a full training run.
    model_trainer = t5_exp.train(id_tokenized_ds, **training_args)
    # Reload from disk so later cells use the saved weights.
    # NOTE(review): assumes `train()` writes the final model to `output_dir`
    # even though save_strategy is 'no' -- confirm in T5Generator.train.
    t5_exp = T5Generator(model_out_path)

Trainer device: cpu

Model training started ....


  0%|          | 0/176 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 25%|██▌       | 44/176 [16:58<39:12, 17.82s/it]

{'eval_loss': 0.7152167558670044, 'eval_runtime': 90.7003, 'eval_samples_per_second': 0.992, 'eval_steps_per_second': 0.066, 'epoch': 1.0}


                                                  
 50%|█████     | 88/176 [33:32<25:36, 17.46s/it]

{'eval_loss': 0.579865574836731, 'eval_runtime': 90.5987, 'eval_samples_per_second': 0.993, 'eval_steps_per_second': 0.066, 'epoch': 2.0}


                                                  
 75%|███████▌  | 132/176 [49:26<12:07, 16.54s/it]

{'eval_loss': 0.5551343560218811, 'eval_runtime': 91.2518, 'eval_samples_per_second': 0.986, 'eval_steps_per_second': 0.066, 'epoch': 3.0}


                                                   
100%|██████████| 176/176 [1:05:43<00:00, 20.77s/it]

{'eval_loss': 0.5580582022666931, 'eval_runtime': 91.6035, 'eval_samples_per_second': 0.982, 'eval_steps_per_second': 0.065, 'epoch': 4.0}
{'train_runtime': 3943.7576, 'train_samples_per_second': 0.355, 'train_steps_per_second': 0.045, 'train_loss': 0.8249359997836027, 'epoch': 4.0}


100%|██████████| 176/176 [1:05:43<00:00, 22.41s/it]


## Inference

In [12]:
# Instruction prompts used to wrap each input text for inference.
instruct_handler = InstructionsHandler()

# instruction_set1 -> InstructABSA-1, instruction_set2 -> InstructABSA-2
instruct_handler.load_instruction_set1()

# bos_instruct1 targets lapt14 and bos_instruct2 targets rest14.
# For other datasets, modify the instructions.py file.
loader = DatasetLoader(id_tr_df, id_te_df)


def _to_joint_format(frame):
    """Return `frame` converted to the joint-task format with the set-1 'bos_instruct1' prompt."""
    return loader.create_data_in_joint_task_format(
        frame,
        'term',
        'polarity',
        'raw_text',
        'aspectTerms',
        instruct_handler.joint['bos_instruct1'],
        instruct_handler.joint['eos_instruct'],
    )


# Only convert the splits the loader actually holds.
if loader.train_df_id is not None:
    loader.train_df_id = _to_joint_format(loader.train_df_id)
if loader.test_df_id is not None:
    loader.test_df_id = _to_joint_format(loader.test_df_id)

In [13]:
# Model inference - Loading from Checkpoint
t5_exp = T5Generator(model_out_path)

# Tokenize Datasets
# (`ood_tokenized_ds` was previously misspelled `ood_tokenzed_ds`, which left the
# correctly named variable holding the stale value from the training cell.)
id_ds, id_tokenized_ds, ood_ds, ood_tokenized_ds = loader.set_data_for_training_semeval(t5_exp.tokenize_function_inputs)

# Get prediction labels - Training set
id_tr_pred_labels = t5_exp.get_labels(tokenized_dataset = id_tokenized_ds, sample_set = 'train', batch_size = 16)
# Gold labels, whitespace-stripped before they are compared with the predictions.
id_tr_labels = [i.strip() for i in id_ds['train']['labels']]

# Get prediction labels - Testing set
id_te_pred_labels = t5_exp.get_labels(tokenized_dataset = id_tokenized_ds, sample_set = 'test', batch_size = 16)
id_te_labels = [i.strip() for i in id_ds['test']['labels']]

                                                               

Model loaded to:  cpu


100%|██████████| 22/22 [16:35<00:00, 45.26s/it]


Model loaded to:  cpu


100%|██████████| 6/6 [04:13<00:00, 42.31s/it]


In [15]:
# Report precision / recall / F1 for each split: train first, then test.
for split_name, gold_labels, pred_labels in (
    ('Train', id_tr_labels, id_tr_pred_labels),
    ('Test', id_te_labels, id_te_pred_labels),
):
    p, r, f1 = t5_exp.get_metrics(gold_labels, pred_labels)
    print(f'{split_name} Precision: ', p)
    print(f'{split_name} Recall: ', r)
    print(f'{split_name} F1: ', f1)

Train Precision:  0.5877408056042032
Train Recall:  0.6682596575069694
Train F1:  0.6254193067461796
Test Precision:  0.5196304849884527
Test Recall:  0.5226480836236934
Test F1:  0.5211349160393747
