Check the version of Python

In [1]:
!python -V

Python 3.7.13


## Data Preparation

Prepare the data for training and testing. We can read files from local folder if we run the model on our computer.

In [2]:
# Setup the data path. You may need to change this depending on where you keep the files on your local
directory = r'C:\Users\Asus\Downloads\Subtask3_ABSA_Aspect_Category_Sentiment_Classification\data'
laptop_train = directory+'\ABSA16_Laptops_Train_SB1_v2.csv'
restaurant_train = directory+'\ABSA16_Restaurants_Train_SB1_v2.csv'
laptop_test = directory+'\EN_LAPT_SB1_TEST.csv'
restaurant_test = directory+'\EN_REST_SB1_TEST.csv'

Or we can run file on Colab

In [3]:
# codes to mount your google drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_NLP


In [4]:
# paths to files in Google drive
laptop_train = './Group Project/ABSA16_Laptops_Train_SB1_v2.csv'
restaurant_train = './Group Project/ABSA16_Restaurants_Train_SB1_v2.csv'
laptop_test = './Group Project/EN_LAPT_SB1_TEST.csv'
restaurant_test = './Group Project/EN_REST_SB1_TEST.csv'

Functions to process raw data files

In [5]:
#@title Raw data processing
# This function turns the raw data in csv format into data frame .
# Originally, the text reviews are in different columns. The function will consolidate the columns containing text into one.
# The same are performed for category columns and polarity columns
import pandas as pd
def data_processor(input_path, count):
    temp = pd.read_csv(input_path)
    df = pd.DataFrame()
    for i in range(0,count):
        try:
            #get the column names
            text = "sentences/sentence/{}/text".format(i)
            category = "sentences/sentence/{}/Opinions/Opinion/0/_category".format(i)
            polarity = 'sentences/sentence/{}/Opinions/Opinion/0/_polarity'.format(i)
            a = temp.loc[:,[text, category, polarity]]
            #rename the columns
            a.rename(columns = {text:'input_text', category:'category', polarity:'target_text'}, inplace = True)
            #append each combination of (text, category, polarity) to df
            df = pd.concat([df, a], ignore_index=True)
        except:
            pass
    #Select only non-nan rows
    df = df[df['target_text'].notna()]
    df = df[df['category'].notna()]
    df = df[df['input_text'].notna()]
    df = df.reset_index(drop=True)
    #Reformat the columns
    df = df.astype({"input_text": str,"category": str, "target_text": str})
    #Extract term from category
    df[['term', 'temp']] = df['category'].str.split('#', n=1, expand=True)
    df.drop(columns=['category', 'temp'], inplace=True)
    #Rearrange the columns
    df = df[['input_text', 'term', 'target_text']]
    #Remove spaces at two ends of the text
    df["input_text"] = df["input_text"].str.strip()
    df["term"] = df["term"].str.strip()
    df["target_text"] = df["target_text"].str.strip()
    return df

In [6]:
# This function splits data into train and valid test with a ratio of 80:20
def data_split(df,train_size=0.8,random_state=2022):
    train_df = df.sample(frac=train_size,random_state=random_state)
    valid_df = df.drop(train_df.index)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)
    return train_df, valid_df

We'll create 3 datasets for train, valid and test. The valid dataset will be used while training. 

In [9]:
# Process the data
df_laptop = data_processor(laptop_train, 100) #100 is an arbitrary number. We just need a number that is big enough to cover all the columns containing text in the raw data.
df_rest = data_processor(restaurant_train, 100)
test_df_laptop = data_processor(laptop_test, 100)
test_df_rest = data_processor(restaurant_test, 100)

# Create train and validation dataset
train_df_laptop, valid_df_laptop = data_split(df_laptop)
train_df_rest, valid_df_rest = data_split(df_rest)

# Concatenating both laptop and restaurant datasets for training at the same time
train_df = pd.concat([train_df_laptop, train_df_rest], ignore_index=True)
valid_df = pd.concat([valid_df_laptop, valid_df_rest], ignore_index=True)
# Concatenating both laptop and restaurant datasets for test time
test_df = pd.concat([test_df_laptop, test_df_rest], ignore_index=True)

## Setup Seq2Seq model

In [None]:
# Install transformer and simpletransformers
!pip install transformers
!pip install simpletransformers

In [11]:
# Import necessary libraries
import os
import math
import random
import logging
logger = logging.getLogger(__name__)

from tqdm.auto import tqdm, trange #to create progress bar

import torch

from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

from simpletransformers.config.model_args import Seq2SeqArgs
from simpletransformers.seq2seq.seq2seq_utils import SimpleSummarizationDataset

import pandas as pd
import numpy as np

from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig, AdamW, get_linear_schedule_with_warmup



In [12]:
# In this project, we will use BART model. 
# Let's see what the model arguments look like
'''
from simpletransformers.config.model_args import Seq2SeqArgs
encoder_decoder_name="facebook/bart-base"
arg=Seq2SeqArgs()
y = arg.load(encoder_decoder_name)

print(arg)
'''
# Results:
'''
Seq2SeqArgs(adafactor_beta1=None, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_epsilon=1e-08, best_model_dir='outputs/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, eval_batch_size=8, evaluate_during_training=False, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=False, evaluate_each_epoch=True, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_args={}, manual_seed=None, max_grad_norm=1.0, max_seq_length=128, model_name=None, model_type=None, multiprocessing_chunksize=-1, n_gpu=1, no_cache=False, no_save=False, not_saved_args=[], num_train_epochs=1, optimizer='AdamW', output_dir='outputs/', overwrite_output_dir=False, polynomial_decay_schedule_lr_end=1e-07, polynomial_decay_schedule_power=1.0, process_count=6, quantized_model=False, reprocess_input_data=True, save_best_model=True, save_eval_checkpoints=True, save_model_every_epoch=True, save_optimizer_and_scheduler=True, save_steps=2000, scheduler='linear_schedule_with_warmup', silent=False, skip_special_tokens=True, tensorboard_dir=None, thread_count=None, tokenizer_name=None, tokenizer_type=None, train_batch_size=8, train_custom_parameters_only=False, use_cached_eval_features=False, use_early_stopping=False, use_hf_datasets=False, use_multiprocessing=True, use_multiprocessing_for_evaluation=True, wandb_kwargs={}, wandb_project=None, warmup_ratio=0.06, warmup_steps=0, weight_decay=0.0, model_class='Seq2SeqModel', base_marian_model_name=None, dataset_class=None, dataset_cache_dir=None, do_sample=False, early_stopping=True, evaluate_generated_text=False, faiss_d=768, faiss_m=128, include_title_in_knowledge_dataset=True, length_penalty=2.0, max_length=20, max_steps=-1, num_beams=1, num_return_sequences=1, rag_embed_batch_size=16, repetition_penalty=1.0, save_knowledge_dataset=True, save_knowledge_dataset_with_checkpoints=False, split_text_character=' ', split_text_n=100, src_lang='en_XX', tgt_lang='ro_RO', top_k=None, top_p=None, use_multiprocessed_decoding=False)
'''

"\nSeq2SeqArgs(adafactor_beta1=None, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_epsilon=1e-08, best_model_dir='outputs/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, eval_batch_size=8, evaluate_during_training=False, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=False, evaluate_each_epoch=True, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_args={}, manual_seed=None, max_grad_norm=1.0, max_

In [13]:
#@title Sequence to Sequence Class
class Seq2SeqModel:
    def __init__(
        self,
        encoder_decoder_type="bart",
        encoder_decoder_name="facebook/bart-base",
        args=None,
        use_cuda=True,
        cuda_device=0
    ):
        self.args = self._load_model_args(encoder_decoder_name) # The _load_model_args function is defined below after the init function
        
        # Update the Seq2Seq arguments
        if isinstance(args, dict):
            self.args.update_from_dict(args) # The update_from_dict is a function of class ModelArgs
        elif isinstance(args, Seq2SeqArgs):
            self.args = args
        """
        The function "update_from_dict" is as below:
        def update_from_dict(self, new_values):
            if isinstance(new_values, dict):
                for key, value in new_values.items():
                    setattr(self, key, value)
            else:
                raise (TypeError(f"{new_values} is not a Python dict."))
        """
        # Setup seed values
        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)
        
        # Setup cuda
        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`."
                )
        else:
            self.device = "cpu"
        
        if not use_cuda:
            self.args.fp16 = False
        
        self.results = {}
        
        config_class, model_class, tokenizer_class = (BartConfig, BartForConditionalGeneration, BartTokenizer)
        
        self.model = model_class.from_pretrained(encoder_decoder_name)
        self.encoder_tokenizer = tokenizer_class.from_pretrained(encoder_decoder_name)
        self.decoder_tokenizer = self.encoder_tokenizer
        self.config = self.model.config
        self.args.model_name = encoder_decoder_name
        self.args.model_type = encoder_decoder_type

    def _load_model_args(self, input_dir):
        args = Seq2SeqArgs()
        args.load(input_dir)
        return args

Utility functions to be used in the Seq2SeqModel

In [14]:
#@title predict_val and predict_test functions
# This function will assess the model on valid_df dataset
def predict_val(model, device):
    candidate_list = ["positive", "neutral", "negative"]

    model.eval()
    model.config.use_cache = False
    # using BART for tokenize
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    # read validation data
    count = 0
    for i in range(len(valid_df)):
        score_list = []
        # create input and tokenize the inputs
        x, term, golden_polarity = valid_df.loc[i,'input_text'],valid_df.loc[i,'term'],valid_df.loc[i,'target_text']
        input_ids = tokenizer([x] * 3, return_tensors='pt')['input_ids']
        # create and tokenize the target_list
        target_list = ["The sentiment polarity of " + term.lower() + " is " + candi.lower() + " ." for candi in candidate_list]
        output_ids = tokenizer(target_list, return_tensors='pt', padding=True, truncation=True)['input_ids']
        # evaluate every line in output_id
        with torch.no_grad():
            output = model(input_ids=input_ids.to(device), decoder_input_ids=output_ids.to(device))[0] 
            # each output will have a shape of (3,11,50265) because 
            # the each target "The sentiment polarity of ... is ... ." has 8 words, generating 9 tokens since one of which generates 2 tokens, 
            # and others 2 tokens for [CLS] at the beginning and [SEP] at the end
            logits = output.softmax(dim=-1).to('cpu').numpy()
        for i in range(3):
            score = 1
            for j in range(logits[i].shape[0] - 2):
                score *= logits[i][j][output_ids[i][j + 1]]
                # note that output/logit of a word is a prediction for the next word
                # thus we will get the value corresponding to [j+1]
            score_list.append(score)

        predict = candidate_list[np.argmax(score_list)]
        if predict == golden_polarity:
            count += 1
    return count/len(valid_df)

# This function will return the model accuracy on the test dataset
def predict_test(model, device):
    predicted_vs_actual_y_list = []
    candidate_list = ["positive", "neutral", "negative"]

    model.eval()
    model.config.use_cache = False
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    count = 0
    for i in range(len(test_df)):
        score_list = []
        x, term, golden_polarity = test_df.loc[i,'input_text'],test_df.loc[i,'term'],test_df.loc[i,'target_text']
        input_ids = tokenizer([x] * 3, return_tensors='pt')['input_ids']
        target_list = ["The sentiment polarity of " + term.lower() + " is " + candi.lower() + " ." for candi in candidate_list]

        output_ids = tokenizer(target_list, return_tensors='pt', padding=True, truncation=True)['input_ids']
        with torch.no_grad():
            output = model(input_ids=input_ids.to(device), decoder_input_ids=output_ids.to(device))[0]
            logits = output.softmax(dim=-1).to('cpu').numpy()
        for i in range(3):
            score = 1
            for j in range(logits[i].shape[0] - 2):
                score *= logits[i][j][output_ids[i][j + 1]]
            score_list.append(score)

        predict = candidate_list[np.argmax(score_list)]
        if predict == golden_polarity:
            count += 1

        predicted_vs_actual_y_list.append((x, term, golden_polarity, predict))

    return (predicted_vs_actual_y_list, count/len(test_df))


In [15]:
#@title Some utility functions
def load_and_cache_examples(seq2seq_model, data, evaluate=False, no_cache=False, verbose=True, silent=False):
    """
    Creates a T5Dataset from data.

    Utility function for train() and eval() methods. Not intended to be used directly.
    """
    encoder_tokenizer = seq2seq_model.encoder_tokenizer
    decoder_tokenizer = seq2seq_model.decoder_tokenizer
    args = seq2seq_model.args

    if not no_cache:
        no_cache = args.no_cache

    if not no_cache:
        os.makedirs(seq2seq_model.args.cache_dir, exist_ok=True)

    mode = "dev" if evaluate else "train"

    return SimpleSummarizationDataset(encoder_tokenizer, seq2seq_model.args, data, mode)

def _save_model_args(seq2seq_model, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    seq2seq_model.args.save(output_dir)

def _save_model(seq2seq_model, output_dir=None, optimizer=None, scheduler=None, model=None, results=None):
    if not output_dir:
        output_dir = seq2seq_model.args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    logger.info(f"Saving model into {output_dir}")

    if model and not seq2seq_model.args.no_save:
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        _save_model_args(seq2seq_model, output_dir)

            
        os.makedirs(os.path.join(output_dir), exist_ok=True)
        model_to_save.save_pretrained(output_dir)
        seq2seq_model.config.save_pretrained(output_dir)

        seq2seq_model.encoder_tokenizer.save_pretrained(output_dir)
            
        torch.save(seq2seq_model.args, os.path.join(output_dir, "training_args.bin"))
        if optimizer and scheduler and seq2seq_model.args.save_optimizer_and_scheduler:
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

    if results:
        output_eval_file = os.path.join(output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

def _move_model_to_device(seq2seq_model):
    seq2seq_model.model.to(seq2seq_model.device)

def _get_inputs_dict(seq2seq_model, batch):
    device = seq2seq_model.device
        
    pad_token_id = seq2seq_model.encoder_tokenizer.pad_token_id
    source_ids, source_mask, y = batch["source_ids"], batch["source_mask"], batch["target_ids"]
    y_ids = y[:, :-1].contiguous()
    labels = y[:, 1:].clone()
    labels[y[:, 1:] == pad_token_id] = -100

    inputs = {
                "input_ids": source_ids.to(device),
                "attention_mask": source_mask.to(device),
                "decoder_input_ids": y_ids.to(device),
                "labels": labels.to(device),
            }

    return inputs

def get_named_parameters(self):
    return [n for n, p in self.model.named_parameters()]

In [16]:
#@title Evaluate Function
# Evaluates the model on eval_dataset.
# Utility function to be used by the eval_model() method. Not intended to be used directly.
def evaluate(seq2seq_model, eval_dataset, output_dir, verbose=True, silent=False):

    model = seq2seq_model.model
    args = seq2seq_model.args
    eval_output_dir = output_dir

    results = {}

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, disable=args.silent or silent, desc="Running Evaluation"):
        # batch = tuple(t.to(device) for t in batch)

        inputs = _get_inputs_dict(seq2seq_model,batch)
        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs[0]
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    results["eval_loss"] = eval_loss

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        for key in sorted(results.keys()):
            writer.write("{} = {}\n".format(key, str(results[key])))

    return results

In [17]:
#@title Predict Function
def predict(seq2seq_model, to_predict):
    """
    Performs predictions on a list of text.

    Args:
        to_predict: A python list of text (str) to be sent to the model for prediction. Note that the prefix should be prepended to the text.

    Returns:
        preds: A python list of the generated sequences.
    """  

    _move_model_to_device(seq2seq_model)

    all_outputs = []
    # Batching
    for batch in [
        to_predict[i : i + seq2seq_model.args.eval_batch_size] for i in range(0, len(to_predict), seq2seq_model.args.eval_batch_size)
    ]:
        
        input_ids = seq2seq_model.encoder_tokenizer.batch_encode_plus(
                batch, max_length=seq2seq_model.args.max_seq_length, pad_to_max_length=True, return_tensors="pt",
            )["input_ids"]
        input_ids = input_ids.to(seq2seq_model.device)

        
        outputs = seq2seq_model.model.generate(
                input_ids=input_ids,
                num_beams=seq2seq_model.args.num_beams,
                max_length=seq2seq_model.args.max_length,
                length_penalty=seq2seq_model.args.length_penalty,
                early_stopping=seq2seq_model.args.early_stopping,
                repetition_penalty=seq2seq_model.args.repetition_penalty,
                do_sample=seq2seq_model.args.do_sample,
                top_k=seq2seq_model.args.top_k,
                top_p=seq2seq_model.args.top_p,
                num_return_sequences=seq2seq_model.args.num_return_sequences,
            )

        all_outputs.extend(outputs.cpu().numpy())
    
    outputs = [
            seq2seq_model.decoder_tokenizer.decode(output_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for output_id in all_outputs
        ]

    return outputs

In [18]:
def eval_model(seq2seq_model, eval_data, output_dir=None, verbose=True, silent=False):
    """
    Evaluates the model on eval_data. Saves results to output_dir.

    Args:
        eval_data: Pandas DataFrame containing the 2 columns - `input_text`, `target_text`.
                    - `input_text`: The input text sequence.
                    - `target_text`: The target text sequence.
        output_dir: The directory where model files will be saved. If not given, self.args.output_dir will be used.
        verbose: If verbose, results will be printed to the console on completion of evaluation.
        silent: If silent, tqdm progress bars will be hidden.
        
    Returns:
        results: Dictionary containing evaluation results.
    """  
    model = seq2seq_model.model
    args = seq2seq_model.args

    if not output_dir:
        output_dir = args.output_dir

    _move_model_to_device(seq2seq_model)

    eval_dataset = load_and_cache_examples(seq2seq_model,eval_data, evaluate=True, verbose=verbose, silent=silent)
    os.makedirs(output_dir, exist_ok=True)

    result = evaluate(seq2seq_model,eval_dataset, output_dir, verbose=verbose, silent=silent)
    seq2seq_model.results.update(result)

    if seq2seq_model.args.evaluate_generated_text:
        to_predict = eval_data["input_text"].tolist()
        preds = predict(seq2seq_model, to_predict)

    if verbose:
        logger.info(seq2seq_model.results)

    return seq2seq_model.results

In [19]:
def train(seq2seq_model, train_dataset, output_dir, best_accuracy, show_running_loss=True, eval_data=None, verbose=True
    ):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """

    model = seq2seq_model.model
    args = seq2seq_model.args

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        num_workers=seq2seq_model.args.dataloader_num_workers,
        )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = []
    custom_parameter_names = set()

    if not seq2seq_model.args.train_custom_parameters_only:
        optimizer_grouped_parameters.extend(
                [
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": args.weight_decay,
                    },
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names and any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": 0.0,
                    },
                ]
            )

    warmup_steps = math.ceil(t_total * args.warmup_ratio)
    args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
        )

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info(" Training started")
    print("Training started")
        
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0)
    epoch_number = 0
    best_eval_metric = None
    early_stopping_counter = 0
    steps_trained_in_current_epoch = 0
    epochs_trained = 0

    if args.fp16:
        from torch.cuda import amp

        scaler = amp.GradScaler()

    model.train()
    for current_epoch in train_iterator:
        if epochs_trained > 0:
            epochs_trained -= 1
            continue
        train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
        batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
        for step, batch in enumerate(batch_iterator):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            # batch = tuple(t.to(device) for t in batch)

            inputs = _get_inputs_dict(seq2seq_model,batch)
            if args.fp16:
                with amp.autocast():
                    outputs = model(**inputs)
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = outputs[0]
            else:
                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            current_loss = loss.item()

            if show_running_loss:
                batch_iterator.set_description(
                        f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}"
                    )

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                    scaler.scale(loss).backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                if args.fp16:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    _save_model(seq2seq_model, output_dir_current, optimizer, scheduler, model=model)

                if args.evaluate_during_training and (
                    args.evaluate_during_training_steps > 0
                    and global_step % args.evaluate_during_training_steps == 0
                ):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results = eval_model(
                            seq2seq_model,
                            eval_data,
                            verbose=verbose and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent
                        )

                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    if args.save_eval_checkpoints:
                        _save_model(seq2seq_model,output_dir_current, optimizer, scheduler, model=model, results=results)

                    if not best_eval_metric:
                        best_eval_metric = results[args.early_stopping_metric]
                        if args.save_best_model:
                                _save_model(seq2seq_model, args.best_model_dir, optimizer, scheduler, model=model, results=results)
                    if best_eval_metric and args.early_stopping_metric_minimize:
                        if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                            best_eval_metric = results[args.early_stopping_metric]
                            if args.save_best_model:
                                _save_model(
                                    seq2seq_model, args.best_model_dir, optimizer, scheduler, model=model, results=results
                                    )
                            early_stopping_counter = 0

        epoch_number += 1
        output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))


        accuracy = predict_val(model, seq2seq_model.device)
        print("Validation Accuracy: ", accuracy)
        print('batch: '+str(args.train_batch_size)+' accumulation_steps: '+str(args.gradient_accumulation_steps)+\
            ' lr: '+str(args.learning_rate)+' epochs: '+str(args.num_train_epochs)+' epoch: '+str(epoch_number))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            print('---test dataset----')
            predicted_vs_actual_y_list, test_acc = predict_test(model, seq2seq_model.device)
            print("Test Accuracy: ", test_acc)

            with open('./ABSA_best_accuracy.txt', 'a') as f0:
                f0.writelines('batch: '+str(args.train_batch_size)+' accumulation_steps: '+str(args.gradient_accumulation_steps)+\
                                  ' lr: '+str(args.learning_rate)+' epochs: '+str(args.num_train_epochs)+' epoch: '+str(epoch_number)+' val_accuracy: '+str(best_accuracy)+\
                                  ' test_accuracy: '+str(test_acc)+'\n')
                
            f10 = open('./best_test_data_predicted_vs_actual.txt', 'w')
            for t in predicted_vs_actual_y_list:
                line = '\t'.join(str(x) for x in t)
                f10.write(line + '\n')
            f10.close()
 


        if args.save_model_every_epoch or args.evaluate_during_training:
            os.makedirs(output_dir_current, exist_ok=True)

        if args.save_model_every_epoch:
            _save_model(seq2seq_model, output_dir_current, optimizer, scheduler, model=model)

        if args.evaluate_during_training:
            results = eval_model(
                    seq2seq_model,
                    eval_data,
                    verbose=verbose and args.evaluate_during_training_verbose,
                    silent=args.evaluate_during_training_silent
                )

            if args.save_eval_checkpoints:
                _save_model(seq2seq_model, output_dir_current, optimizer, scheduler, results=results)

            if not best_eval_metric:
                best_eval_metric = results[args.early_stopping_metric]
                if args.save_best_model:
                    _save_model(seq2seq_model, args.best_model_dir, optimizer, scheduler, model=model, results=results)
                        
            if best_eval_metric and args.early_stopping_metric_minimize:
                if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                    best_eval_metric = results[args.early_stopping_metric]
                    if args.save_best_model:
                        _save_model(seq2seq_model, args.best_model_dir, optimizer, scheduler, model=model, results=results)
                    early_stopping_counter = 0

    print("Best Validation Accuracy: ", best_accuracy)
    return global_step, tr_loss / global_step, best_accuracy



In [20]:
# Train_model Function
def train_model(
    seq2seq_model, train_data, best_accuracy, output_dir=None, show_running_loss=True, args=None, eval_data=None, verbose=True
):
    """
    Trains the model using 'train_data'

    Args:
        train_data: Pandas DataFrame containing the 2 columns - `input_text`, `target_text`.
                    - `input_text`: The input text sequence.
                    - `target_text`: The target text sequence
        output_dir: The directory where model files will be saved. If not given, self.args.output_dir will be used.
        show_running_loss (optional): Set to False to prevent running loss from being printed to console. Defaults to True.
        args (optional): Optional changes to the args dict of the model. Any changes made will persist for the model.
        eval_data (optional): A DataFrame against which evaluation will be performed when evaluate_during_training is enabled. Is required if evaluate_during_training is enabled.
    Returns:
        None
    """  

    if args:
        seq2seq_model.args.update_from_dict(args)

    if seq2seq_model.args.evaluate_during_training and eval_data is None:
        raise ValueError(
            "evaluate_during_training is enabled but eval_data is not specified."
            " Pass eval_data to model.train_model() if using evaluate_during_training."
        )

    if not output_dir:
        output_dir = seq2seq_model.args.output_dir

    if os.path.exists(output_dir) and os.listdir(output_dir) and not seq2seq_model.args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty."
            " Set args.overwrite_output_dir = True to overcome.".format(output_dir)
        )

    _move_model_to_device(seq2seq_model)

    train_dataset = load_and_cache_examples(seq2seq_model, train_data, verbose=verbose)

    os.makedirs(output_dir, exist_ok=True)

    global_step, tr_loss, best_accuracy = train(
        seq2seq_model,
        train_dataset,
        output_dir,
        best_accuracy,
        show_running_loss=show_running_loss,
        eval_data=eval_data,
        verbose=verbose
    )

    _save_model(seq2seq_model, seq2seq_model.args.output_dir, model=seq2seq_model.model)

    if verbose:
        logger.info(" Training of {} model complete. Saved to {}.".format(seq2seq_model.args.model_name, output_dir))

    return best_accuracy



## Training the model

In [21]:
steps = [1]
learing_rates = [4e-5]

best_accuracy = 0
for lr in learing_rates:
    for step in steps:
        model_args = {
            "reprocess_input_data": True,
            "overwrite_output_dir": True,
            "max_seq_length": 128,
            "train_batch_size": 16,
            "num_train_epochs": 5,
            "save_eval_checkpoints": False,
            "save_model_every_epoch": False,
            "evaluate_during_training": True,
            "evaluate_generated_text": True,
            "evaluate_during_training_verbose": True,
            "use_multiprocessing": False,
            "max_length": 128,
            "manual_seed": 2022,
            "gradient_accumulation_steps": step,
            "learning_rate":  lr,
            "save_steps": 99999999999999,
            "early_stopping_patience": 5,
        }

        # Initialize model
        seq2seq_model = Seq2SeqModel(
            encoder_decoder_type="bart",
            encoder_decoder_name="facebook/bart-base",
            use_cuda=True,
            args=model_args,
        )

        # Train the model
        best_accuracy = train_model(seq2seq_model,train_data=train_df, eval_data=valid_df, best_accuracy=best_accuracy)

 

Downloading config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

  0%|          | 0/2997 [00:00<?, ?it/s]

Training started




Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/188 [00:00<?, ?it/s]



Validation Accuracy:  0.8453333333333334
batch: 16 accumulation_steps: 1 lr: 4e-05 epochs: 5 epoch: 1
---test dataset----
Test Accuracy:  0.8353448275862069


  0%|          | 0/750 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Running Epoch 1 of 5:   0%|          | 0/188 [00:00<?, ?it/s]

Validation Accuracy:  0.8386666666666667
batch: 16 accumulation_steps: 1 lr: 4e-05 epochs: 5 epoch: 2


  0%|          | 0/750 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/188 [00:00<?, ?it/s]

Validation Accuracy:  0.868
batch: 16 accumulation_steps: 1 lr: 4e-05 epochs: 5 epoch: 3
---test dataset----
Test Accuracy:  0.8439655172413794


  0%|          | 0/750 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/188 [00:00<?, ?it/s]

Validation Accuracy:  0.8613333333333333
batch: 16 accumulation_steps: 1 lr: 4e-05 epochs: 5 epoch: 4


  0%|          | 0/750 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/188 [00:00<?, ?it/s]

Validation Accuracy:  0.8746666666666667
batch: 16 accumulation_steps: 1 lr: 4e-05 epochs: 5 epoch: 5
---test dataset----
Test Accuracy:  0.8448275862068966


  0%|          | 0/750 [00:00<?, ?it/s]

Best Validation Accuracy:  0.8746666666666667
