## SetFit ABSA Training


In [None]:
# load packages
import pandas as pd
import ast
import warnings
warnings.filterwarnings("ignore")

In [None]:
pip install spacy

In [None]:
# download the spaCy pipeline that is passed as spacy_model="en_core_web_sm" when the ABSA model is created
!spacy download en_core_web_sm

CUDA is required to run the SetFit ABSA model. Run the code block below to check whether CUDA is available.

In [None]:
# check if CUDA is available to PyTorch (the cell displays True or False)
import torch
torch.cuda.is_available()

The training dataset we prepared for training our own SetFit ABSA model is available on Hugging Face:
https://huggingface.co/datasets/ginkgogo/ca_restaurants_random_sample We should be able to load the dataset directly from Hugging Face after installing the required setfit[absa] packages.

In [None]:
from datasets import load_dataset

# Row boundaries of the two slices taken from the Hub dataset.
TRAIN_END = 50
EVAL_END = 102

dataset = load_dataset("ginkgogo/ca_restaurants_random_sample", split="train")
# rows [0, 50) are used for training, rows [50, 102) are held out for evaluation
train_dataset = dataset.select(range(TRAIN_END))
eval_dataset = dataset.select(range(TRAIN_END, EVAL_END))

In [None]:
# quick look at the training split selected above (the first 50 rows)
train_dataset

In [None]:
# also take a look at the evaluation split selected above (rows 50 to 101)
eval_dataset

Prepare a new instance of the ABSA model with the selected sentence-transformers models and the small English spaCy model (`en_core_web_sm`)

In [None]:
from setfit import AbsaModel

# Base checkpoints, listed in the same (aspect, polarity) order as the
# fine-tuned models that are loaded from the Hub later in this notebook.
base_model_ids = (
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
)

model = AbsaModel.from_pretrained(*base_model_ids, spacy_model="en_core_web_sm")

### Training the SetFitABSA model
Prepare the training arguments for the ABSA model and pass the training and evaluation datasets to the trainer. We completed the training on Google Colab; it took about 1 hour in an A100 GPU runtime environment. We therefore saved the model to Hugging Face so that we can use it whenever we want without rerunning the training. See "Using SetFitABSA model" below for details.

In [None]:
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset, AbsaTrainer
from transformers import EarlyStoppingCallback

# Training configuration: evaluation and checkpointing both happen every 50
# steps, and the best checkpoint is reloaded once training ends.
training_options = dict(
    output_dir="models",
    num_epochs=5,
    use_amp=True,
    batch_size=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
)
args = TrainingArguments(**training_options)

# Early-stopping callback configured with a patience of 5.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = AbsaTrainer(
    model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping],
)
trainer.train()

In order to inspect the model, we use the built-in method provided by the setfit[absa] package to check the accuracy of the model

In [None]:
# evaluate the trained model on the held-out evaluation split and show the returned metrics
metrics = trainer.evaluate(eval_dataset)
print(metrics)

In [None]:
# uncomment below to install the Hugging Face CLI, which the `huggingface-cli login` cell further down relies on
# pip install -U "huggingface_hub[cli]"

### Saving the SetFitABSA model to huggingface

In [None]:
# uncomment below to login to huggingface
# !huggingface-cli login

In [None]:
# uncomment below to save the model to huggingface
# model.push_to_hub("ginkgogo/setfit-absa-bge-small-en-v1.5-restaurants")

### Using SetFitABSA model

In [None]:
from setfit import AbsaModel

# Download from the 🤗 Hub
aspect_repo = "ginkgogo/setfit-absa-bge-small-en-v1.5-restaurants-aspect"
polarity_repo = "ginkgogo/setfit-absa-bge-small-en-v1.5-restaurants-polarity"
model = AbsaModel.from_pretrained(aspect_repo, polarity_repo, spacy_model="en_core_web_sm")

# Run inference on a single example review
sample_review = "The food was great, but the venue is just way too busy."
preds = model(sample_review)
print(preds)

In [None]:
df = pd.read_csv(
  '/content/drive/MyDrive/699/ca_restaurants.csv'
)
# Rows that were used to train the SetFit ABSA model; they are left out of the
# random sample to avoid bias.
# NOTE(review): the previous loop matched these values against the DataFrame
# *index* (`value in Series` tests index labels, and `drop(label)` removes rows
# by index label), not against the values of the `business_id` column. That
# behaviour is kept here -- confirm that these numbers are row labels rather
# than `business_id` values.
bus_used_in_train = [234152, 88955, 174286, 228338, 203671, 151156, 88166, 64932, 142804, 210180, 35159, 90839, 137484, 85880, 128479, 92603, 20842, 200330, 175440, 8844, 61777, 3815, 123379, 125840, 180129, 206443, 219869, 101729, 107887, 188230, 244420, 49208, 139902, 242337, 35581, 228649, 44946, 32763, 69556, 152494, 5069963, 3915492, 4486491]

# Exclude the training rows *before* sampling so the sample always holds
# exactly 2000 rows, and fix the seed so the same sample is drawn on every run.
# errors="ignore" skips labels that are not present in the index.
random_df_2000 = (
    df.drop(index=bus_used_in_train, errors="ignore")
      .sample(n=2000, random_state=42)
)

In [None]:
# run inference on the random sample of 2k rows from the California resturant dataset
# lower-case the review text of the sampled rows and run the model on all of them
lowercased_reviews = random_df_2000['text'].str.lower()
sentences = lowercased_reviews.tolist()
preds = model(sentences)

In [None]:
# show the predictions followed by how many there are (one line each)
print(preds, len(preds), sep="\n")

The model prediction (`preds`) is a list of lists of aspect and sentiment pairs, like this: [[{'span': 'food', 'polarity': 'positive'}],
[{'span': 'food', 'polarity': 'positive'}, {'span': 'prices', 'polarity': 'positive'}],
[{'span': 'waiting time', 'polarity': 'positive'}]]. To view the aspects and sentiments more easily, we convert each inner list into a dictionary of aspect: sentiment, like this: {'food': 'positive', 'service': 'positive'}

In [None]:
# if there's no sentiment extracted, use empty {} as the column value
# keep each non-empty prediction list as is; a review with no predictions
# gets the string placeholder '{}' instead
aspects_sentiment = [
    prediction if len(prediction) > 0 else '{}'
    for prediction in preds
]

random_df_2000['aspects_sentiment'] = aspects_sentiment

In [None]:
# method to create aspect: sentiment dict
def extract(aspect_list):
    """Convert one review's ABSA predictions into an ``{aspect: sentiment}`` dict.

    Parameters
    ----------
    aspect_list : list of dict, dict, or any other value
        For a list, every element is read through its ``'span'`` and
        ``'polarity'`` keys, e.g.
        ``[{'span': 'food', 'polarity': 'positive'}]``.

    Returns
    -------
    dict or None
        ``{span: polarity}`` for a list input (a later entry with the same
        span overwrites an earlier one, an empty list gives ``{}``); the input
        itself when it is already a dict; ``None`` for anything else, such as
        the ``'{}'`` string placeholder.
    """
    if isinstance(aspect_list, dict):
        # Already converted: pass it through so applying this twice is harmless.
        return aspect_list
    if not isinstance(aspect_list, list):
        # Return None instead of raising so pandas treats the row as missing
        # (dropna / fillna can then handle it).
        return None
    return {aspect['span']: aspect['polarity'] for aspect in aspect_list}


In [None]:
# apply this method to the random samples
# convert every row's prediction list with extract() and store the result back in the same column
aspect_dicts = random_df_2000['aspects_sentiment'].apply(extract)
random_df_2000['aspects_sentiment'] = aspect_dicts

In [None]:
# preview the first rows of the sample, including the aspects_sentiment column
random_df_2000.head()

In [None]:
# create a dataframe where one or more aspects are extracted from the review text:
# rows whose aspects_sentiment value is missing are dropped, then the resulting shape is printed
with_aspect_df = random_df_2000.dropna(subset=['aspects_sentiment'])
print(with_aspect_df.shape)

In [None]:
# prepare this random sample for a file

# Replace missing values with an empty-dict literal so every row can be
# expanded into columns.
random_df_2000['aspects_sentiment'] = random_df_2000['aspects_sentiment'].fillna('{}')
# Only strings need parsing; values that are already dicts are passed through,
# because ast.literal_eval raises ValueError when it is handed a dict.
random_df_2000['aspects_sentiment'] = random_df_2000['aspects_sentiment'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# One column per aspect; passing a plain list gives the result a 0..n-1 index.
flatten_asepct = pd.json_normalize(random_df_2000['aspects_sentiment'].tolist())

# Move both indexes into a column so the two frames share a 0..n-1 index and
# concat pairs them row by row.
random_df_2000.reset_index(inplace=True)
flatten_asepct.reset_index(inplace=True)
final_df = pd.concat([random_df_2000, flatten_asepct], axis=1)

In [None]:
# (rows, columns) of the combined frame
final_df.shape

In [None]:
# prepare this random sample with aspects extracted for a file
with_aspect_df.reset_index(inplace=True)
# Only strings need parsing; values that are already dicts are passed through,
# because ast.literal_eval raises ValueError when it is handed a dict.
with_aspect_df['aspects_sentiment'] = with_aspect_df['aspects_sentiment'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# One column per aspect; passing a plain list gives the result a 0..n-1 index,
# matching the index with_aspect_df received from reset_index above.
with_aspect_flatten = pd.json_normalize(with_aspect_df['aspects_sentiment'].tolist())
with_aspect_flatten.reset_index(inplace=True)
with_aspect_final_df = pd.concat([with_aspect_df, with_aspect_flatten], axis=1)

### Saving processed data to drive
We saved the processed data to CSV so that we could perform the manual evaluation more easily. Uncomment the code sections below to write the files to Google Drive.

In [None]:
# uncomment below lines to mount to your Google drive
# from google.colab import drive
# drive.mount('/content/drive')

In [21]:
# uncomment below lines to write to drive
# with_aspect_final_df.to_csv('with_aspect_from_random_2k.csv')
# !cp with_aspect_from_random_2k.csv '/content/drive/MyDrive/699/'

In [None]:
# final_df.to_csv('random_2k.csv')
# !cp random_2k.csv '/content/drive/MyDrive/699/'

In [None]:
# read the with-aspects CSV back from Google Drive and preview it
# (the path is under /content/drive, so Drive must be mounted and the file must already exist there)
import pandas as pd
out = pd.read_csv('/content/drive/MyDrive/699/with_aspect_from_random_2k.csv')
out.head()

In [None]:
# read the full random-sample CSV back from Google Drive and show its (rows, columns)
out_random = pd.read_csv('/content/drive/MyDrive/699/random_2k.csv')
out_random.shape

### Reading Manual evaluation results

In [None]:
# load manual evaluation results from the repository's data/results folder and preview the first rows
import pandas as pd
setfit_absa_eval_df = pd.read_csv('../data/results/SetFit_ABSA_manual_eval.csv')
setfit_absa_eval_df.head()

In [None]:
# (rows, columns) of the manual evaluation results
setfit_absa_eval_df.shape

In [None]:
# calculate, for rows where an aspect was extracted, the accuracy of predicting the correct sentiment
# keep only rows flagged 'Y' in both the model column and the manual column
model_flag = setfit_absa_eval_df['has_aspects_model_label'].eq('Y')
manual_flag = setfit_absa_eval_df['aspects_extracted_manual_label'].eq('Y')
cal_df = setfit_absa_eval_df.loc[model_flag & manual_flag]

# share of the kept rows where the model label equals the manual label
labels_agree = cal_df['Model Label'] == cal_df['Manual Label']
sentiment_correctness = int(labels_agree.sum()) / len(cal_df)
print('Accuracy of predicting sentiment is :', f"{sentiment_correctness:.1%}")