In [2]:
import os
import random
import tabulate
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# datasets
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric
from datasets import load_dataset

# transformers
from transformers import Trainer
from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import IntervalStrategy

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from sklearn.metrics import accuracy_score, f1_score

import evaluate

In [None]:
def print_using_tabulate(data):
    """Print the per-class rows of a classification report as a text table.

    Parameters
    ----------
    data : dict
        Mapping of row name to either a metrics dict (with the keys
        'precision', 'recall', 'f1-score' and 'support') or a scalar.
        Presumably the output of
        ``classification_report(..., output_dict=True)`` -- confirm at the
        call site.

    Notes
    -----
    The 'macro avg' and 'weighted avg' rows are skipped, as is any entry
    whose value is not a dict (e.g. a scalar 'accuracy' entry).
    """
    headers = ['Class', 'Precision', 'Recall', 'F1-Score', 'Support']
    skipped_rows = ('macro avg', 'weighted avg')

    table_data = [
        [key, values['precision'], values['recall'], values['f1-score'], values['support']]
        for key, values in data.items()
        if key not in skipped_rows and isinstance(values, dict)
    ]

    # The notebook does `import tabulate`, which binds the *module*; the
    # rendering function lives at `tabulate.tabulate`. Calling the module
    # itself raises "TypeError: 'module' object is not callable".
    m_table = tabulate.tabulate(table_data, headers=headers, tablefmt='psql', floatfmt=".4f")
    print(m_table)


def plotConfusionMatrix(title, confusionMat, targetNames, vmin=0, vmax=1000):
    """Draw a confusion matrix as a blue heat map and show it.

    Parameters
    ----------
    title : str
        Figure title.
    confusionMat : 2-D array-like
        Matrix of counts; rows are drawn along the y axis, columns along x.
    targetNames : sequence of str
        Tick labels, used for both axes in order.
    vmin, vmax : number, optional
        Colour-scale limits passed to ``matshow``. The defaults (0 and 1000)
        reproduce the previously hard-coded scale; pass e.g.
        ``vmax=confusionMat.max()`` when the counts are on a different scale,
        otherwise cells saturate or look uniformly pale.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.matshow(confusionMat, aspect='auto', vmin=vmin, vmax=vmax, cmap=plt.get_cmap('Blues'))

    tick_positions = list(range(len(targetNames)))

    # NOTE(review): rows are labelled "Predicted" and columns "Actual". If
    # confusionMat comes straight from sklearn's confusion_matrix(y_true,
    # y_pred), rows are the *actual* classes and these labels are swapped --
    # confirm against the caller before changing.
    ax.set_ylabel('Predicted Category')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(targetNames)

    ax.set_xlabel('Actual Category')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(targetNames, rotation=90)

    ax.set_title(title)
    plt.show()

In [None]:
# CSV files holding the train / test / validation splits.
train_path, test_path, valid_path = "train.csv", "test.csv", "valid.csv"

In [None]:
# Read every split into its own DataFrame, in train / test / valid order.
train_df, test_df, valid_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, valid_path)
)

In [None]:
# Columns that are not used downstream and are removed from every split.
columns_to_drop = ['review_id', 'product_id', 'reviewer_id', 'product_category']


def _select_reviews(df, stars=(1, 2, 4, 5), languages=('en',)):
    """Return the rows of ``df`` with the given star ratings and languages.

    3-star reviews are excluded by the default ``stars`` and only English
    rows are kept by the default ``languages`` (widen it, e.g. to
    ('en', 'de', 'fr', 'es'), for a multilingual run). The id / category
    columns listed in ``columns_to_drop`` are removed from the result.
    The original index labels are preserved.
    """
    keep = df['stars'].isin(list(stars)) & df['language'].isin(list(languages))
    return df.loc[keep].drop(columns=columns_to_drop)


train_df = _select_reviews(train_df)
test_df = _select_reviews(test_df)
valid_df = _select_reviews(valid_df)

In [None]:
# How many training rows the downsampling step removes from each
# (language, stars) group.
num_rows_to_delete = 35_000

In [None]:
# Downsample the training set: remove `num_rows_to_delete` randomly chosen
# rows from every (language, stars) group.
#
# A seeded generator makes the retained subset identical on every
# Restart & Run All. NOTE: this cell is not idempotent -- each re-run
# removes another `num_rows_to_delete` rows per group.
rng = np.random.default_rng(42)

# Extend to ['en', 'de', 'es', 'fr'] for a multilingual run.
for lang in ['en']:
    for star in [1, 2, 4, 5]:
        group_index = train_df.index[(train_df['language'] == lang) & (train_df['stars'] == star)]
        print(f'Language: {lang}, Stars: {star}')
        print(f'Number of rows before: {len(group_index)}')

        # Fail with an explicit message instead of the opaque error that
        # sampling without replacement raises on an undersized group.
        if len(group_index) < num_rows_to_delete:
            raise ValueError(
                f'Cannot delete {num_rows_to_delete} rows from group '
                f'(language={lang}, stars={star}): only {len(group_index)} rows available'
            )

        random_indices = rng.choice(group_index, size=num_rows_to_delete, replace=False)
        train_df = train_df.drop(index=random_indices)

        remaining = int(((train_df['language'] == lang) & (train_df['stars'] == star)).sum())
        print(f'Number of rows after: {remaining}')

In [None]:
def replace_mapping(df, label):
    """Binarize a rating column in place: values >= 4 become 1, all others 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this same object.
    label : str
        Name of the column to binarize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-site convenience.

    Notes
    -----
    NOT idempotent: applying it to an already binarized column maps every
    value to 0, because both 0 and 1 are below the threshold of 4. Call it
    exactly once per frame.
    """
    # Vectorized comparison instead of a Python-level iterrows() loop with
    # per-cell .loc writes: far faster, and correct even when index labels
    # repeat (a label-based .loc write would touch every row with that label).
    df[label] = (df[label] >= 4).astype(int)
    return df

In [None]:
# Binarize the star ratings of the held-out splits (test first, then validation).
test_df, valid_df = (
    replace_mapping(split_df, 'stars') for split_df in (test_df, valid_df)
)

In [None]:
# Binarize the training labels only.
# test_df and valid_df were already binarized in the preceding cell, and
# replace_mapping is not idempotent: running it again on a 0/1 column maps
# every value to 0 (both 0 and 1 are < 4), which would erase all positive
# labels from the evaluation splits.
train_df = replace_mapping(train_df, 'stars')

In [None]:
# Renumber every split from 0 in place, discarding the old index labels left
# over from filtering.
for split_df in (train_df, test_df, valid_df):
    split_df.reset_index(drop=True, inplace=True)

In [None]:
# Sanity check: show the distinct label values of each split
# (train, test, validation).
for split_df in (train_df, test_df, valid_df):
    print(split_df["stars"].unique())

In [None]:
def prepare_truncation(data_df, tokenizer, m, n):
    """Shorten long reviews in place to their first ``m`` and last ``n`` tokens.

    Each value of ``data_df['review_body']`` is tokenized with
    ``tokenizer.tokenize``. Texts with more than ``m + n`` tokens are replaced
    by ``tokenizer.convert_tokens_to_string(head + tail)``; shorter texts are
    left untouched.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with a 'review_body' column; modified in place.
    tokenizer :
        Object providing ``tokenize(text)`` and
        ``convert_tokens_to_string(tokens)``.
    m : int
        Number of leading tokens to keep.
    n : int
        Number of trailing tokens to keep (0 keeps none).

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object.
    """
    def _head_tail(text):
        tokens = tokenizer.tokenize(text)
        if len(tokens) <= m + n:
            return text
        # `tokens[-0:]` is the whole list, so n == 0 needs an explicit empty tail.
        tail = tokens[-n:] if n > 0 else []
        return tokenizer.convert_tokens_to_string(tokens[:m] + tail)

    # Build the new column positionally rather than writing through
    # label-based .loc: correct for non-unique indices and avoids a
    # per-row DataFrame write.
    truncated = [
        _head_tail(text)
        for text in tqdm(data_df['review_body'], total=len(data_df), desc="Processing reviews")
    ]
    data_df['review_body'] = truncated

    return data_df

In [None]:
# Base checkpoint used for both the tokenizer and the classifier.
model_name = 'bert-base-uncased'

# `num_labels` configures a classification head, not a tokenizer, so it is
# not passed here; the label count belongs on the model's from_pretrained call.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Token budget for head+tail truncation: reviews longer than M + N tokens keep
# their first M and last N tokens (passed as `m` and `n` to prepare_truncation).
# M + N = 510 -- presumably chosen to leave room for two special tokens within
# a 512-token limit; confirm against the model's maximum length.
M, N = 382, 128
# Alternative budget previously tried: M = N = 64

In [None]:
# Display names for the two sentiment classes -- presumably indexed by label
# id (0 = Negative, 1 = Positive); confirm where it is consumed.
target_names = ["Negative", "Positive"]

In [None]:
# Apply head+tail truncation (first M tokens + last N tokens) to every split,
# in train / test / validation order.
train_df, test_df, valid_df = (
    prepare_truncation(split_df, tokenizer, M, N)
    for split_df in (train_df, test_df, valid_df)
)

In [None]:
# Wrap each DataFrame as a Dataset.
train_ds, test_ds, valid_ds = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df, valid_df)
)

# Bundle the three splits under the conventional split names.
en_only_dataset = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds,
})

In [None]:
# Preprocess function with labels
def preprocess_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    ``examples`` is a mapping with 'review_body' (the texts) and 'stars'
    (the labels). The texts go through the module-level ``tokenizer`` with
    truncation enabled, and the 'stars' values are stored on the result
    under the 'labels' key.
    """
    encoded = tokenizer(examples["review_body"], truncation=True)
    encoded["labels"] = examples["stars"]
    return encoded

# Tokenize the datasets
tokenized_train = en_only_dataset["train"].map(preprocess_function, batched=True)
tokenized_validation = en_only_dataset["validation"].map(preprocess_function, batched=True)
tokenized_test = en_only_dataset["test"].map(preprocess_function, batched=True)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model.
# The 'stars' labels are binarized to {0, 1} by replace_mapping, so the
# classification head needs exactly 2 outputs. The previous value of 5 left
# three untrained classes that argmax could still select at prediction time.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load metrics outside the compute_metrics function
load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")

# Compute metrics function
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the argmax over the last axis of ``logits``. Both scores come from the
    module-level ``load_accuracy`` / ``load_f1`` metric objects.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = load_accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = load_f1.compute(predictions=predicted_ids, references=labels, average="weighted")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:
training_args = TrainingArguments(
    # Checkpoint output and retention.
    output_dir="./results",
    save_strategy=IntervalStrategy.STEPS,
    save_total_limit=5,
    push_to_hub=False,
    # Optimisation.
    optim="adamw_torch",
    num_train_epochs=2,
    weight_decay=1e-4,
    # NOTE(review): 10000 warmup steps may exceed the total number of
    # optimisation steps for a small training set -- confirm against the
    # actual dataset size and batch size.
    warmup_steps=10000,
    # Step-based evaluation and logging.
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    logging_strategy=IntervalStrategy.STEPS,
    # Best-checkpoint selection uses the "f1" value returned by compute_metrics.
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

# Trainer for fine-tuning: trains on the tokenized training split and
# evaluates on the tokenized validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

# Rebuild `model` and `trainer` from the checkpoints saved under weights/,
# with the test split as the evaluation dataset.
#
# NOTE(review): every iteration overwrites `model` and `trainer`, so only the
# last checkpoint (in sorted name order) is left bound after the loop, and
# nothing is evaluated inside the loop itself -- confirm this is intended.
weights_dir = "weights"
for f in sorted(os.listdir(weights_dir)):
    # os.listdir returns bare entry names; join with the parent directory so
    # from_pretrained receives a real local path rather than a name that would
    # be resolved against the working directory or the model hub.
    checkpoint_path = os.path.join(weights_dir, f)

    # A saved model directory contains a config.json; skip stray files and
    # grouping sub-folders instead of failing on them.
    if not os.path.isfile(os.path.join(checkpoint_path, "config.json")):
        print(f"Skipping {checkpoint_path}: no config.json found")
        continue

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

In [None]:
# Rebuild `model` and `trainer` from the checkpoints saved under
# weights/binary/, with the test split as the evaluation dataset.
#
# NOTE(review): every iteration overwrites `model` and `trainer`, so only the
# last checkpoint (in sorted name order) is left bound after the loop, and
# nothing is evaluated inside the loop itself -- confirm this is intended.
binary_weights_dir = "weights/binary/"
for f in sorted(os.listdir(binary_weights_dir)):
    # os.listdir returns bare entry names; join with the parent directory so
    # from_pretrained receives a real local path rather than a name that would
    # be resolved against the working directory or the model hub.
    checkpoint_path = os.path.join(binary_weights_dir, f)

    # A saved model directory contains a config.json; skip anything else
    # instead of failing on it.
    if not os.path.isfile(os.path.join(checkpoint_path, "config.json")):
        print(f"Skipping {checkpoint_path}: no config.json found")
        continue

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )