In [1]:
!pip install transformers[torch]
!pip install sentencepiece
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [2]:
import random
from tabulate import tabulate
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report

# datasets
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric
from datasets import load_dataset

# transformers
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import IntervalStrategy

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from sklearn.metrics import accuracy_score, f1_score

import evaluate

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
import os
# Set before any Trainer is built; meant to switch off the Weights & Biases reporting integration.
# NOTE(review): confirm the installed transformers version still honours WANDB_DISABLED.
os.environ["WANDB_DISABLED"] = "true"

In [4]:
from datasets.utils.logging import disable_progress_bar
from transformers import logging


# Turn off the progress bars of the datasets library so repeated runs do not flood the output.
disable_progress_bar()
# Restrict the transformers logger to error-level messages.
logging.set_verbosity_error()

In [5]:
# Root folder that holds the CSV files read below; the commented line is an alternative location.
directory = "/kaggle/input/amazon-with-translated"
# directory = "/content/drive/MyDrive/MSc/NLP/nlp-project"

In [6]:
# Paths of the three dataset splits plus the file that carries translated reviews next to the originals.
train_path = f"{directory}/train.csv"
test_path = f"{directory}/test.csv"
valid_path = f"{directory}/valid.csv"
translated_test_path = f"{directory}/amazon_translated_body_and_title_with_originals_all_stars.csv"

In [7]:
# Load every CSV once; run_and_eval filters copies of these frames for each experiment.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
valid_df = pd.read_csv(valid_path)
translated_test_df = pd.read_csv(translated_test_path)

In [None]:
# Move the original-language text out of the way so the "review_body"/"review_title" names become free.
# NOTE(review): in-place and order-dependent — run exactly once, before the translated columns are renamed.
translated_test_df.rename(columns={"review_body": "review_body_original", "review_title": "review_title_original"}, inplace=True)

In [None]:
# Expose the translated text under the column names the rest of the pipeline reads ("review_body", "review_title").
# NOTE(review): in-place; must run after the original columns were renamed, otherwise the names would collide.
translated_test_df.rename(columns={"translated_body": "review_body", "translated_title": "review_title"}, inplace=True)

In [None]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU, and show the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


In [None]:
# Single seed shared by every random number generator used in the notebook.
SEED = 111

# Python's own generator.
random.seed(SEED)

# NumPy's global generator. reduce_dataset draws from it on every call, so the sampled
# training subset of a run depends on how many runs were executed since this cell.
np.random.seed(SEED)

# torch on the CPU and on every CUDA device.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [None]:
def verbose_print(msg, verbose=False):
    """Print ``msg`` only when ``verbose`` is truthy; otherwise do nothing."""
    if not verbose:
        return
    print(msg)


def drop_data(df, stars, languages, columns_to_drop):
    """Return the rows of ``df`` with the requested ratings and languages.

    The input frame is never modified. The previous implementation called
    ``drop``/``reset_index`` with ``inplace=True`` on a boolean-filtered slice,
    which triggers pandas' SettingWithCopyWarning and leaves the result flagged
    as a copy for every later write; building a new frame avoids both.

    Args:
        df: Frame with at least a ``"stars"`` and a ``"language"`` column.
        stars: Ratings to keep (matched against ``df["stars"]``).
        languages: Language codes to keep (matched against ``df["language"]``).
        columns_to_drop: Column names to remove; an empty sequence removes nothing.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    keep = df["stars"].isin(stars) & df["language"].isin(languages)
    filtered = df.loc[keep]
    if len(columns_to_drop) > 0:
        filtered = filtered.drop(columns=columns_to_drop)

    return filtered.reset_index(drop=True)


def prepare_data(
    train, test, valid, columns_to_drop, stars=[1, 2, 3, 4], languages=["en"]
):
    """Apply ``drop_data`` with the same filters to the three splits.

    Returns:
        A ``(train, test, valid)`` tuple of the filtered frames.
    """
    filtered_splits = tuple(
        drop_data(split, stars, languages, columns_to_drop)
        for split in (train, test, valid)
    )
    return filtered_splits


def reduce_dataset(df, stars, languages, num_of_rows_to_drop, verbose=False):
    """Randomly remove a fixed number of rows from every (language, star) group.

    ``df`` is modified in place and also returned. Rows are sampled without
    replacement through NumPy's global generator, so the outcome depends on the
    current state of ``np.random``.

    Args:
        df: Frame with ``"language"`` and ``"stars"`` columns.
        stars: Ratings whose groups are reduced.
        languages: Language codes whose groups are reduced.
        num_of_rows_to_drop: Rows removed from each group.
        verbose: When true, group sizes before and after are printed.

    Returns:
        The same ``df`` object after the rows were dropped.
    """

    def _rows_of(language, rating):
        # Current members of one group; re-evaluated after each drop.
        in_group = (df["language"] == language) & (df["stars"] == rating)
        return df[in_group]

    for lang, star in ((l, s) for l in languages for s in stars):
        verbose_print(f"Language: {lang}, Stars: {star}", verbose)
        verbose_print(f"Number of rows before: {len(_rows_of(lang, star))}", verbose)

        doomed = np.random.choice(
            _rows_of(lang, star).index, num_of_rows_to_drop, replace=False
        )
        df.drop(index=doomed, inplace=True)

        verbose_print(f"Number of rows after: {len(_rows_of(lang, star))}", verbose)

    return df


def replace_mapping(df, label, mapping):
    """Recode the values of column ``label`` according to ``mapping``.

    All pairs are applied in a single pass. Replacing one key after another (as
    the previous loop did) lets an earlier result be rewritten by a later pair:
    ``{4: 1, 5: 1, 1: 0, 2: 0}`` would first turn 4/5 into 1 and then turn those
    into 0. The result is also assigned back to the column explicitly instead of
    relying on an in-place ``replace`` on ``df[label]``, which does not update
    ``df`` when pandas hands out a copy of the column.

    Args:
        df: Frame that owns the column; it is updated and returned.
        label: Name of the column to recode.
        mapping: ``{old_value: new_value}``; values without an entry are kept.

    Returns:
        The same ``df`` object with the recoded column.
    """
    if mapping:
        df[label] = df[label].replace(mapping)
    return df


def prepare_truncation(data_df, tokenizer, m, n):
    """Shorten long reviews to their first ``m`` and last ``n`` tokens.

    Every ``"review_body"`` is tokenized; when it has more than ``m + n`` tokens
    it is replaced by the text rebuilt from the leading ``m`` and trailing ``n``
    tokens. Shorter reviews are left untouched. ``data_df`` is modified in place
    and also returned.

    Args:
        data_df: Frame with a ``"review_body"`` text column.
        tokenizer: Object offering ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``.
        m: Number of tokens kept from the start.
        n: Number of tokens kept from the end (may be 0).

    Returns:
        The same ``data_df`` object.
    """
    for i, r in tqdm(data_df.iterrows(), total=len(data_df), desc="Processing reviews"):
        tokens = tokenizer.tokenize(r["review_body"])
        if len(tokens) > m + n:
            # Slice the tail from an explicit start index: ``tokens[-n:]`` with
            # n == 0 is ``tokens[0:]`` (the whole review), which used to append
            # the full text after the head instead of dropping the tail.
            tail_start = len(tokens) - n
            kept = tokens[:m] + tokens[tail_start:]
            data_df.loc[i, "review_body"] = tokenizer.convert_tokens_to_string(kept)

    return data_df


def print_using_tabulate(data):
    """Print the per-class rows of a classification-report dict as a table.

    Entries keyed ``"macro avg"`` or ``"weighted avg"`` and entries whose value
    is not a dict are left out. Each remaining entry contributes its
    precision, recall, f1-score and support.
    """
    skipped_keys = ("macro avg", "weighted avg")
    metric_fields = ("precision", "recall", "f1-score", "support")

    rows = [
        [class_name, *(stats[field] for field in metric_fields)]
        for class_name, stats in data.items()
        if class_name not in skipped_keys and isinstance(stats, dict)
    ]

    # Render the collected rows with four-decimal floats.
    print(
        tabulate(
            rows,
            headers=["Class", "Precision", "Recall", "F1-Score", "Support"],
            tablefmt="psql",
            floatfmt=".4f",
        )
    )


def eval_model(trainer, test_set, target_names, label):
    """Predict on ``test_set`` and print a per-class classification report.

    Args:
        trainer: Object whose ``predict(test_set)`` result exposes ``predictions``.
        test_set: Evaluation data; ``test_set[label]`` supplies the true classes.
        target_names: Display names passed to ``classification_report``.
        label: Name of the column holding the true classes.
    """
    output = trainer.predict(test_set)
    report = classification_report(
        test_set[label],
        # The predicted class is the position of the largest score along axis 1.
        output.predictions.argmax(axis=1),
        target_names=target_names,
        output_dict=True,
    )
    print_using_tabulate(report)
    print("\n\n\n")


# Metric objects are loaded once at module level and reused by compute_metrics on every evaluation.
load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and weighted F1.

    Returns:
        ``{"accuracy": ..., "f1": ...}`` computed with the module-level
        ``load_accuracy`` and ``load_f1`` metric objects.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    shared_inputs = {"predictions": predicted, "references": labels}

    accuracy_result = load_accuracy.compute(**shared_inputs)
    f1_result = load_f1.compute(average="weighted", **shared_inputs)

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


# Tokenize a batch of reviews and attach the labels
def preprocess_function(examples):
    """Tokenize ``examples["review_body"]`` and copy ``examples["stars"]`` to ``"labels"``.

    Uses the module-level ``tokenizer`` (with truncation enabled), which must be
    set before this function is called.
    """
    encoded = tokenizer(examples["review_body"], truncation=True)
    encoded["labels"] = examples["stars"]
    return encoded


def train_model(
    model_name,
    num_of_labels,
    dataset,
    tokenizer,
    path_to_save,
    epochs=1,
    disable_tqdm=False,
    batch_size=8,
):
    """Fine-tune a sequence-classification checkpoint and return the ``Trainer``.

    Args:
        model_name: Checkpoint id handed to ``from_pretrained``.
        num_of_labels: Number of output classes of the classification head.
        dataset: Mapping with ``"train"`` and ``"validation"`` splits.
        tokenizer: Tokenizer given to the ``Trainer`` and to the padding collator.
        path_to_save: Output directory of the run.
        epochs: Number of training epochs.
        disable_tqdm: Forwarded to ``TrainingArguments``.
        batch_size: Per-device batch size for both training and evaluation.

    Returns:
        The ``Trainer`` after ``train()`` has finished.
    """
    # NOTE(review): warmup is a fixed 10000 steps; if a run has fewer optimizer steps
    # than that, the learning rate never leaves warmup — confirm against the data size.
    # NOTE(review): metric_for_best_model is set without load_best_model_at_end;
    # verify it has the intended effect.
    run_settings = dict(
        output_dir=path_to_save,
        warmup_steps=10000,
        optim="adamw_torch",
        num_train_epochs=epochs,
        weight_decay=1e-4,
        evaluation_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        metric_for_best_model="f1",
        save_total_limit=1,
        disable_tqdm=disable_tqdm,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
    )
    training_args = TrainingArguments(**run_settings)

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_of_labels
    )

    # preprocess_function tokenizes with the module-level `tokenizer`,
    # not with the `tokenizer` argument of this function.
    tokenized = {
        split: dataset[split].map(preprocess_function, batched=True)
        for split in ("train", "validation")
    }

    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer


def prepare_dataset(train_df, test_df, valid_df, translated_df):
    """Wrap the four frames into one ``DatasetDict``.

    Returns:
        A ``DatasetDict`` whose keys, in order, are ``"train"``, ``"validation"``,
        ``"test"`` and ``"translated"``.
    """
    named_frames = (
        ("train", train_df),
        ("validation", valid_df),
        ("test", test_df),
        ("translated", translated_df),
    )

    splits = DatasetDict()
    for split_name, frame in named_frames:
        splits[split_name] = Dataset.from_pandas(frame)

    return splits


def run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    verbose=False,
    epochs=1,
    disable_tqdm=True,
    batch_size=8,
    translated_df=None,
    translated_languages=("de", "es", "fr"),
):
    """Run one experiment: prepare the data, fine-tune, and report test results.

    Steps: filter the three splits, reduce the training split, recode the label
    column, truncate every review to its first ``M`` and last ``N`` tokens,
    train with ``train_model``, then print classification reports for the
    test split and for the translated reviews.

    Args:
        train_df, test_df, valid_df: Source frames; they are filtered into copies.
        stars: Ratings kept in every split.
        languages: Language codes kept in the train/test/validation splits.
        columns_to_drop: Columns removed from the train/test/validation splits.
        rows_to_drop: Rows removed per (language, star) group of the training split.
        label: Name of the label column.
        mapping: ``{old_label: new_label}`` recoding applied to ``label``.
        model_name: Checkpoint id for both tokenizer and model.
        num_of_labels: Number of classes of the classification head.
        M: Tokens kept from the start of each review.
        N: Tokens kept from the end of each review.
        label_names: Display names of the classes in the printed reports.
        verbose: Forwarded to ``reduce_dataset``.
        epochs, disable_tqdm, batch_size: Forwarded to ``train_model``.
        translated_df: Frame with the translated reviews. ``None`` (default) falls
            back to the notebook-level ``translated_test_df``, which is what this
            function always used before the parameter existed.
        translated_languages: Language codes kept from ``translated_df``.

    Returns:
        None. Results are printed.
    """
    # preprocess_function reads the module-level `tokenizer`, so it is rebound here.
    global tokenizer

    if translated_df is None:
        translated_df = translated_test_df

    train, test, val = prepare_data(
        train_df, test_df, valid_df, columns_to_drop, stars, languages
    )
    train = reduce_dataset(train, stars, languages, rows_to_drop, verbose)
    train = replace_mapping(train, label, mapping)
    test = replace_mapping(test, label, mapping)
    val = replace_mapping(val, label, mapping)

    translated = drop_data(translated_df, stars, list(translated_languages), [])
    translated = replace_mapping(translated, label, mapping)

    # The label count configures the model head, not the tokenizer, so it is no
    # longer passed here.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train = prepare_truncation(train, tokenizer, M, N)
    test = prepare_truncation(test, tokenizer, M, N)
    val = prepare_truncation(val, tokenizer, M, N)
    translated = prepare_truncation(translated, tokenizer, M, N)

    dataset = prepare_dataset(train, test, val, translated)
    trainer = train_model(
        model_name,
        num_of_labels,
        dataset,
        tokenizer,
        f"./{model_name}_labels_{num_of_labels}_M_{M}_N_{N}",
        epochs=epochs,
        disable_tqdm=disable_tqdm,
        batch_size=batch_size,
    )

    print(f"Results for model: {model_name}, M = {M}, N = {N}")
    print("English test set results")
    tokenized_test = dataset["test"].map(preprocess_function, batched=True)
    eval_model(trainer, tokenized_test, label_names, label)
    print()
    print("Translated to English set results")
    tokenized_translated = dataset["translated"].map(preprocess_function, batched=True)
    eval_model(trainer, tokenized_translated, label_names, label)

    # Release the trainer and cached GPU memory before the next experiment starts.
    del trainer
    torch.cuda.empty_cache()

In [None]:
# Metadata columns removed from the train/test/validation frames by drop_data.
columns_to_drop = ['review_id', 'product_id', 'reviewer_id', 'product_category']
# Placeholder for the module-level tokenizer: run_and_eval rebinds it and preprocess_function reads it.
tokenizer = None
# Number of training rows reduce_dataset removes from every (language, star) group.
rows_to_drop = 30000

In [None]:
# Binary sentiment setup: 3-star reviews are left out, 1-2 stars become class 0 and 4-5 stars class 1.
stars = [1, 2, 4, 5]
languages = ["en"]
mapping = {1: 0, 2: 0, 4: 1, 5: 1}
num_of_labels = 2
# Column that holds the rating and, after the mapping, the class id.
label = "stars"
# Display names for class 0 and class 1, in that order.
label_names = ["Negative", "Positive"]
epochs = 1

**BERT UNCASED**

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [None]:
# Checkpoint used by the runs of this section (tokenizer and model are both loaded from it).
model_name = "bert-base-uncased"

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 128
M = 382

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 64
M = 64

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [None]:
# Intended as head-only truncation: M leading tokens, no trailing tokens.
N = 0
M = 510

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [None]:
# Tail-only truncation: no leading tokens, N trailing tokens per review.
N = 510
M = 0

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**BERT CASED**

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [None]:
# Switch to the cased checkpoint; it stays selected for the following runs of this section.
model_name = "bert-base-cased"
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 128
M = 382

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 64
M = 64

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [None]:
# Tail-only truncation: no leading tokens, N trailing tokens per review.
N = 510
M = 0

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [None]:
# Intended as head-only truncation: M leading tokens, no trailing tokens.
N = 0
M = 510

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**BERT UNCASED**

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [None]:
# NOTE(review): this section selects the same checkpoint and repeats the same four (M, N)
# runs as the first BERT-uncased section of the notebook — confirm the repetition is intended.
model_name = "bert-base-uncased"

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 128
M = 382

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 64
M = 64

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [None]:
# Tail-only truncation: no leading tokens, N trailing tokens per review.
N = 510
M = 0

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [None]:
# Intended as head-only truncation: M leading tokens, no trailing tokens.
N = 0
M = 510

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**XLNET 2 labels without Neutrals**

In [None]:
# Checkpoint used by the runs of this section (tokenizer and model are both loaded from it).
model_name = "xlnet-base-cased"

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 128
M = 382

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [None]:
# Truncation setup for this run: M leading and N trailing tokens per review.
N = 64
M = 64

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [None]:
# Tail-only truncation: no leading tokens, N trailing tokens per review.
N = 510
M = 0

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [None]:
# Intended as head-only truncation: M leading tokens, no trailing tokens.
N = 0
M = 510

# This run overrides the default batch size of 8 with 2.
# NOTE(review): presumably to fit GPU memory — confirm and document the reason.
run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
    batch_size=2,
)