In [1]:
!pip install transformers[torch]
!pip install sentencepiece
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [2]:
import random
from tabulate import tabulate
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report

# datasets
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric
from datasets import load_dataset

# transformers
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import IntervalStrategy

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from sklearn.metrics import accuracy_score, f1_score

import evaluate

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
import os

# Set WANDB_DISABLED for this process. Presumably read by the Hugging Face Trainer's
# Weights & Biases integration -- confirm against the installed transformers version.
os.environ.update({"WANDB_DISABLED": "true"})

In [4]:
from datasets.utils.logging import disable_progress_bar
# NOTE: from here on `logging` is `transformers.logging`, not the standard-library module.
from transformers import logging


# Turn off the `datasets` progress bars and set the `transformers` log verbosity to
# error level.
disable_progress_bar()
logging.set_verbosity_error()

In [5]:
# Folder that contains the input CSV files (the path looks like a Kaggle dataset mount).
directory = "/kaggle/input/amazon-with-translated"
# Alternative location, apparently for a Google Drive / Colab run:
# directory = "/content/drive/MyDrive/MSc/NLP/nlp-project"

In [6]:
# Locations of the three split files and of the translated test set, all under `directory`.
train_path, test_path, valid_path = (
    f"{directory}/{split_name}.csv" for split_name in ("train", "test", "valid")
)
translated_test_path = (
    f"{directory}/amazon_translated_body_and_title_with_originals_all_stars.csv"
)

In [7]:
# Read the train / test / validation splits and the translated test set into DataFrames.
train_df, test_df, valid_df, translated_test_df = map(
    pd.read_csv,
    (train_path, test_path, valid_path, translated_test_path),
)

In [8]:
# Move the untranslated text to "*_original" column names.
# The guard keeps this cell idempotent: after the translated columns have been renamed to
# "review_body"/"review_title" (next cell), running this rename a second time would relabel
# the translations as originals and leave duplicate "*_original" columns.
if "review_body_original" not in translated_test_df.columns:
    translated_test_df.rename(
        columns={"review_body": "review_body_original", "review_title": "review_title_original"},
        inplace=True,
    )

In [9]:
# Expose the translated text under the standard "review_body"/"review_title" column names,
# which is what the rest of the notebook reads.
translated_test_df.rename(columns={"translated_body": "review_body", "translated_title": "review_title"}, inplace=True)

In [10]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [11]:
SEED = 111

# Seed every random number generator the notebook relies on with the same value:
# Python's `random`, NumPy's global generator, PyTorch on CPU and PyTorch on all CUDA devices.
for _seed_everything in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_everything(SEED)
del _seed_everything

In [12]:
def verbose_print(msg, verbose=False):
    """Print ``msg`` when ``verbose`` is truthy; otherwise do nothing."""
    if not verbose:
        return
    print(msg)


def drop_data(df, stars, languages, columns_to_drop):
    """Filter a review frame by rating and language and remove unwanted columns.

    Args:
        df: DataFrame with at least a "stars" and a "language" column.
        stars: Collection of star ratings to keep.
        languages: Collection of language codes to keep.
        columns_to_drop: Column names to remove; may be empty.

    Returns:
        A new DataFrame containing only the matching rows, without the dropped
        columns, and with a fresh 0..n-1 index. ``df`` itself is never modified.
    """
    keep = df["stars"].isin(stars) & df["language"].isin(languages)
    filtered = df.loc[keep]
    if len(columns_to_drop) > 0:
        # Non-inplace drop: mutating a filtered slice in place triggers pandas'
        # SettingWithCopyWarning and depends on copy semantics.
        filtered = filtered.drop(columns=columns_to_drop)

    return filtered.reset_index(drop=True)


def prepare_data(
    train, test, valid, columns_to_drop, stars=[1, 2, 3, 4], languages=["en"]
):
    """Run ``drop_data`` with identical settings on the train, test and validation frames.

    Returns:
        The filtered ``(train, test, valid)`` frames, in that order.
    """
    filtered_train, filtered_test, filtered_valid = (
        drop_data(split, stars, languages, columns_to_drop)
        for split in (train, test, valid)
    )
    return filtered_train, filtered_test, filtered_valid


def reduce_dataset(df, stars, languages, num_of_rows_to_drop, verbose=False):
    """Randomly remove a fixed number of rows from every (language, stars) group.

    Rows are sampled without replacement through NumPy's global random generator,
    so results are reproducible after ``np.random.seed``.

    Args:
        df: DataFrame with "language" and "stars" columns.
        stars: Star ratings whose groups are reduced.
        languages: Language codes whose groups are reduced.
        num_of_rows_to_drop: Rows removed from each (language, stars) group.
        verbose: When truthy, report the group size before and after dropping.

    Returns:
        A new DataFrame without the sampled rows. The original index labels are
        kept (no reset) and ``df`` is left untouched.

    Raises:
        ValueError: Raised by ``np.random.choice`` when a group holds fewer than
            ``num_of_rows_to_drop`` rows.
    """
    reduced = df.copy()
    for lang in languages:
        for star in stars:
            # Build the group mask once instead of once per log line and per sample.
            in_group = (reduced["language"] == lang) & (reduced["stars"] == star)
            group_index = reduced.index[in_group.to_numpy()]
            if verbose:
                verbose_print(f"Language: {lang}, Stars: {star}", verbose)
                verbose_print(f"Number of rows before: {len(group_index)}", verbose)

            random_indices = np.random.choice(
                group_index,
                num_of_rows_to_drop,
                replace=False,
            )
            reduced = reduced.drop(index=random_indices)

            if verbose:
                remaining = (
                    (reduced["language"] == lang) & (reduced["stars"] == star)
                ).sum()
                verbose_print(f"Number of rows after: {remaining}", verbose)

    return reduced


def replace_mapping(df, label, mapping):
    """Recode the values of column ``label`` according to ``mapping``.

    All replacements are applied in a single step, so a value produced by one
    entry is never re-mapped by another (``{1: 2, 2: 3}`` turns 1 into 2, not 3).
    Values that are not keys of ``mapping`` are left unchanged.

    Args:
        df: DataFrame that holds the column; it is updated in place.
        label: Name of the column to recode.
        mapping: Dict of ``old_value -> new_value``.

    Returns:
        The same ``df`` object, for call chaining.
    """
    if mapping:
        # Assign the result back instead of `df[label].replace(..., inplace=True)`:
        # that chained form acts on a temporary Series and does not reach `df`
        # when pandas copy-on-write is active.
        df[label] = df[label].replace(mapping)
    return df


def prepare_truncation(data_df, tokenizer, m, n):
    """Shorten long reviews to their first ``m`` and last ``n`` tokens.

    Each "review_body" is tokenized with ``tokenizer.tokenize``. When it has more
    than ``m + n`` tokens it is replaced by the string rebuilt (via
    ``tokenizer.convert_tokens_to_string``) from the first ``m`` tokens followed
    by the last ``n`` tokens. Shorter reviews are left as they are.

    Args:
        data_df: DataFrame with a "review_body" column; updated in place.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_string``.
        m: Number of leading tokens to keep (0 keeps none).
        n: Number of trailing tokens to keep (0 keeps none).

    Returns:
        The same ``data_df`` object.
    """
    # Snapshot the texts first so the frame is not iterated while being written to.
    bodies = data_df["review_body"].tolist()
    rows = zip(data_df.index, bodies)
    for i, body in tqdm(rows, total=len(data_df), desc="Processing reviews"):
        tokens = tokenizer.tokenize(body)
        if len(tokens) > m + n:
            # Slice the tail by absolute position: `tokens[-n:]` returns the WHOLE
            # list when n == 0 (because -0 == 0), which would append the complete
            # review instead of an empty tail.
            kept = tokens[:m] + tokens[len(tokens) - n:]
            data_df.loc[i, "review_body"] = tokenizer.convert_tokens_to_string(kept)

    return data_df


def print_using_tabulate(data):
    """Print the per-class rows of a classification-report dict as a "psql" table.

    Entries whose value is not a dict, as well as the "macro avg" and
    "weighted avg" entries, are left out. Floats are rendered with four decimals.
    """
    skipped_keys = ("macro avg", "weighted avg")
    table_data = [
        [
            key,
            values["precision"],
            values["recall"],
            values["f1-score"],
            values["support"],
        ]
        for key, values in data.items()
        if key not in skipped_keys and isinstance(values, dict)
    ]

    headers = ["Class", "Precision", "Recall", "F1-Score", "Support"]
    print(tabulate(table_data, headers=headers, tablefmt="psql", floatfmt=".4f"))


def eval_model(trainer, test_set, target_names, label):
    """Predict on ``test_set`` and print a per-class classification report.

    The predicted class is the arg-max over axis 1 of ``trainer.predict(...).predictions``;
    the reference labels are read from ``test_set[label]``. The report is printed
    through ``print_using_tabulate`` and followed by blank lines. Nothing is returned.
    """
    prediction_output = trainer.predict(test_set)
    y_pred = prediction_output.predictions.argmax(axis=1)
    y_true = test_set[label]

    report = classification_report(
        y_true, y_pred, target_names=target_names, output_dict=True
    )
    print_using_tabulate(report)
    print("\n\n\n")


# Metric objects from the `evaluate` library, loaded once here and reused by
# `compute_metrics` on every call.
load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class is the arg-max of ``logits`` over the last axis.

    Returns:
        ``{"accuracy": <accuracy>, "f1": <weighted F1>}``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    accuracy_result = load_accuracy.compute(
        predictions=predicted_classes, references=labels
    )
    f1_result = load_f1.compute(
        predictions=predicted_classes, references=labels, average="weighted"
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


# Batch preprocessing: tokenize the review texts and attach the labels
def preprocess_function(examples):
    """Tokenize ``examples["review_body"]`` with truncation and add a "labels" entry.

    Uses the module-level ``tokenizer``, which must be set before this is called.
    The labels are copied from ``examples["stars"]``.
    """
    encoded = tokenizer(examples["review_body"], truncation=True)
    encoded["labels"] = examples["stars"]
    return encoded


def train_model(
    model_name,
    num_of_labels,
    dataset,
    tokenizer,
    path_to_save,
    epochs=1,
    disable_tqdm=False,
):
    """Fine-tune a sequence-classification checkpoint and return the fitted Trainer.

    The "train" and "validation" splits of ``dataset`` are tokenized with
    ``preprocess_function``; evaluation and checkpointing both happen once per epoch.

    Args:
        model_name: Checkpoint name passed to ``AutoModelForSequenceClassification``.
        num_of_labels: Number of output classes.
        dataset: Mapping with "train" and "validation" splits.
        tokenizer: Tokenizer handed to the data collator and the Trainer.
        path_to_save: Output directory for checkpoints.
        epochs: Number of training epochs.
        disable_tqdm: Forwarded to ``TrainingArguments``.

    Returns:
        The ``Trainer`` after ``train()`` has finished.

    NOTE(review): ``preprocess_function`` reads the module-level ``tokenizer``, not
    the ``tokenizer`` argument -- the two must be the same object.
    NOTE(review): ``warmup_steps=10000`` may exceed the total number of optimizer
    steps for short runs -- confirm this is intended.
    """
    trainer_config = dict(
        output_dir=path_to_save,
        warmup_steps=10000,
        optim="adamw_torch",
        num_train_epochs=epochs,
        weight_decay=1e-4,
        evaluation_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        metric_for_best_model="f1",
        save_total_limit=1,
        disable_tqdm=disable_tqdm,
    )
    training_args = TrainingArguments(**trainer_config)

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_of_labels
    )
    train_split, validation_split = (
        dataset[split_name].map(preprocess_function, batched=True)
        for split_name in ("train", "validation")
    )

    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=validation_split,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer


def prepare_dataset(train_df, test_df, valid_df, translated_df):
    """Wrap the four DataFrames into one ``DatasetDict``.

    Returns:
        A ``DatasetDict`` with the keys "train", "validation", "test" and
        "translated" (inserted in that order), each built with ``Dataset.from_pandas``.
    """
    frames_by_split = {
        "train": train_df,
        "validation": valid_df,
        "test": test_df,
        "translated": translated_df,
    }

    dataset = DatasetDict()
    for split_name, frame in frames_by_split.items():
        dataset[split_name] = Dataset.from_pandas(frame)
    return dataset


def run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    verbose=False,
    epochs=1,
    disable_tqdm=True,
    translated_df=None,
    translated_languages=("de", "es", "fr"),
):
    """Prepare the data, fine-tune ``model_name`` and print test-set reports.

    Steps: filter the splits by ``stars``/``languages``, randomly drop
    ``rows_to_drop`` rows per (language, stars) group of the training split, recode
    ``label`` with ``mapping``, shorten every review to its first ``M`` and last
    ``N`` tokens, train, then print a classification report for the English test
    split and for the translated test set.

    Args:
        train_df, test_df, valid_df: Raw train / test / validation frames.
        stars: Star ratings to keep.
        languages: Languages kept in the train / test / validation splits.
        columns_to_drop: Columns removed from the train / test / validation splits.
        rows_to_drop: Rows removed from each (language, stars) training group.
        label: Name of the label column.
        mapping: Dict used to recode the label values.
        model_name: Checkpoint name for both tokenizer and model.
        num_of_labels: Number of output classes.
        M: Number of leading tokens kept per review.
        N: Number of trailing tokens kept per review.
        label_names: Class names shown in the reports.
        verbose: Forwarded to ``reduce_dataset``.
        epochs: Number of training epochs.
        disable_tqdm: Forwarded to ``train_model``.
        translated_df: Translated test frame; defaults to the notebook-level
            ``translated_test_df`` when omitted.
        translated_languages: Languages kept from the translated frame.

    Returns:
        None. Results are printed.

    Side effects:
        Rebinds the module-level ``tokenizer``, which ``preprocess_function`` reads.
    """
    global tokenizer

    if translated_df is None:
        # Backward-compatible fallback to the frame loaded at notebook level.
        translated_df = translated_test_df

    train, test, val = prepare_data(
        train_df, test_df, valid_df, columns_to_drop, stars, languages
    )
    train = reduce_dataset(train, stars, languages, rows_to_drop, verbose)
    train = replace_mapping(train, label, mapping)
    test = replace_mapping(test, label, mapping)
    val = replace_mapping(val, label, mapping)

    translated = drop_data(translated_df, stars, list(translated_languages), [])
    translated = replace_mapping(translated, label, mapping)

    # `num_labels` is a model option, not a tokenizer option, so it is not passed here.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train = prepare_truncation(train, tokenizer, M, N)
    test = prepare_truncation(test, tokenizer, M, N)
    val = prepare_truncation(val, tokenizer, M, N)
    translated = prepare_truncation(translated, tokenizer, M, N)

    dataset = prepare_dataset(train, test, val, translated)
    trainer = train_model(
        model_name,
        num_of_labels,
        dataset,
        tokenizer,
        f"./{model_name}_labels_{num_of_labels}_M_{M}_N_{N}",
        epochs=epochs,
        disable_tqdm=disable_tqdm,
    )

    print(f"Results for model: {model_name}, M = {M}, N = {N}")
    print("English test set results")
    tokenized_test = dataset["test"].map(preprocess_function, batched=True)
    eval_model(trainer, tokenized_test, label_names, label)
    print()
    print("Translated to English set results")
    tokenized_translated = dataset["translated"].map(preprocess_function, batched=True)
    eval_model(trainer, tokenized_translated, label_names, label)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [13]:
# Columns passed to `run_and_eval`, which removes them from the train / test /
# validation splits.
columns_to_drop = ['review_id', 'product_id', 'reviewer_id', 'product_category']
# Placeholder for the module-level tokenizer: `run_and_eval` rebinds it through
# `global tokenizer`, and `preprocess_function` reads it.
tokenizer = None
# Number of rows randomly removed from each (language, stars) group of the training split.
rows_to_drop = 30000

**BERT CASED**

**Two labels with Neutral (3-star reviews mapped to Negative); N=128 (tail tokens), M=382 (head tokens); overall 510 tokens**

In [14]:
# Shared experiment settings: binary sentiment on the English reviews.
model_name = "bert-base-cased"
stars = [1, 2, 3, 4, 5]
languages = ["en"]
# 1-3 stars -> class 0 ("Negative"), 4-5 stars -> class 1 ("Positive")
mapping = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1}
# Truncation keeps the first M and the last N tokens of every review.
N, M = 128, 382
num_of_labels = 2
label = "stars"
label_names = ["Negative", "Positive"]
epochs = 1

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Processing reviews: 100%|██████████| 50000/50000 [00:13<00:00, 3661.52it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3767.48it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3671.87it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:03<00:00, 3911.97it/s]


Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3451,0.403006,0.8504,0.851868


Results for model: bert-base-cased, M = 382, N = 128
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9329 |   0.8060 |     0.8648 |      3000 |
| Positive |      0.7583 |   0.9130 |     0.8285 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9399 |   0.7697 |     0.8463 |      9000 |
| Positive |      0.7283 |   0.9262 |     0.8154 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=64, M=64; overall 128 tokens**

In [15]:
# Keep the first 64 and the last 64 tokens of every review.
N, M = 64, 64

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:16<00:00, 3078.86it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3193.60it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3355.64it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3411.12it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3596,0.391421,0.8692,0.868661


Results for model: bert-base-cased, M = 64, N = 64
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8780 |   0.9137 |     0.8955 |      3000 |
| Positive |      0.8621 |   0.8095 |     0.8350 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8776 |   0.8921 |     0.8848 |      9000 |
| Positive |      0.8340 |   0.8133 |     0.8236 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=510, M=0; overall 510 tokens**

In [16]:
# Tail-only truncation: no leading tokens, the last 510 tokens of every review.
N, M = 510, 0

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:15<00:00, 3322.12it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3487.97it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3436.39it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3682.84it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3528,0.356827,0.8732,0.873334


Results for model: bert-base-cased, M = 0, N = 510
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8884 |   0.8813 |     0.8849 |      3000 |
| Positive |      0.8241 |   0.8340 |     0.8290 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8940 |   0.8481 |     0.8705 |      9000 |
| Positive |      0.7885 |   0.8492 |     0.8177 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=0, M=510; overall 510 tokens**

In [17]:
# Head-only setting: M = 510 leading tokens, N = 0 trailing tokens.
N, M = 0, 510

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:14<00:00, 3344.02it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3497.60it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3290.11it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3701.66it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3475,0.312681,0.8798,0.879164


Results for model: bert-base-cased, M = 510, N = 0
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8825 |   0.9160 |     0.8989 |      3000 |
| Positive |      0.8664 |   0.8170 |     0.8410 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8884 |   0.8821 |     0.8853 |      9000 |
| Positive |      0.8250 |   0.8338 |     0.8294 |      6000 |
+----------+-------------+----------+------------+-----------+






**XLNET 2 labels with Neutrals**

In [18]:
# Switch the checkpoint for the following runs; every other setting
# (stars, languages, mapping, labels, epochs) is reused from the BERT section.
model_name = "xlnet-base-cased"

**Two labels with Neutral; N=128, M=382; overall 510 tokens**

In [19]:
# Keep the first 382 and the last 128 tokens of every review.
N, M = 128, 382

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Processing reviews: 100%|██████████| 50000/50000 [00:17<00:00, 2832.46it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2958.42it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2911.43it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3027.13it/s]


Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3641,0.293744,0.8884,0.8884


Results for model: xlnet-base-cased, M = 382, N = 128
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8926 |   0.9083 |     0.9004 |      3000 |
| Positive |      0.8588 |   0.8360 |     0.8472 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9082 |   0.8628 |     0.8849 |      9000 |
| Positive |      0.8085 |   0.8692 |     0.8378 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=64, M=64; overall 128 tokens**

In [20]:
# Keep the first 64 and the last 64 tokens of every review.
N, M = 64, 64

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:18<00:00, 2633.62it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2847.53it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2793.50it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3041.32it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3546,0.433952,0.8744,0.875136


Results for model: xlnet-base-cased, M = 64, N = 64
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9178 |   0.8670 |     0.8917 |      3000 |
| Positive |      0.8158 |   0.8835 |     0.8483 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9251 |   0.8344 |     0.8774 |      9000 |
| Positive |      0.7835 |   0.8987 |     0.8371 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=510, M=0; overall 510 tokens**

In [21]:
# Tail-only truncation: no leading tokens, the last 510 tokens of every review.
N, M = 510, 0

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:17<00:00, 2822.27it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2878.19it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2883.52it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:05<00:00, 2992.22it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3513,0.367319,0.8852,0.885082


Results for model: xlnet-base-cased, M = 0, N = 510
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8910 |   0.9043 |     0.8976 |      3000 |
| Positive |      0.8532 |   0.8340 |     0.8435 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8975 |   0.8720 |     0.8846 |      9000 |
| Positive |      0.8159 |   0.8507 |     0.8329 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=0, M=510; overall 510 tokens**

In [22]:
# Head-only setting: M = 510 leading tokens, N = 0 trailing tokens.
N, M = 0, 510

run_and_eval(
    train_df=train_df,
    test_df=test_df,
    valid_df=valid_df,
    stars=stars,
    languages=languages,
    columns_to_drop=columns_to_drop,
    rows_to_drop=rows_to_drop,
    label=label,
    mapping=mapping,
    model_name=model_name,
    num_of_labels=num_of_labels,
    M=M,
    N=N,
    label_names=label_names,
    verbose=False,
    epochs=epochs,
    disable_tqdm=False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:17<00:00, 2887.96it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2895.24it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2925.08it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3025.20it/s]


Epoch,Training Loss,Validation Loss
