In [1]:
!pip install transformers[torch]
!pip install sentencepiece
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [2]:
import random
from tabulate import tabulate
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report

# datasets
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric
from datasets import load_dataset

# transformers
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import IntervalStrategy

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from sklearn.metrics import accuracy_score, f1_score

import evaluate

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
from datasets.utils.logging import disable_progress_bar
from transformers import logging


disable_progress_bar()
logging.set_verbosity_error()

In [5]:
directory = "/kaggle/input/amazon-with-translated"
# directory = "/content/drive/MyDrive/MSc/NLP/nlp-project"

In [6]:
train_path = f"{directory}/train.csv"
test_path = f"{directory}/test.csv"
valid_path = f"{directory}/valid.csv"
translated_test_path = f"{directory}/amazon_translated_body_and_title_with_originals_all_stars.csv"

In [10]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
valid_df = pd.read_csv(valid_path)
translated_test_df = pd.read_csv(translated_test_path)

In [11]:
translated_test_df.rename(columns={"review_body": "review_body_original", "review_title": "review_title_original"}, inplace=True)

In [12]:
translated_test_df.rename(columns={"translated_body": "review_body", "translated_title": "review_title"}, inplace=True)

In [13]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)


cuda


In [15]:
SEED = 111

# Set the random seed for Python to SEED
random.seed(SEED)

# Set the random seed for numpy to SEED
np.random.seed(SEED)

# Set the random seed for torch to SEED
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [16]:
def verbose_print(msg, verbose=False):
    if verbose:
        print(msg)


def drop_data(df, stars, languages, columns_to_drop):
    df = df[df["stars"].isin(stars)]
    df = df[df["language"].isin(languages)]
    if len(columns_to_drop) > 0:
        df.drop(columns=columns_to_drop, inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df


def prepare_data(
    train, test, valid, columns_to_drop, stars=[1, 2, 3, 4], languages=["en"]
):
    train = drop_data(train, stars, languages, columns_to_drop)
    test = drop_data(test, stars, languages, columns_to_drop)
    valid = drop_data(valid, stars, languages, columns_to_drop)

    return train, test, valid


def reduce_dataset(df, stars, languages, num_of_rows_to_drop, verbose=False):
    for lang in languages:
        for star in stars:
            verbose_print(f"Language: {lang}, Stars: {star}", verbose)
            verbose_print(
                f'Number of rows before: {len(df[(df["language"] == lang) & (df["stars"] == star)])}',
                verbose,
            )
            random_indices = np.random.choice(
                df[(df["language"] == lang) & (df["stars"] == star)].index,
                num_of_rows_to_drop,
                replace=False,
            )
            df.drop(index=random_indices, inplace=True)
            verbose_print(
                f'Number of rows after: {len(df[(df["language"] == lang) & (df["stars"] == star)])}',
                verbose,
            )

    return df


def replace_mapping(df, label, mapping):
    for k, v in mapping.items():
        df[label].replace(k, v, inplace=True)
    return df


def prepare_truncation(data_df, tokenizer, m, n):
    for i, r in tqdm(data_df.iterrows(), total=len(data_df), desc="Processing reviews"):
        tokenized_row = tokenizer.tokenize(r["review_body"])
        if len(tokenized_row) > m + n:
            data_df.loc[i, "review_body"] = tokenizer.convert_tokens_to_string(
                tokenized_row[:m] + tokenized_row[-n:]
            )

    return data_df


def print_using_tabulate(data):
    table_data = []
    for key, values in data.items():
        if key != "macro avg" and key != "weighted avg":
            if isinstance(values, dict):
                row = [
                    key,
                    values["precision"],
                    values["recall"],
                    values["f1-score"],
                    values["support"],
                ]
                table_data.append(row)

    # Print the classification report using tabulate
    headers = ["Class", "Precision", "Recall", "F1-Score", "Support"]
    m_table = tabulate(table_data, headers=headers, tablefmt="psql", floatfmt=".4f")
    print(m_table)


def eval_model(trainer, test_set, target_names, label):
    predictions = trainer.predict(test_set)
    predicted_labels = predictions.predictions.argmax(axis=1)
    ground_truth_labels = test_set[label]
    classification_reports = classification_report(
        ground_truth_labels,
        predicted_labels,
        target_names=target_names,
        output_dict=True,
    )
    print_using_tabulate(classification_reports)
    print("\n\n\n")


load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)[
        "accuracy"
    ]
    f1 = load_f1.compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}


# Preprocess function with labels
def preprocess_function(examples):
    inputs = tokenizer(examples["review_body"], truncation=True)
    inputs["labels"] = examples["stars"]
    return inputs


def train_model(
    model_name,
    num_of_labels,
    dataset,
    tokenizer,
    path_to_save,
    epochs=1,
    disable_tqdm=False,
    batch_size=8,
):
    training_args = TrainingArguments(
        output_dir=path_to_save,
        warmup_steps=10000,
        optim="adamw_torch",
        num_train_epochs=epochs,
        weight_decay=1e-4,
        evaluation_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        metric_for_best_model="f1",
        save_total_limit=1,
        disable_tqdm=disable_tqdm,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_of_labels
    )
    tokenized_train = dataset["train"].map(preprocess_function, batched=True)
    tokenized_validation = dataset["validation"].map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_validation,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


    return trainer


def prepare_dataset(train_df, test_df, valid_df, translated_df):
    train_ds = Dataset.from_pandas(train_df)
    test_ds = Dataset.from_pandas(test_df)
    valid_ds = Dataset.from_pandas(valid_df)
    translated_ds = Dataset.from_pandas(translated_df)

    dataset = DatasetDict()

    dataset["train"] = train_ds
    dataset["validation"] = valid_ds
    dataset["test"] = test_ds
    dataset["translated"] = translated_ds

    return dataset


def run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    verbose=False,
    epochs=1,
    disable_tqdm=True,
    batch_size=8):

    train, test, val = prepare_data(
        train_df, test_df, valid_df, columns_to_drop, stars, languages
    )
    train = reduce_dataset(train, stars, languages, rows_to_drop, verbose)
    train = replace_mapping(train, label, mapping)
    test = replace_mapping(test, label, mapping)
    val = replace_mapping(val, label, mapping)

    translated = drop_data(translated_test_df, stars, ["de", "es", "fr"], [])
    translated = replace_mapping(translated, label, mapping)

    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, num_labels=num_of_labels)
    train = prepare_truncation(train, tokenizer, M, N)
    test = prepare_truncation(test, tokenizer, M, N)
    val = prepare_truncation(val, tokenizer, M, N)
    translated = prepare_truncation(translated, tokenizer, M, N)

    dataset = prepare_dataset(train, test, val, translated)
    trainer = train_model(
        model_name,
        num_of_labels,
        dataset,
        tokenizer,
        f"./{model_name}_labels_{num_of_labels}_M_{M}_N_{N}",
        epochs=epochs,
        disable_tqdm=disable_tqdm,
        batch_size=batch_size,
    )

    trainer.train()

    print(f"Results for model: {model_name}, M = {M}, N = {N}")
    print("English test set results")
    tokenized_train = dataset["test"].map(preprocess_function, batched=True)
    eval_model(trainer, tokenized_train, label_names, label)
    print()
    print("Translated to English set results")
    tokenized_train = dataset["translated"].map(preprocess_function, batched=True)
    eval_model(trainer, tokenized_train, label_names, label)
    del trainer
    torch.cuda.empty_cache()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [17]:
columns_to_drop = ['review_id', 'product_id', 'reviewer_id', 'product_category']
tokenizer = None
rows_to_drop = 30000

In [21]:
stars = [1, 2, 3, 4, 5]
languages = ["en"]
mapping = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1}
num_of_labels = 2
label = "stars"
label_names = ["Negative", "Positive"]
epochs = 1

**BERT UNCASED**

**Two labels with Neutral; N=128, M=382; overall 510 tokens**

In [33]:
model_name = "bert-base-uncased"

In [34]:
N = 128
M = 382


run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:15<00:00, 3284.59it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2945.65it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3395.53it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3547.66it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3393,0.362679,0.8686,0.869626


Results for model: bert-base-uncased, M = 382, N = 128
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9232 |   0.8497 |     0.8849 |      3000 |
| Positive |      0.7986 |   0.8940 |     0.8436 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9253 |   0.8194 |     0.8692 |      9000 |
| Positive |      0.7688 |   0.9008 |     0.8296 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=64, M=64; overall 128 tokens**

In [35]:
N = 64
M = 64

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:16<00:00, 2946.41it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3094.79it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 2968.26it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:05<00:00, 2952.16it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3399,0.363241,0.8712,0.871688


Results for model: bert-base-uncased, M = 64, N = 64
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9035 |   0.8610 |     0.8817 |      3000 |
| Positive |      0.8052 |   0.8620 |     0.8326 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9121 |   0.8322 |     0.8703 |      9000 |
| Positive |      0.7775 |   0.8797 |     0.8255 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=0, M=510; overall 510 tokens**

In [36]:
N = 0
M = 510

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:16<00:00, 3050.54it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3223.67it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3177.10it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3277.94it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3416,0.403775,0.8574,0.858557


Results for model: bert-base-uncased, M = 510, N = 0
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9171 |   0.8333 |     0.8732 |      3000 |
| Positive |      0.7801 |   0.8870 |     0.8301 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9287 |   0.7927 |     0.8553 |      9000 |
| Positive |      0.7450 |   0.9087 |     0.8187 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels with Neutral; N=510, M=0; overall 510 tokens**

In [37]:
N = 510
M = 0

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 50000/50000 [00:16<00:00, 2969.62it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3115.46it/s]
Processing reviews: 100%|██████████| 5000/5000 [00:01<00:00, 3094.02it/s]
Processing reviews: 100%|██████████| 15000/15000 [00:04<00:00, 3269.51it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3414,0.366499,0.8658,0.866665


Results for model: bert-base-uncased, M = 0, N = 510
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9134 |   0.8547 |     0.8831 |      3000 |
| Positive |      0.8012 |   0.8785 |     0.8381 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9236 |   0.8061 |     0.8609 |      9000 |
| Positive |      0.7558 |   0.9000 |     0.8216 |      6000 |
+----------+-------------+----------+------------+-----------+






**BERT CASED**

In [22]:
stars = [1, 2, 4, 5]
languages = ["en"]
mapping = {1: 0, 2: 0, 4: 1, 5: 1}
num_of_labels = 2
label = "stars"
label_names = ["Negative", "Positive"]
epochs = 1

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [39]:
model_name = "bert-base-cased"
N = 128
M = 382


run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Processing reviews: 100%|██████████| 40000/40000 [00:11<00:00, 3427.03it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2684.83it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3121.66it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3574.70it/s]


Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2795,0.297899,0.91375,0.913715


Results for model: bert-base-cased, M = 382, N = 128
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9031 |   0.9365 |     0.9195 |      2000 |
| Positive |      0.9341 |   0.8995 |     0.9165 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9054 |   0.9170 |     0.9112 |      6000 |
| Positive |      0.9159 |   0.9042 |     0.9100 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [40]:
N = 64
M = 64

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:12<00:00, 3181.15it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3382.42it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3260.24it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3528.04it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.289,0.331093,0.91025,0.910248


Results for model: bert-base-cased, M = 64, N = 64
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9006 |   0.9240 |     0.9121 |      2000 |
| Positive |      0.9220 |   0.8980 |     0.9098 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9165 |   0.8928 |     0.9045 |      6000 |
| Positive |      0.8955 |   0.9187 |     0.9070 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [41]:
N = 510
M = 0

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:11<00:00, 3382.29it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2927.47it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3432.47it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3634.54it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2889,0.283645,0.914,0.913958


Results for model: bert-base-cased, M = 0, N = 510
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9012 |   0.9345 |     0.9175 |      2000 |
| Positive |      0.9320 |   0.8975 |     0.9144 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9091 |   0.9105 |     0.9098 |      6000 |
| Positive |      0.9104 |   0.9090 |     0.9097 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [42]:
N = 0
M = 510

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:11<00:00, 3368.02it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3438.50it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3249.93it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3614.45it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2923,0.380517,0.90175,0.901486


Results for model: bert-base-cased, M = 510, N = 0
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8654 |   0.9615 |     0.9109 |      2000 |
| Positive |      0.9567 |   0.8505 |     0.9005 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8657 |   0.9475 |     0.9048 |      6000 |
| Positive |      0.9420 |   0.8530 |     0.8953 |      6000 |
+----------+-------------+----------+------------+-----------+






BERT UNCASED

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [43]:
model_name = "bert-base-uncased"

In [44]:
N = 128
M = 382


run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:13<00:00, 3046.25it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3242.18it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3171.36it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3334.14it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.267,0.314819,0.909,0.908977


Results for model: bert-base-uncased, M = 382, N = 128
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9221 |   0.9115 |     0.9168 |      2000 |
| Positive |      0.9125 |   0.9230 |     0.9177 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9290 |   0.8877 |     0.9079 |      6000 |
| Positive |      0.8925 |   0.9322 |     0.9119 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [45]:
N = 64
M = 64

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:13<00:00, 2941.51it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3077.58it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2903.50it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3250.62it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2649,0.306264,0.918,0.917992


Results for model: bert-base-uncased, M = 64, N = 64
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9244 |   0.9175 |     0.9210 |      2000 |
| Positive |      0.9181 |   0.9250 |     0.9215 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9259 |   0.8973 |     0.9114 |      6000 |
| Positive |      0.9004 |   0.9282 |     0.9141 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [46]:
N = 510
M = 0

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:12<00:00, 3081.13it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3135.16it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2857.34it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:04<00:00, 2849.56it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2622,0.291302,0.9125,0.912495


Results for model: bert-base-uncased, M = 0, N = 510
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9206 |   0.9165 |     0.9186 |      2000 |
| Positive |      0.9169 |   0.9210 |     0.9189 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9269 |   0.8817 |     0.9037 |      6000 |
| Positive |      0.8872 |   0.9305 |     0.9083 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [47]:
N = 0
M = 510

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:13<00:00, 3042.73it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3250.50it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 3154.68it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3270.20it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2634,0.31201,0.911,0.910933


Results for model: bert-base-uncased, M = 510, N = 0
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9376 |   0.8870 |     0.9116 |      2000 |
| Positive |      0.8928 |   0.9410 |     0.9163 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9430 |   0.8550 |     0.8969 |      6000 |
| Positive |      0.8674 |   0.9483 |     0.9061 |      6000 |
+----------+-------------+----------+------------+-----------+






**XLNET 2 labels without Neutrals**

In [18]:
model_name = "xlnet-base-cased"

**Two labels without Neutral; N=128, M=382; overall 510 tokens**

In [49]:
N = 128
M = 382

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Processing reviews: 100%|██████████| 40000/40000 [00:13<00:00, 2903.25it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2795.88it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2831.58it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3066.93it/s]


Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2947,0.338842,0.91925,0.919246


Results for model: xlnet-base-cased, M = 382, N = 128
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9067 |   0.9325 |     0.9194 |      2000 |
| Positive |      0.9305 |   0.9040 |     0.9171 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9228 |   0.9058 |     0.9142 |      6000 |
| Positive |      0.9075 |   0.9242 |     0.9158 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=64, M=64; overall 128 tokens**

In [50]:
N = 64
M = 64

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:14<00:00, 2681.99it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2808.38it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2710.67it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:04<00:00, 2999.69it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3117,0.384186,0.91825,0.918249


Results for model: xlnet-base-cased, M = 64, N = 64
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9158 |   0.9305 |     0.9231 |      2000 |
| Positive |      0.9294 |   0.9145 |     0.9219 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9271 |   0.9152 |     0.9211 |      6000 |
| Positive |      0.9162 |   0.9280 |     0.9221 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=510, M=0; overall 510 tokens**

In [51]:
N = 510
M = 0

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
)

Processing reviews: 100%|██████████| 40000/40000 [00:14<00:00, 2735.34it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2804.00it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2889.25it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:03<00:00, 3044.02it/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2884,0.346373,0.92025,0.920186


Results for model: xlnet-base-cased, M = 0, N = 510
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8854 |   0.9615 |     0.9219 |      2000 |
| Positive |      0.9579 |   0.8755 |     0.9148 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9026 |   0.9328 |     0.9175 |      6000 |
| Positive |      0.9305 |   0.8993 |     0.9147 |      6000 |
+----------+-------------+----------+------------+-----------+






**Two labels without Neutral; N=0, M=510; overall 510 tokens**

In [23]:
N = 0
M = 510

run_and_eval(
    train_df,
    test_df,
    valid_df,
    stars,
    languages,
    columns_to_drop,
    rows_to_drop,
    label,
    mapping,
    model_name,
    num_of_labels,
    M,
    N,
    label_names,
    False,
    epochs,
    False,
    batch_size=2
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Processing reviews: 100%|██████████| 40000/40000 [00:14<00:00, 2833.43it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2971.48it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:01<00:00, 2902.52it/s]
Processing reviews: 100%|██████████| 12000/12000 [00:04<00:00, 2972.90it/s]


Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6927,0.687049,0.52,0.382036


Results for model: xlnet-base-cased, M = 510, N = 0
English test set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.9444 |   0.0595 |     0.1119 |      2000 |
| Positive |      0.5145 |   0.9965 |     0.6786 |      2000 |
+----------+-------------+----------+------------+-----------+





Translated to English set results


+----------+-------------+----------+------------+-----------+
| Class    |   Precision |   Recall |   F1-Score |   Support |
|----------+-------------+----------+------------+-----------|
| Negative |      0.8352 |   0.0498 |     0.0941 |      6000 |
| Positive |      0.5103 |   0.9902 |     0.6735 |      6000 |
+----------+-------------+----------+------------+-----------+




