In [1]:
!pip install transformers[torch]
!pip install sentencepiece
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [2]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

# datasets
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric
from datasets import load_dataset

# transformers
from transformers import Trainer
from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import IntervalStrategy

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from sklearn.metrics import accuracy_score, f1_score

import evaluate

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
SEED = 111

# Set the random seed for Python to SEED
random.seed(SEED)

# Set the random seed for numpy to SEED
np.random.seed(SEED)

# Set the random seed for torch to SEED
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [4]:
train_path = "/kaggle/input/amazon/train.csv"
test_path = "/kaggle/input/amazon/test.csv"
valid_path = "/kaggle/input/amazon/valid.csv"

In [5]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
valid_df = pd.read_csv(valid_path)

In [6]:
train_df = train_df[train_df['stars'].isin([1, 2, 4, 5])]
train_df = train_df[train_df['language'].isin(['en', 'de', 'fr', 'es'])]
# train_df = train_df[train_df['language'].isin(['en'])]

test_df = test_df[test_df['stars'].isin([1, 2, 4, 5])]
test_df = test_df[test_df['language'].isin(['en', 'de', 'fr', 'es'])]
# test_df = test_df[test_df['language'].isin(['en'])]

valid_df = valid_df[valid_df['stars'].isin([1, 2, 4, 5])]
test_df = test_df[test_df['language'].isin(['en', 'de', 'fr', 'es'])]
# valid_df = valid_df[valid_df['language'].isin(['en'])]


columns_to_drop = ['review_id', 'product_id', 'reviewer_id', 'product_category']
train_df.drop(columns=columns_to_drop, inplace=True)
test_df.drop(columns=columns_to_drop, inplace=True)
valid_df.drop(columns=columns_to_drop, inplace=True)

In [7]:
num_rows_to_delete = 30000

In [8]:
# for lang in ['en']:
for lang in ['en', 'de', 'es', 'fr']:
    for star in [1, 2, 4, 5]:
        print(f'Language: {lang}, Stars: {star}')
        print(f'Number of rows before: {len(train_df[(train_df["language"] == lang) & (train_df["stars"] == star)])}')
        random_indices = np.random.choice(train_df[(train_df['language'] == lang) & (train_df['stars'] == star)].index, num_rows_to_delete, replace=False)
        train_df.drop(index=random_indices, inplace=True)
        print(f'Number of rows after: {len(train_df[(train_df["language"] == lang) & (train_df["stars"] == star)])}')

Language: en, Stars: 1
Number of rows before: 40000
Number of rows after: 10000
Language: en, Stars: 2
Number of rows before: 40000
Number of rows after: 10000
Language: en, Stars: 4
Number of rows before: 40000
Number of rows after: 10000
Language: en, Stars: 5
Number of rows before: 40000
Number of rows after: 10000
Language: de, Stars: 1
Number of rows before: 40000
Number of rows after: 10000
Language: de, Stars: 2
Number of rows before: 40000
Number of rows after: 10000
Language: de, Stars: 4
Number of rows before: 40000
Number of rows after: 10000
Language: de, Stars: 5
Number of rows before: 40000
Number of rows after: 10000
Language: es, Stars: 1
Number of rows before: 40000
Number of rows after: 10000
Language: es, Stars: 2
Number of rows before: 40000
Number of rows after: 10000
Language: es, Stars: 4
Number of rows before: 40000
Number of rows after: 10000
Language: es, Stars: 5
Number of rows before: 40000
Number of rows after: 10000
Language: fr, Stars: 1
Number of rows be

In [9]:
def replace_mapping(df, label):
    for i, r in df.iterrows():
        if r[label] >= 4:
            df.loc[i, label] = 1
        else:
            df.loc[i, label] = 0
    
    return df

In [10]:
train_df = replace_mapping(train_df, 'stars')
test_df = replace_mapping(test_df, 'stars')
valid_df = replace_mapping(valid_df, 'stars')

In [11]:
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)

In [12]:
train_df.shape, test_df.shape, valid_df.shape

((160000, 4), (16000, 4), (24000, 4))

In [13]:
print(train_df["stars"].unique())
print(test_df["stars"].unique())
print(valid_df["stars"].unique())

[0 1]
[0 1]
[0 1]


In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [15]:
def prepare_truncation(data_df, tokenizer, m, n):
    # Create a tqdm progress bar for the loop
    for i, r in tqdm(data_df.iterrows(), total=len(data_df), desc="Processing reviews"):
        tokenized_row = tokenizer.tokenize(r['review_body'])
        if len(tokenized_row) > m+n:
            data_df.loc[i, 'review_body'] = tokenizer.convert_tokens_to_string(tokenized_row[:m] + tokenized_row[-n:])

    return data_df

In [16]:
model_name = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name, num_labels=2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [17]:
N = 128
M = 382
# N = 64
# M = 64

In [18]:
train_df = prepare_truncation(train_df, tokenizer, M, N)
test_df = prepare_truncation(test_df, tokenizer, M, N)
valid_df = prepare_truncation(valid_df, tokenizer, M, N)

Processing reviews:   2%|▏         | 2768/160000 [00:00<00:52, 2969.39it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
Processing reviews: 100%|██████████| 160000/160000 [00:47<00:00, 3335.67it/s]
Processing reviews: 100%|██████████| 16000/16000 [00:04<00:00, 3378.40it/s]
Processing reviews: 100%|██████████| 24000/24000 [00:07<00:00, 3217.30it/s]


In [19]:
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)
valid_ds = Dataset.from_pandas(valid_df)


en_only_dataset = DatasetDict()

en_only_dataset['train'] = train_ds
en_only_dataset['validation'] = valid_ds
en_only_dataset['test'] = test_ds

In [21]:
train_df.shape, test_df.shape, valid_df.shape

((160000, 4), (16000, 4), (24000, 4))

In [20]:
# Preprocess function with labels
def preprocess_function(examples):
    inputs = tokenizer(examples["review_body"], truncation=True)
    inputs["labels"] = examples["stars"]
    return inputs

# Tokenize the datasets
tokenized_train = en_only_dataset["train"].map(preprocess_function, batched=True)
tokenized_validation = en_only_dataset["validation"].map(preprocess_function, batched=True)
tokenized_test = en_only_dataset["test"].map(preprocess_function, batched=True)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Load metrics outside the compute_metrics function
load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")

# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "f1": f1}

  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [22]:
training_args = TrainingArguments(
    output_dir="./results",
    warmup_steps=10000,
    optim="adamw_torch",
    num_train_epochs=2,
    weight_decay=1e-4,
    save_strategy=IntervalStrategy.STEPS,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    logging_strategy=IntervalStrategy.STEPS,
    push_to_hub=False,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    save_total_limit=5,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
500,1.0248,0.599325,0.751875,0.751823
1000,0.5025,0.584421,0.7575,0.748143
1500,0.4474,0.441986,0.820875,0.820485
2000,0.3624,0.515748,0.821167,0.820718


In [None]:
trainer.evaluate()

In [None]:
results = trainer.evaluate(eval_dataset=tokenized_test)

In [None]:
file_name = 'uncased-92acc-2classes-128tokens'
file_name = f"{model_name}_2classes_{N+M}tokens_{results['eval_f1']:.2f}f1"
trainer.save_model(file_name)