In [None]:
!pip install transformers[torch]
!pip install sentencepiece
!pip install datasets
!pip install evaluate

In [None]:
%pip install --upgrade jupyter ipywidgets
%jupyter nbextension enable --py widgetsnbextension

In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

# datasets
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric
from datasets import load_dataset

# transformers
from transformers import Trainer
from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import IntervalStrategy

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from sklearn.metrics import accuracy_score, f1_score

import evaluate

In [2]:
# One seed shared by every random number generator this notebook touches.
SEED = 111

# Seed, in order: Python's `random`, NumPy's global RNG, torch's default
# generator, and torch's CUDA generators on all devices.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained("./2_class_97_acc/")

In [3]:
# dataset = load_dataset('amazon_reviews_multi')
# Read the three pre-split CSV files (train, test, validation) from the working directory.
train_df, test_df, valid_df = map(pd.read_csv, ('train.csv', 'test.csv', 'valid.csv'))

In [None]:
# train_df = pd.DataFrame(dataset['train'])
# test_df = pd.DataFrame(dataset['test'])
# valid_df = pd.DataFrame(dataset['validation'])

In [4]:
# Columns that no later cell of this notebook reads.
columns_to_drop = ['review_id', 'product_id', 'reviewer_id', 'product_category']


def select_reviews(df, stars=(1, 2, 4, 5), languages=('en',)):
    """Return a filtered copy of one review split.

    Keeps the rows whose 'stars' value is in `stars` and whose 'language'
    value is in `languages`, then removes `columns_to_drop`.

    A new DataFrame is returned rather than dropping columns in place on a
    boolean-indexed slice, so the input frame is never modified.

    Args:
        df: DataFrame with at least 'stars' and 'language' columns.
        stars: Star ratings to keep (3 is excluded by default).
        languages: Language codes to keep; pass ('en', 'de', 'fr', 'es') for
            the multilingual variant.

    Returns:
        The filtered DataFrame without the columns in `columns_to_drop`.
    """
    keep = df['stars'].isin(stars) & df['language'].isin(languages)
    # errors='ignore' lets this cell be re-run after the columns are already gone.
    return df.loc[keep].drop(columns=columns_to_drop, errors='ignore')


train_df = select_reviews(train_df)
test_df = select_reviews(test_df)
valid_df = select_reviews(valid_df)

In [None]:
# (rows, columns) of the training split after the star/language filter and column drop.
train_df.shape

In [None]:
# Peek at the first rows of the filtered training split.
train_df.head()

In [5]:
# Number of rows removed from each (language, star) group of train_df by the down-sampling loop.
num_rows_to_delete = 30000

In [6]:
# Down-sample every (language, star) group of the training split by removing
# `num_rows_to_delete` randomly chosen rows from it.
# Multilingual variant: iterate over ['en', 'de', 'es', 'fr'] instead.
for lang in ['en']:
    for star in [1, 2, 4, 5]:
        print(f'Language: {lang}, Stars: {star}')

        group = train_df[(train_df["language"] == lang) & (train_df["stars"] == star)]
        print(f'Number of rows before: {len(group)}')

        # Sample index labels without replacement so no label is picked twice.
        random_indices = np.random.choice(group.index, num_rows_to_delete, replace=False)
        train_df.drop(index=random_indices, inplace=True)

        remaining = train_df[(train_df["language"] == lang) & (train_df["stars"] == star)]
        print(f'Number of rows after: {len(remaining)}')

Language: en, Stars: 1
Number of rows before: 40000
Number of rows after: 10000
Language: en, Stars: 2
Number of rows before: 40000
Number of rows after: 10000
Language: en, Stars: 4
Number of rows before: 40000
Number of rows after: 10000
Language: en, Stars: 5
Number of rows before: 40000
Number of rows after: 10000


In [7]:
def replace_mapping(df, label):
    """Binarize a rating column in place.

    Every value in `df[label]` that is >= 4 becomes 1; every other value
    becomes 0.

    Implemented as one vectorized comparison instead of a row-by-row
    `iterrows()` / `.loc` loop, which is needlessly slow on tens of
    thousands of rows.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        label: Name of the column to binarize.

    Returns:
        The same DataFrame object that was passed in.
    """
    df[label] = (df[label] >= 4).astype(int)
    return df

In [8]:
# Binarize the 'stars' column of all three splits with the same rule.
train_df, test_df, valid_df = (
    replace_mapping(split_df, 'stars') for split_df in (train_df, test_df, valid_df)
)

In [9]:
# replacement_mapping = {1: 0, 2: 0, 3:0, 4: 1, 5: 1}
# train_df['stars'] = train_df['stars'].replace(replacement_mapping)
# test_df['stars'] = test_df['stars'].replace(replacement_mapping)
# valid_df['stars'] = test_df['stars'].replace(replacement_mapping)

# Give every split a fresh positional index, discarding the old index labels.
for _split_df in (train_df, test_df, valid_df):
    _split_df.reset_index(drop=True, inplace=True)

In [10]:
# (rows, columns) of the training split after down-sampling and relabeling.
train_df.shape

(40000, 4)

In [11]:
# First rows of the training split; 'stars' now holds the binary label.
train_df.head()

Unnamed: 0,stars,review_body,review_title,language
0,0,the cabinet dot were all detached from backing...,Not use able,en
1,0,I am disappointed in this purchase. I bought o...,Not what I ordered,en
2,0,was not pleased and Bluetooth malfunctioned af...,I would not recommend it,en
3,0,Stems were broken due to poor packing. Shapes ...,stems were broken,en
4,0,The product was faulty and seller offered refu...,Not worth $40,en


In [12]:
# First rows of the test split after relabeling.
test_df.head()

Unnamed: 0,stars,review_body,review_title,language
0,0,"These are AWFUL. They are see through, the fab...",Don’t waste your time!,en
1,0,I bought 4 and NONE of them worked. Yes I used...,One Star,en
2,0,On first use it didn't heat up and now it does...,Totally useless,en
3,0,You want an HONEST answer? I just returned fro...,Gold filled earrings,en
4,0,The glue works fine but the container is impos...,Poor container,en


In [13]:
# (rows, columns) of the train, test and validation splits, in that order.
train_df.shape, test_df.shape, valid_df.shape

((40000, 4), (4000, 4), (4000, 4))

In [14]:
# First rows of the validation split after relabeling.
valid_df.head()

Unnamed: 0,stars,review_body,review_title,language
0,0,Pathetic design of the caps. Very impractical ...,Not worth the price and very bad cap design,en
1,0,"Shoes were purchased on March 6, 2019. My wife...",Garbage!,en
2,0,It's taken me 1 whole year to set this thing u...,I do not recommend this printer,en
3,0,Each cartridge printed once. Both dried up in ...,Don't purchase these refurbished cartridges!,en
4,0,No light hard to see,Not worth,en


In [15]:
# Show the distinct label values left in each split (train, test, validation).
for _split_df in (train_df, test_df, valid_df):
    print(_split_df["stars"].unique())

[0 1]
[0 1]
[0 1]


In [16]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
# model_name = 'bert-base-multilingual-cased'

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Load your DataFrame with "review_body" column as text and "stars" column as labels
# # Replace the following line with your DataFrame loading code
# # df = pd.read_csv("your_dataframe.csv")
# # For example, assuming your DataFrame is already loaded:
# df = test_df  # Replace `your_dataframe` with the actual variable name of your DataFrame

# # Tokenize the text data and convert it into input features
# def tokenize_function(text):
#     return tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")

# inputs = tokenize_function(df["review_body"].tolist())
# labels = torch.tensor(df["stars"].tolist()).to(device)

# inputs = {k: v.to(device) for k, v in inputs.items()}

# # Create a DataLoader
# batch_size = 32
# dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], labels)
# data_loader = DataLoader(dataset, batch_size=batch_size)

# # Set the model in evaluation mode (important if using dropout, batch normalization, etc.)
# model = model.to(device)
# model.eval()

# # Initialize variables to store predictions and true labels
# all_predictions = []
# all_true_labels = []




In [None]:
# # Evaluation loop
# with torch.no_grad():
#     for batch in tqdm(data_loader, desc="Evaluating", leave=False):
#         input_ids_batch, attention_mask_batch, labels_batch = batch

#         input_ids_batch, attention_mask_batch, labels_batch = input_ids_batch.to(device), attention_mask_batch.to(device), labels_batch.to(device)


#         logits = model(input_ids_batch, attention_mask=attention_mask_batch)[0]

#         # Convert logits to probabilities
#         probabilities = torch.softmax(logits, dim=1)

#         # Get the predicted class for each instance
#         predicted_labels = torch.argmax(probabilities, dim=1).tolist()

#         # Store predictions and true labels
#         all_predictions.extend(predicted_labels)
#         all_true_labels.extend(labels_batch.tolist())

# all_predictions = torch.tensor(all_predictions).cpu()
# all_true_labels = labels.cpu()

# accuracy = accuracy_score(all_true_labels, all_predictions)
# f1 = f1_score(all_true_labels, all_predictions, average='weighted')

# print("Accuracy:", accuracy)
# print("F1-score:", f1)

In [17]:
# def prepare_truncation(data_df, tokenizer, m, n):
#     for i, r in data_df.iterrows():
#         tokenized_row = tokenizer.tokenize(r['review_body'])
#         if len(tokenized_row) > m+n:
#             data_df.loc[i, 'review_body'] = tokenizer.convert_tokens_to_string(tokenized_row[:m] + tokenized_row[-n:])

#     return data_df
def prepare_truncation(data_df, tokenizer, m, n):
    """Shorten long reviews to their first `m` and last `n` tokens.

    Each 'review_body' value is tokenized with `tokenizer.tokenize`. When it
    has more than `m + n` tokens, it is replaced by the string rebuilt (via
    `tokenizer.convert_tokens_to_string`) from the first `m` tokens followed
    by the last `n` tokens. Shorter reviews are left unchanged.

    Differences from the earlier row-by-row version:
      * `n == 0` now keeps only the head. `tokens[-0:]` is the whole list, so
        the old code appended the complete review instead of nothing.
      * The column is rebuilt positionally and assigned once, so duplicate
        index labels can no longer make one row overwrite another.

    Args:
        data_df: DataFrame with a 'review_body' column; modified in place.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_string(tokens)`.
        m: Number of leading tokens to keep.
        n: Number of trailing tokens to keep.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _truncate(text):
        tokens = tokenizer.tokenize(text)
        if len(tokens) <= m + n:
            return text
        tail = tokens[-n:] if n > 0 else []
        return tokenizer.convert_tokens_to_string(tokens[:m] + tail)

    # tqdm wraps the column iteration to keep the progress bar.
    data_df['review_body'] = [
        _truncate(text)
        for text in tqdm(data_df['review_body'], total=len(data_df), desc="Processing reviews")
    ]

    return data_df

In [18]:
model_name = 'bert-base-uncased'
# `num_labels` configures a classification head, not a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
# Truncation budget passed to prepare_truncation: keep the first M and the last N tokens.
# Alternative budget: M = N = 64.
M, N = 382, 128

In [20]:
# Apply the head+tail truncation to every split, in train / test / validation order.
train_df, test_df, valid_df = (
    prepare_truncation(split_df, tokenizer, M, N)
    for split_df in (train_df, test_df, valid_df)
)

Processing reviews:   5%|▌         | 2102/40000 [00:00<00:05, 7092.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (786 > 512). Running this sequence through the model will result in indexing errors
Processing reviews: 100%|██████████| 40000/40000 [00:05<00:00, 7366.86it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:00<00:00, 7419.35it/s]
Processing reviews: 100%|██████████| 4000/4000 [00:00<00:00, 7438.55it/s]


In [21]:
# Wrap each pandas split as a `datasets.Dataset`.
train_ds, test_ds, valid_ds = map(Dataset.from_pandas, (train_df, test_df, valid_df))

# Collect the splits in one DatasetDict keyed by split name.
en_only_dataset = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds,
})

In [None]:
# train = Dataset.from_pandas(train_df)
# test = Dataset.from_pandas(test_df)
# valid = Dataset.from_pandas(valid_df)

In [22]:
# Preprocess function with labels
def preprocess_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: Batch mapping that provides 'review_body' (the texts) and
            'stars' (the labels).

    Returns:
        The tokenizer output for the texts (truncation enabled) with an extra
        'labels' entry copied from 'stars'.
    """
    encoded = tokenizer(examples["review_body"], truncation=True)
    encoded["labels"] = examples["stars"]
    return encoded

# Tokenize the datasets
tokenized_train = en_only_dataset["train"].map(preprocess_function, batched=True)
tokenized_validation = en_only_dataset["validation"].map(preprocess_function, batched=True)
tokenized_test = en_only_dataset["test"].map(preprocess_function, batched=True)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
# The 'stars' labels are binary (0 or 1) after replace_mapping, so the
# classification head needs two outputs, not five.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load metrics outside the compute_metrics function
load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")

# Compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        Dict with the keys 'accuracy' and 'f1'.
    """
    logits, labels = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted = np.argmax(logits, axis=-1)

    accuracy_result = load_accuracy.compute(predictions=predicted, references=labels)
    f1_result = load_f1.compute(predictions=predicted, references=labels, average="weighted")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Training arguments
# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     optim="adamw_torch",
#     warmup_steps=10000,
#     num_train_epochs=1,
#     weight_decay=1e-4,
#     evaluation_strategy="epoch",
#     save_strategy="no",
# )
training_args = TrainingArguments(
    output_dir="./results",
    # Warm up over the first 10% of the optimizer steps. The earlier fixed
    # warmup_steps=10000 equalled the whole run (the training progress bar
    # reports 10000 total steps), so the learning rate was still ramping up
    # when training ended.
    warmup_ratio=0.1,
    optim="adamw_torch",
    num_train_epochs=2,
    weight_decay=1e-4,
    save_strategy=IntervalStrategy.STEPS,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    # Keep checkpoints aligned with evaluations so load_best_model_at_end can
    # pick a checkpoint that has a matching evaluation score.
    save_steps=500,
    logging_strategy=IntervalStrategy.STEPS,
    push_to_hub=False,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    save_total_limit=5,
)

# Trainer
# Collect the trainer inputs first, then build the Trainer from them.
_fit_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,  # validation split is scored during training
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**_fit_kwargs)

In [24]:
# Run fine-tuning with the settings held in `training_args`.
trainer.train()

  0%|          | 0/10000 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Score the model on the trainer's eval_dataset (the tokenized validation split).
trainer.evaluate()

In [None]:
# Save the trainer's model under a name describing this run.
file_name = '2-classes-92acc-128tokens'
trainer.save_model(file_name)

In [None]:
# Load the classifier back from the same path that trainer.save_model wrote to.
file_name = '2-classes-92acc-128tokens'
model = AutoModelForSequenceClassification.from_pretrained(file_name)

In [None]:
# Trainer
# Rebuild the Trainer around the reloaded model, with the test split as eval_dataset.
_test_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**_test_kwargs)

In [None]:
# Score the reloaded model on the trainer's eval_dataset (the tokenized test split).
trainer.evaluate()