<a href="https://colab.research.google.com/github/mmilannaik/BigOCheatSheet/blob/master/NLP_4_BERT_Grocery_Quantzig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets seqeval -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


# Importing Libraries

In [20]:
import transformers
print(transformers.__version__)

4.52.4


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

## 📂 Load Data

In [6]:
from datasets import load_dataset
dataset = load_dataset("mteb/amazon_reviews_multi", "en")


NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.

In [7]:
# Step 1: Load separate Kaggle CSV files
df_train = pd.read_csv("/content/train.csv")
df_valid = pd.read_csv("/content/validation.csv")
df_test = pd.read_csv("/content/test.csv")


In [9]:
# Step 2: Concatenate splits and filter to CPG-relevant categories
categories = ['grocery', 'beauty', 'apparel', 'kitchen', 'home']
df_all = pd.concat([df_train, df_valid, df_test], axis=0)
df_all = df_all[df_all['product_category'].isin(categories)]

In [8]:
df_train.head(2)

Unnamed: 0.1,Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,0,de_0203609,product_de_0865382,reviewer_de_0267719,1,Armband ist leider nach 1 Jahr kaputt gegangen,Leider nach 1 Jahr kaputt,de,sports
1,1,de_0559494,product_de_0678997,reviewer_de_0783625,1,In der Lieferung war nur Ein Akku!,EINS statt ZWEI Akkus!!!,de,home_improvement


In [10]:
df_all = df_all[df_all['language']=='en']

In [11]:
df_all.head(2)

Unnamed: 0.1,Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
200002,200002,en_0311558,product_en_0399702,reviewer_en_0152034,1,I received my first order of this product and ...,The product is junk.,en,home
200006,200006,en_0206383,product_en_0041998,reviewer_en_0005698,1,Ordered 2 they shipped 1 promised by certain d...,Not reliable ☹️,en,home


In [12]:
# Step 3: Label mapping from star rating
def map_sentiment(row):
    if row <= 2:
        return 0  # Negative
    elif row == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

df_all['label'] = df_all['stars'].apply(map_sentiment)

In [13]:
from sklearn.model_selection import train_test_split
train_df,test_df = train_test_split(df_all,test_size =0.2,stratify = df_all['label'],random_state = 42)


In [14]:
train_ds = Dataset.from_pandas(train_df[['review_body','label']].reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df[['review_body','label']].reset_index(drop=True))

## ✂️ Tokenization & Label Alignment

In [15]:
# Step 5: Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)


# Step 6: Tokenization
def tokenize_function(example):
    return tokenizer(example["review_body"], truncation=True)

train_ds = train_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/51071 [00:00<?, ? examples/s]

Map:   0%|          | 0/12768 [00:00<?, ? examples/s]

In [16]:
# Step 7: Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average='weighted'),
        "recall": recall_score(labels, preds, average='weighted'),
        "f1": f1_score(labels, preds, average='weighted')
    }

In [17]:
train_ds.shape

(51071, 5)

In [24]:
# Step 8: Trainer setup
args = TrainingArguments(
    output_dir="bert-cpg-sentiment",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    logging_dir="./logs",
    report_to="none",
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [25]:
# Step 9: Train
import os
os.environ["WANDB_DISABLED"] = "true"
trainer.train()

# Step 10: Evaluate
trainer.evaluate()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6051,0.568244,0.75979,0.750248,0.75979,0.754155
2,0.4886,0.572816,0.758537,0.770633,0.758537,0.763303


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6051,0.568244,0.75979,0.750248,0.75979,0.754155
2,0.4886,0.572816,0.758537,0.770633,0.758537,0.763303
3,0.3161,0.70068,0.757284,0.759634,0.757284,0.75823


{'eval_loss': 0.7006798386573792,
 'eval_accuracy': 0.7572838345864662,
 'eval_precision': 0.7596339007825523,
 'eval_recall': 0.7572838345864662,
 'eval_f1': 0.7582295316170675,
 'eval_runtime': 97.3065,
 'eval_samples_per_second': 131.214,
 'eval_steps_per_second': 8.201,
 'epoch': 3.0}