In [1]:
#!pip install transformers datasets torch scikit-learn accelerate>=0.26.0

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Load and Prepare Your Dataset

In [3]:
file_path = r"Z:\BD\Novumgen\Data\Business Analysis\Backup\Nisarg\Learning\DataBase\Sentiment Analysis Dataset\Reviews.csv"
df = pd.read_csv(file_path)

In [4]:
df.sample(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
464698,464699,B000UBD88A,A3RIXEWG42PY46,M. Cirrito,0,0,4,1241395200,Delicious Coffee,I love Senseo's coffee pods. They make an exc...
321007,321008,B000XB2E1E,A3POMB9ZWDEA9H,N. Thomas,0,0,5,1336262400,Pamela's just doesn't make a bad product,I am a sworn from-scratch baker. With the poss...
528325,528326,B001DBJK4C,A3LUXJ38SUF422,Kyle Bowles,4,4,3,1267660800,Very Impressed (Edited 2011-05-07),Since I started a gluten free diet a few month...
246912,246913,B0029NIGMA,A1M3QODN76DHIJ,ron,0,0,5,1349913600,Midnight,Midnight loves gravy and this is about the onl...
26305,26306,B001EQ4DAM,A2M80SE2YL2LQ3,Karen E. Schwartz,1,1,5,1293494400,Good Value,I had difficulty finding hazelnuts before the ...


In [5]:
df['Score'].value_counts(ascending=False)

Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64

In [6]:
def label_sentiment(rating):
    if rating <= 2:
        return 'Negative'
    elif rating >= 4:
        return 'Positive'
    else:
        return 'Neutral'

In [7]:
df['Sentiment'] = df['Score'].apply(label_sentiment)

In [8]:
df.sample(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment
325305,325306,B0033GMSTY,A20WUW45LIX63G,"Myron J. Rutsky ""Mabsy""",1,1,5,1303776000,Great morning cup of coffee,I always liked this Donut House coffee in the ...,Positive
63673,63674,B002IEZJMA,ABMX8XUNPR3LP,Jennifer Sicurella,0,1,2,1332806400,Not for casual coffee drinkers,Perhaps it's just something about the area I l...,Negative
520197,520198,B000F6SWA4,AFEK35INET1GW,Onmyown,1,1,4,1165881600,NOT what I expected but VERY GOOD!!,I drink A LOT of tea...99% I drink unsweetened...,Positive
390810,390811,B00139ZPKM,A3FJ0YTQ4TR4QL,"Rebecca K-B ""RAK-B""",0,0,5,1349654400,Quality pet food,We were looking for an organic food after the ...,Positive
511606,511607,B003R0LKUE,ANNHLLI73NW4B,Brian_in_Tulsa,1,1,2,1339891200,All but 2 cans in the case were severely damaged,I purchase most of my dog food from Amazon and...,Negative


In [9]:
df['Sentiment'].value_counts()

Sentiment
Positive    443777
Negative     82037
Neutral      42640
Name: count, dtype: int64

In [10]:
data =df[['Text','Sentiment']]

In [11]:
data.rename(columns={'Text':'review_text'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'Text':'review_text'},inplace=True)


In [12]:
sentiment_map = {
    'Positive' : 1,
    'Negative' : 0,
    'Neutral' : 2
}

data['Sentiment'] = data['Sentiment'].map(sentiment_map).astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sentiment'] = data['Sentiment'].map(sentiment_map).astype('int32')


In [13]:
data.info(), data['Sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_text  568454 non-null  object
 1   Sentiment    568454 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 6.5+ MB


(None,
 Sentiment
 1    443777
 0     82037
 2     42640
 Name: count, dtype: int64)

## Sampling Data

In [14]:
from sklearn.utils import resample

# Keep class balance proportional
sample_size = 3000  # adjust between 1000–5000 depending on your CPU
data_sampled = data.groupby('Sentiment', group_keys=False).apply(
    lambda x: x.sample(frac=sample_size / len(data), random_state=42)
)

  data_sampled = data.groupby('Sentiment', group_keys=False).apply(


In [15]:
data_sampled.info(), data_sampled['Sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 525327 to 183351
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  3000 non-null   object
 1   Sentiment    3000 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 58.6+ KB


(None,
 Sentiment
 1    2342
 0     433
 2     225
 Name: count, dtype: int64)

## Spliting Dataset

In [16]:
train_df, test_df = train_test_split(data_sampled, test_size=0.2, random_state=42)

In [17]:
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2400 entries, 465607 to 468135
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  2400 non-null   object
 1   Sentiment    2400 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 46.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 600 entries, 538116 to 72871
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  600 non-null    object
 1   Sentiment    600 non-null    int32 
dtypes: int32(1), object(1)
memory usage: 11.7+ KB


(None, None)

## Converting into HuggingFace Dataset

### Que: Why we have to convert into HuggingFace Dataset format?
#### Ans: Work seamlessly with Hugging Face’s tokenizers and data collators, support tokenization and batched mapping, allow fast loading, shuffling, and filtering — even with large datasets, optimized for PyTorch and TensorFlow training.

In [18]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [19]:
train_dataset, test_dataset

(Dataset({
     features: ['review_text', 'Sentiment', '__index_level_0__'],
     num_rows: 2400
 }),
 Dataset({
     features: ['review_text', 'Sentiment', '__index_level_0__'],
     num_rows: 600
 }))

In [20]:
train_dataset['review_text'][0]

"I'm a ramyun addict and I have always loved ramyun since I was little.  It is the first thing I crave when I'm hungry and it's the only thing I can keep down when I'm hung-over. Ramyun has been and will always be my comfort food although I consider them very unhealthy. When I first spotted the Black Shin Ramyun on Amazon, I thought it was too expensive and didn't want to buy an entire box especially while I've been trying so hard to stay away from ramyun products to reduce my sodium and other toxic intakes. After many fruitless searches at nearby Asian markets to try a few packets, I ended up getting a case from Amazon after several months of hesitation; what can I say...I have to try all ramyun that I notice. Upon trying, I must say that I'm not disappointed at all and this is my new favorite ramen for the moment. The broth is thicker (from the beef bone extract powder) and has a bit of miso flavor very similar to the restaurant quality Japanese (non-instant) miso ramyun. It is defin

In [21]:
train_dataset['Sentiment'][0]

1

In [22]:
train_dataset['__index_level_0__'][0]

465607

In [23]:
data['review_text'][465607]

"I'm a ramyun addict and I have always loved ramyun since I was little.  It is the first thing I crave when I'm hungry and it's the only thing I can keep down when I'm hung-over. Ramyun has been and will always be my comfort food although I consider them very unhealthy. When I first spotted the Black Shin Ramyun on Amazon, I thought it was too expensive and didn't want to buy an entire box especially while I've been trying so hard to stay away from ramyun products to reduce my sodium and other toxic intakes. After many fruitless searches at nearby Asian markets to try a few packets, I ended up getting a case from Amazon after several months of hesitation; what can I say...I have to try all ramyun that I notice. Upon trying, I must say that I'm not disappointed at all and this is my new favorite ramen for the moment. The broth is thicker (from the beef bone extract powder) and has a bit of miso flavor very similar to the restaurant quality Japanese (non-instant) miso ramyun. It is defin

## Tokenize Text

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    tokenized = tokenizer(
        example['review_text'],
        truncation=True,
        padding = 'max_length',
        max_length=128
    )
    tokenized['labels'] = example['Sentiment']
    return tokenized

### Explanation:
#### Loads the pre-trained BERT tokenizer for 'bert-base-uncased'. “uncased” = it lowercases all text. Creates a tokenizer object you can use to turn text → token IDs.
#### example\['review_text'] → This pulls the text from your dataset — one batch of examples at a time.
#### truncation=True → If the text is longer than 128 tokens, cut it off (so model input size stays fixed).
#### padding = 'max_length' → While truncation cuts sequences that are too long, padding adds zeros to sequences that are too short to make them all the same length.
#### max_length=128 → Each sequence is exactly 128 tokens long (BERT’s max is 512, but 128 is faster and usually fine for sentiment tasks).
#### tokenized\['labels'] = example\['Sentiment'] → Adds the sentiment value (0, 1, 2) as a new field named 'labels'. Hugging Face’s Trainer automatically looks for this 'labels' key during training — so this step links your target variable to each example.

### Apply tokenization to the entire dataset

In [25]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████| 2400/2400 [00:04<00:00, 594.39 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 600/600 [00:01<00:00, 597.58 examples/s]


### Explanation:
#### .map() applies your function to every example in the dataset.
#### batched=True means it processes multiple rows at once (much faster than looping one by one).
#### The resulting datasets now contain numeric tensors instead of raw text.

In [26]:
train_dataset.column_names

['review_text',
 'Sentiment',
 '__index_level_0__',
 'input_ids',
 'token_type_ids',
 'attention_mask',
 'labels']

### Initially we had only 2 cols: review_text & Sentiment
### then __index_level_0__ added as we have converted database into huggingface Dataset objects
### As we applied tokenization on this we got 4 extra columns
#### 1) input_ids : Token IDs of the review text
#### 2) token_type_ids : Tell BERT which token belongs to which sentence
#### 3) attention_mask : Tells the model which tokens are real text and which tokens are padding. BERT expects input tensors of the same length (e.g., max_length=128), so shorter sentences are padded with zeros. Without attention_mask, BERT would treat padding tokens as real input, which can hurt performance.
#### 4) labels : Target sentiment label
### we keep only last 4 columns rest are unnecessary for training

In [27]:
columns_to_remove = ['review_text', 'Sentiment', '__index_level_0__']
train_dataset = train_dataset.remove_columns(columns_to_remove)
test_dataset = test_dataset.remove_columns(columns_to_remove)

In [28]:
train_dataset.column_names, test_dataset.column_names

(['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
 ['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

## Load Pretrained BERT Model

In [29]:
num_labels = len(data['Sentiment'].unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Explanation:
#### num_labels → This ensures the output layer of BERT matches your number of classes.
#### BertForSequenceClassification is a BERT model with an added classification head: Pretrained BERT encoder → \[CLS] token embedding → Linear layer → Output logits
#### from_pretrained('bert-base-uncased') loads the pretrained weights from BERT’s base model.

## Define Training Arguments

In [30]:
training_args = TrainingArguments(
    output_dir='./result',
    eval_strategy='steps',
    eval_steps= 100,
    save_strategy='no',  # Disable saving
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='steps',  # log every X steps
    logging_steps=50,          # log training loss every 50 steps
    save_total_limit=0,  # Don't save any checkpoints
    no_cuda=True,  # Force CPU if GPU issues exist
)



### Explanation:
#### 1) output_dir='./result' : Where the model, tokenizer, and training logs will be saved after training. Even if you disable saving (save_strategy='no'), some temporary outputs or metrics may still be stored here.
#### 2) eval_strategy='epoch'Determines how often evaluation runs during training.'epoch' → evaluate once after each full pass through the training dataset. Other options: 'steps' (evaluate every N steps) or 'no' (never evaluate).
#### 3) eval_steps= 100 → if eval_strategy='steps' then this can b use. Steps per epoch = 3000/8 =375 As evalution is being done after every 100 step here 3-4 times evaluation will be done.
#### 5) learning_rate=2e-5 Too high → may diverge; too low → may converge slowly. 2e-5 is a standard starting point for fine-tuning BERT.
#### 6) per_device_train_batch_size=8 → Batch size per device (CPU/GPU) during training.
#### 7) per_device_eval_batch_size=8 → Batch size per device during evaluation.
#### 8) num_train_epochs=1 → Number of passes over the entire training dataset.
#### 9) weight_decay=0.01 → L2 regularization applied to model weights. Helps prevent overfitting, especially for small datasets.
#### 10) logging_dir='./logs' → Directory where TensorBoard logs are saved. You can visualize training metrics with TensorBoard.
#### 11) save_total_limit=0 → Limits the number of saved checkpoints. 0 → don’t save any, consistent with save_strategy='no'.
#### 12) no_cuda=True → Forces CPU training, even if a GPU is available.

In [31]:
print(torch.cuda.device_count())
print(torch.cuda.is_available())

0
False


## Define Trainer

In [32]:
trainer = Trainer(
    model= model,
    args= training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

### Explanation:
#### data_collator=DataCollatorWithPadding(tokenizer=tokenizer): In tokenize_function we use padding of 128 no matter how small the token sequences are, which lead to unnecessary padding. With Data Collator Dynamic Padding is possible. DataCollatorWithPadding is a built-in collator in Hugging Face that automatically pads sequences in a batch to the length of the longest sequence in that batch.
#### tokenizer=tokenizer : This will tell DataCollatorWithPadding which token is used for padding. Also tells Padding side: Knows whether to pad on the left or right.

## Train the Model

In [33]:
trainer.train()

Step,Training Loss,Validation Loss
100,0.6042,0.544574
200,0.4469,0.402198
300,0.4006,0.410632


TrainOutput(global_step=300, training_loss=0.5201594034830729, metrics={'train_runtime': 1500.856, 'train_samples_per_second': 1.599, 'train_steps_per_second': 0.2, 'total_flos': 157868050636800.0, 'train_loss': 0.5201594034830729, 'epoch': 1.0})

## Predict on New Data

In [38]:
test_texts = [
    "I didn't absolutely love this!",
    " Worthy the price."
]

In [39]:
inputs = tokenizer(test_texts,return_tensors='pt',truncation=True,padding=True,max_length=128)
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

print(predictions)

tensor([0, 1])


#### Pedictions seem to be true, but lets evaluate this model

## Evaluation of the Model