In [None]:
#required package
!pip install transformers datasets torch scikit-learn

In [None]:
#import Required packages
import pandas as pd
import matplotlib.pyplot as plt
import re
from datasets import Dataset
from transformers import BertTokenizer,BertForSequenceClassification,Trainer, TrainingArguments,BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers.trainer_callback import EarlyStoppingCallback
import torch
import logging

In [None]:
# Load the review dataset from a CSV file in the working directory.
# The later cells read its 'text' and 'label' columns.
df=pd.read_csv("amazon_review.csv")

In [None]:
df  # display the loaded DataFrame

In [None]:
df.head()  # preview the first 5 rows

In [None]:
df.tail()  # preview the last 5 rows

In [None]:
df.info()  # summary of the columns, their dtypes and non-null counts

In [None]:
df.isnull().sum()  # number of missing values in each column

In [None]:
df.duplicated().sum()  # number of duplicated rows (this only counts them; nothing is removed)

In [None]:
df['label'].value_counts()  # number of rows for each label value

In [None]:
# Bar chart of the number of rows per label, with the count written on each bar.
data = df['label'].value_counts()
ax = data.plot.bar()
ax.bar_label(ax.containers[0])
ax.set_title("Distribution Of Labels")
plt.show()

In [None]:
df['text'][0]  # raw review text stored at index label 0

In [None]:
#Apply Preprocess Step On That
def clean_text(text):
    """Normalize one review for tokenization.

    Replaces HTML tags with a space, collapses every run of whitespace into a
    single space, trims both ends and lowercases the result.

    Args:
        text: The raw review. A missing value (``None`` or a float NaN) is
            returned as an empty string; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned, lowercased string.
    """
    if not isinstance(text, str):
        # `re.sub` raises TypeError on non-strings, so missing cells must be
        # handled up front. `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags

    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

    return text.lower()


# Clean every review in place; clean_text is applied element by element.
df['text'] = df['text'].map(clean_text)

In [None]:
df['text'][5555]  # spot check: cleaned review text at index label 5555

In [None]:
# Labels are expected to be the strings __label__1 and __label__2
def convert_label(label):
    """Map a raw dataset label to the integer class id used for training.

    ``'__label__1'`` becomes 1 (churn/negative) and ``'__label__2'`` becomes 0
    (not churn/positive). Values that are already 0 or 1 are passed through,
    so applying the conversion a second time does not change the labels.

    Args:
        label: A raw label string (surrounding whitespace is ignored) or an
            already converted 0/1 value.

    Returns:
        1 for the churn/negative class, 0 for the not churn/positive class.

    Raises:
        ValueError: If ``label`` is neither a known label string nor 0/1.
    """
    if isinstance(label, str):
        stripped = label.strip()
        if stripped == '__label__1':
            return 1  # 1: churn/negative
        if stripped == '__label__2':
            return 0  # 0: not churn/positive
    elif label in (0, 1):
        # Already converted (e.g. the conversion cell was run twice).
        return int(label)
    raise ValueError(f"Unexpected label value: {label!r}")

In [None]:
# Replace the raw label values with their integer class ids.
df['label'] = df['label'].map(convert_label)

In [None]:
df  # display the frame after text cleaning and label conversion

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
train_df.shape  # (rows, columns) of the training split

In [None]:
test_df.shape  # (rows, columns) of the held-out split

In [None]:
# Wrap both pandas splits as Hugging Face Dataset objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#tokenize function
def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: A batch whose ``'text'`` entry holds the review strings.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        128 tokens per review.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        truncation=True,
        padding='max_length',
    )

# Tokenize both splits in batches (training split first, then the held-out one).
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the label, as PyTorch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Load BERT with a classification head sized to the number of distinct labels
# and with both dropout probabilities set to 0.15.
num_labels = df['label'].nunique(dropna=False)
config = BertConfig.from_pretrained(
    'bert-base-uncased',
    attention_probs_dropout_prob=0.15,
    hidden_dropout_prob=0.15,
    num_labels=num_labels,
)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    config=config,
)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
# Define the training arguments.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results5.0',
    logging_dir='./logs',
    logging_steps=10,
    # Run length and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer and learning-rate schedule
    learning_rate=1e-5,
    lr_scheduler_type='cosine',
    warmup_steps=500,
    weight_decay=0.025,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest accuracy when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    # Reproducibility
    seed=42,
)

In [None]:
# Build the Trainer from the model, data splits, metric function and an
# early-stopping callback.
# NOTE(review): test_dataset is also the evaluation set used to pick the best
# checkpoint, so the reported scores are not from untouched data — consider a
# separate validation split.
# NOTE(review): with num_train_epochs=2 and per-epoch evaluation there are only
# two evaluations, so a patience of 2 presumably never stops training early —
# confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.00001),
    ],
)

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
!pip install datasets numpy --upgrade #sometimes after run this train give numpy error then run this cell.

In [None]:
# Evaluate the trained model on the Trainer's eval_dataset and print the
# returned metrics.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

In [None]:
import matplotlib.pyplot as plt


def _metrics_per_epoch(log_history):
    """Collapse the Trainer's log records into one metrics row per evaluated epoch.

    Args:
        log_history: List of dicts as found in ``trainer.state.log_history``.
            Records with a ``'loss'`` key are treated as training logs and
            records with an ``'eval_loss'`` key as evaluation logs.

    Returns:
        A list of dicts in chronological order, each with the keys ``epoch``,
        ``train_loss``, ``val_loss``, ``accuracy``, ``precision``, ``recall``
        and ``f1``. ``train_loss`` is the most recent training loss logged
        before that evaluation (NaN if none was logged); a metric missing
        from the evaluation record is NaN as well.
    """
    missing = float('nan')
    rows = {}
    latest_train_loss = missing
    for record in log_history:
        if 'loss' in record:
            latest_train_loss = record['loss']
        elif 'eval_loss' in record:
            epoch = record.get('epoch')
            # A trainer.evaluate() call after training logs the final epoch a
            # second time; keep the record produced during training.
            if epoch in rows:
                continue
            rows[epoch] = {
                'epoch': epoch,
                'train_loss': latest_train_loss,
                'val_loss': record['eval_loss'],
                'accuracy': record.get('eval_accuracy', missing),
                'precision': record.get('eval_precision', missing),
                'recall': record.get('eval_recall', missing),
                'f1': record.get('eval_f1', missing),
            }
    return list(rows.values())


# Read the curves from the Trainer's own log instead of numbers copied by hand
# from a previous run, so the figure always matches the current training run.
history = _metrics_per_epoch(trainer.state.log_history)
if not history:
    raise RuntimeError(
        "No evaluation records found in trainer.state.log_history; "
        "run trainer.train() before plotting."
    )

epochs = [row['epoch'] for row in history]
train_loss = [row['train_loss'] for row in history]
val_loss = [row['val_loss'] for row in history]
accuracy = [row['accuracy'] for row in history]
precision = [row['precision'] for row in history]
recall = [row['recall'] for row in history]
f1 = [row['f1'] for row in history]

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

# Plot training and validation loss
ax1.plot(epochs, train_loss, label='Training Loss', marker='o', color='blue')
ax1.plot(epochs, val_loss, label='Validation Loss', marker='o', color='red')
ax1.set_title('Training and Validation Loss per Epoch')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_xticks(epochs)  # one tick per evaluated epoch
ax1.legend()
ax1.grid(True)

# Plot accuracy, precision, recall, and F1 score
ax2.plot(epochs, accuracy, label='Accuracy', marker='o', color='green')
ax2.plot(epochs, precision, label='Precision', marker='o', color='purple')
ax2.plot(epochs, recall, label='Recall', marker='o', color='orange')
ax2.plot(epochs, f1, label='F1 Score', marker='o', color='brown')
ax2.set_title('Performance Metrics per Epoch')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Score')
ax2.set_xticks(epochs)
ax2.legend()
ax2.grid(True)

# Adjust layout to prevent overlap
plt.tight_layout()

# Save the plot
plt.savefig('model_metrics.png')
plt.show()


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so they
# can be loaded together later.
model.save_pretrained(save_directory='./bert_model')
tokenizer.save_pretrained(save_directory='./bert_model')

In [None]:
# Inference: load the saved model and tokenizer and classify one sample review.
from transformers import pipeline

classifier = pipeline(
    task='text-classification',
    model='./bert_model',
    tokenizer='./bert_model',
)
text = "I was a bit skeptical at first, but this product turned out to be amazing. It works exactly as described and the quality is top-notch. Totally worth the money!"
prediction = classifier(text)
print(f"Text: {text}\nPrediction: {prediction}")