In [None]:
# Install the third-party packages this notebook imports (run once per environment).
!pip install transformers datasets torch scikit-learn

In [None]:
#import Required packages
import pandas as pd
import matplotlib.pyplot as plt
import re
from datasets import Dataset
from transformers import BertTokenizer,BertForSequenceClassification,Trainer, TrainingArguments,BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
from transformers.trainer_callback import EarlyStoppingCallback
import torch
import logging

In [None]:
# Set up logging: messages go to the notebook output and to training.log.
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('training.log')
    ],
    # Without force=True, basicConfig() does nothing when the root logger
    # already has handlers (for example when the hosting environment or a
    # previous run of this cell configured logging), and training.log would
    # never be written. force=True removes the existing root handlers first.
    force=True,
)
logger = logging.getLogger(__name__)

In [None]:
# Load the product-review data from the CSV file into a DataFrame
df = pd.read_csv("product_review.csv")

In [None]:
df  # display the loaded DataFrame

In [None]:
df.head(n=5)  # first five rows

In [None]:
df.tail(n=5)  # last five rows

In [None]:
df.info()  # print a summary of the DataFrame (columns, non-null counts, dtypes)

In [None]:
df.isna().sum()  # number of missing values in each column

In [None]:
df.duplicated().sum()  # count rows that duplicate an earlier row (this only counts them; nothing is removed)

In [None]:
df['label'].value_counts()  # number of rows for each distinct label value

In [None]:
# Bar chart showing how many rows carry each label
data = df['label'].value_counts()
ax = data.plot.bar()
ax.bar_label(ax.containers[0])  # write the count on top of every bar
ax.set_title("Distribution Of Labels")
plt.show()

In [None]:
df.loc[0, 'text']  # raw text of the row labelled 0, before any cleaning

In [None]:
# Text preprocessing applied to every review before tokenization
def clean_text(text):
    """Normalize one review text.

    Steps, in order: replace HTML tags with a space, collapse every run of
    whitespace to a single space, strip leading/trailing whitespace, and
    lowercase the result.

    Args:
        text: The raw review. Missing values (``None`` or a float NaN, which
            is how pandas represents empty CSV cells) and non-string values
            are accepted.

    Returns:
        The cleaned string. Missing values yield ``''``; other non-string
        values are converted with ``str()`` before cleaning.
    """
    # re.sub() raises TypeError on non-strings, so handle those up front.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags

    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

    return text.lower()


df['text'] = df['text'].map(clean_text)  # clean every review in place

In [None]:
df.loc[5555, 'text']  # spot-check one cleaned review

In [None]:
# Labels are expected to be the strings '__label__1' and '__label__2'
def convert_label(label):
    """Map a raw dataset label to its integer class id.

    Mapping: ``'__label__1'`` -> 1 (churn/negative) and ``'__label__2'`` -> 0
    (not churn/positive).

    The function is idempotent: a value that is already 0 or 1 is returned
    unchanged, so re-running the conversion cell does not turn every 1 into 0.

    Args:
        label: A raw label string (surrounding whitespace is ignored) or an
            already converted class id (0 or 1).

    Returns:
        int: 1 for negative, 0 for positive.

    Raises:
        ValueError: If the value is neither a known label string nor 0/1.
            Failing loudly avoids silently labelling unknown rows as positive.
    """
    if isinstance(label, str):
        key = label.strip()
        if key == '__label__1':
            return 1
        if key == '__label__2':
            return 0
    elif label in (0, 1):
        # Already converted (e.g. the cell was executed a second time).
        return int(label)
    raise ValueError(f"Unexpected label value: {label!r}")

In [None]:
df['label'] = df['label'].map(convert_label)  # replace raw labels with integer class ids

In [None]:
df  # display the DataFrame after text cleaning and label conversion

In [None]:
# Train/test split (80/20). Stratifying on the label keeps the class
# proportions the same in both splits, so test metrics are not skewed by an
# unlucky draw on an imbalanced dataset.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
train_df.shape  # (rows, columns) of the training split

In [None]:
test_df.shape  # (rows, columns) of the test split

In [None]:
# Tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every text to 128 tokens."""
    texts = examples['text']
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)


# Wrap each pandas split as a Hugging Face Dataset and tokenize it in batches
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Return only the model inputs and the label, formatted for PyTorch
model_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=model_columns)

In [None]:
# Load BERT for sequence classification with custom dropout and named classes.
# Class ids follow convert_label: 0 = positive review, 1 = negative review.
# Storing the names in the config makes the saved model (and any pipeline that
# loads it) report 'positive'/'negative' instead of 'LABEL_0'/'LABEL_1'.
id2label = {0: 'positive', 1: 'negative'}
label2id = {name: idx for idx, name in id2label.items()}

# The task is binary by construction, so size the head from the class mapping
# and fail early if the data contains anything other than the ids 0 and 1.
num_labels = len(id2label)
unexpected_labels = set(df['label'].unique()) - set(id2label)
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values {sorted(unexpected_labels, key=str)}; "
        "run the label conversion cell before loading the model."
    )

config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.15,
    attention_probs_dropout_prob=0.15
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

In [None]:
# Metrics callback used by the Trainer at every evaluation
def compute_metrics(pred):
    """Compute overall and per-class classification metrics.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict: Accuracy, weighted precision/recall/F1, and precision/recall/F1
        for the negative class (``neg_*``) and the positive class (``pos_*``).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    # Class ids follow convert_label: 0 = positive, 1 = negative. target_names
    # are matched positionally to `labels`, so the names must be listed in
    # that order; otherwise the neg_*/pos_* metrics are swapped. Passing
    # labels=[0, 1] also keeps both rows in the report when one class is
    # missing from the evaluation data.
    report = classification_report(
        labels,
        preds,
        labels=[0, 1],
        target_names=['positive', 'negative'],
        output_dict=True,
        zero_division=0,
    )
    logger.info(f"Classification Report:\n{report}")  # Log report
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'neg_precision': report['negative']['precision'],
        'neg_recall': report['negative']['recall'],
        'neg_f1': report['negative']['f1-score'],
        'pos_precision': report['positive']['precision'],
        'pos_recall': report['positive']['recall'],
        'pos_f1': report['positive']['f1-score']
    }

In [None]:
# Training configuration for the Hugging Face Trainer, grouped by concern
training_kwargs = dict(
    # output locations and logging frequency
    output_dir='./results5.0',
    logging_dir='./logs',
    logging_steps=10,
    # run length and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimization schedule
    learning_rate=1e-5,
    lr_scheduler_type='cosine',
    warmup_steps=500,
    weight_decay=0.025,
    # evaluate and checkpoint once per epoch; keep the most accurate checkpoint
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    # reproducibility
    seed=42,
)
training_args = TrainingArguments(**training_kwargs)

In [None]:
# Early stopping: stop after 2 evaluations without an improvement above the threshold
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.00001,
)

# Trainer wiring the model, data splits, metrics and callback together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Fine-tune the model using the trainer configured above
trainer.train()
# Log the trainer's final global step count
logger.info(f"Training completed with {trainer.state.global_step} steps")

In [None]:
!pip install datasets numpy --upgrade # Troubleshooting only: run this cell if training fails with a NumPy error, then restart the kernel and re-run the notebook.

In [None]:
# Run a final evaluation with the trainer (it was constructed with eval_dataset=test_dataset)
eval_results = trainer.evaluate()
# Record the metrics in the log and show them in the cell output
logger.info(f"Final Evaluation Results: {eval_results}")
print("Evaluation results:", eval_results)

In [None]:
# Visualize the training and validation loss recorded by the Trainer
def _losses_by_epoch(log_history):
    """Extract one (epoch, training loss, validation loss) triple per evaluation.

    Args:
        log_history: List of dicts as found in ``trainer.state.log_history``.
            Entries with a ``'loss'`` key are treated as training logs and
            entries with an ``'eval_loss'`` key as evaluation logs.

    Returns:
        tuple: ``(epochs, training_loss, validation_loss)`` lists of equal
        length. The training loss paired with an evaluation is the most
        recently logged training loss at that point (NaN if none was logged
        yet). Only the first evaluation per epoch is kept, so an extra
        ``trainer.evaluate()`` call after training does not add a duplicate
        point.
    """
    epochs, training_loss, validation_loss = [], [], []
    last_train_loss = float('nan')
    for entry in log_history:
        if 'loss' in entry:
            last_train_loss = entry['loss']
        elif 'eval_loss' in entry and entry.get('epoch') not in epochs:
            epochs.append(entry.get('epoch'))
            training_loss.append(last_train_loss)
            validation_loss.append(entry['eval_loss'])
    return epochs, training_loss, validation_loss


# Read the values from the actual run instead of hard-coding numbers copied
# from an earlier execution, so the figure always matches this training run.
epochs, training_loss, validation_loss = _losses_by_epoch(trainer.state.log_history)
if not epochs:
    raise RuntimeError("No evaluation logs found; run trainer.train() before plotting.")

# Create the line plot
plt.figure(figsize=(8, 4))
plt.plot(epochs, training_loss, label='Training Loss', color='#4BC0C0', marker='o')  # Cyan
plt.plot(epochs, validation_loss, label='Validation Loss', color='#FF6384', marker='s')  # Red

# Customize the plot
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training vs Validation Loss')
plt.legend()
plt.grid(True)
# Start y-axis at 0; `v == v` filters out NaN placeholders before taking the max
finite_losses = [v for v in training_loss + validation_loss if v == v]
plt.ylim(0, max(finite_losses) * 1.2)

# Save and show the plot
plt.savefig('training_validation_loss.png')
plt.show()

In [None]:
# Persist the fine-tuned model and its tokenizer in the same directory
save_dir = './bert_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Inference with the saved model
from transformers import pipeline

classifier = pipeline('text-classification', model='./bert_model', tokenizer='./bert_model')
text = "I was a bit skeptical at first, but this product turned out to be amazing. It works exactly as described and the quality is top-notch. Totally worth the money!"
# Feed the model the same kind of input it was trained on: the text is cleaned
# with clean_text and truncated to the 128-token limit used during training.
# Truncation also keeps long reviews within the model's maximum input length.
prediction = classifier(clean_text(text), truncation=True, max_length=128)
print(f"Text: {text}\nPrediction: {prediction}")