In [None]:
#Text Classification by Fine-tuning Language Model

# Install simpletransformers package
!pip install simpletransformers

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset (replace with your dataset path)
df = pd.read_csv('nlp_dataset_final.csv')

# Exploratory Data Analysis (EDA)
print(df.info())  # Overview of data structure
print(df['Price Range'].value_counts())  # Class distribution

# Encode labels as integers
label_encoder = LabelEncoder()
df['Price Range'] = label_encoder.fit_transform(df['Price Range'])

# Split dataset into train and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Preparing the data in the correct format for SimpleTransformers
train_df = pd.DataFrame({
    'text': train_data['Product Description'],
    'labels': train_data['Price Range']
})

val_df = pd.DataFrame({
    'text': val_data['Product Description'],
    'labels': val_data['Price Range']
})

# Verify dataset format
print(train_df.head())
print(train_df.dtypes)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Product Description  1290 non-null   object
 1   Price Range          1290 non-null   object
dtypes: object(2)
memory usage: 20.3+ KB
None
Price Range
Medium    844
High      269
Low       177
Name: count, dtype: int64
                                                   text  labels
634   Adjustable - Eight heights can be freely adjus...       2
673   Experience the true immersive sound with a pum...       2
823   50 assorted shades + 1 scraping tool|Confirms ...       1
1202  This water filter candle Suitable for 10 'Pre-...       2
135   AC1300 Lightning-Fast Speed — AC1300 (867 Mbps...       2
text      object
labels     int64
dtype: object


In [None]:
import re

# Define a function to clean text data
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = text.strip()

    return text

# Apply the cleaning function to the dataset
train_df['text'] = train_df['text'].apply(clean_text)
val_df['text'] = val_df['text'].apply(clean_text)

print(train_df.head())

                                                   text  labels
634   adjustable  eight heights can be freely adjust...       2
673   experience the true immersive sound with a pum...       2
823   assorted shades   scraping toolconfirms to saf...       1
1202  this water filter candle suitable for  prefilt...       2
135   ac lightningfast speed  ac  mbps on the  ghz b...       2


In [None]:
!pip install simpletransformers



In [None]:
from simpletransformers.classification import ClassificationModel

# Get the number of unique labels dynamically
num_labels = df['Price Range'].nunique()

# Define model arguments
model_args = {
    "overwrite_output_dir": True,  # Allows overwriting previous outputs
    "num_train_epochs": 1,  # You can adjust this
    "train_batch_size": 16,  # Adjust based on available resources
    "eval_batch_size": 16,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "use_multiprocessing": False,
}

# Create a BERT model for text classification
bert_model = ClassificationModel(
    "bert", "bert-base-uncased", num_labels=num_labels, args=model_args, use_cuda=False
)

# Create a RoBERTa model for text classification
roberta_model = ClassificationModel(
    "roberta", "roberta-base", num_labels=num_labels, args=model_args, use_cuda=False
)

# Train the BERT model
bert_model.train_model(train_df)

# Train the RoBERTa model
roberta_model.train_model(train_df)

# Evaluate the models on the validation dataset
bert_result, bert_model_outputs, _ = bert_model.eval_model(val_df)
roberta_result, roberta_model_outputs, _ = roberta_model.eval_model(val_df)

# Print evaluation results
print("BERT Model Evaluation:", bert_result)
print("RoBERTa Model Evaluation:", roberta_result)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/65 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/17 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/17 [00:00<?, ?it/s]

BERT Model Evaluation: {'mcc': np.float64(0.5194855734020472), 'eval_loss': 0.6284049135797164}
RoBERTa Model Evaluation: {'mcc': np.float64(0.4081704814645412), 'eval_loss': 0.6526510557707619}


In [None]:
bert_model.save_model('bert_best_model')

In [None]:
roberta_model.save_model('roberta_best_model')

In [None]:
# Initialize a RoBERTa classification model
model = ClassificationModel(
    "roberta", "roberta-base", num_labels=3, use_cuda=(device == "cuda")
)

# Training parameters
train_args = {
    "num_train_epochs": 5,  # More epochs improve learning
    "train_batch_size": 8,  # Adjust batch size based on memory
    "overwrite_output_dir": True,  # Overwrite existing model files
    "output_dir": "roberta_best_model",  # Where to save the model
}

# Train the model
model.train_model(df, args=train_args)

# ✅ **Save the model correctly**
model.save_model("roberta_best_model")

# Verify saved files
print("Model saved successfully!")
print("Saved Model Files:", os.listdir("roberta_best_model"))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0it [00:00, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved successfully!
Saved Model Files: ['special_tokens_map.json', 'training_args.bin', 'tokenizer.json', 'config.json', 'vocab.json', 'tokenizer_config.json', 'model_args.json', 'model.safetensors', 'checkpoint-3-epoch-3', 'checkpoint-2-epoch-2', 'checkpoint-1-epoch-1', 'merges.txt', 'checkpoint-5-epoch-5', 'checkpoint-4-epoch-4']


In [None]:
from simpletransformers.classification import ClassificationModel

# ✅ **Load the saved RoBERTa model correctly**
roberta_model = ClassificationModel(
    'roberta', 'roberta_best_model', use_cuda=False
)

# Check if model loads successfully
print("Model loaded successfully!")

Model loaded successfully!


In [None]:
# Example product description for price prediction
product_description = "Brand-Borosil, Specification Ã¢â‚¬â€œ 23V ~ 5Hz;1 W Power for Faster Grilling|Makes 2 Sandwiches in minutes;Stainless Steel Mirror Finish Body; Easy to use;Easy to clean coated plates|Auto-lock latch;Larger deep ribbed grill plates for Krisp grilling|Rubber feet to avoid slipping on Countertop;Indicator Display.|Material Type: Plastic|Suitable for gifting on Diwali and other special occasions."

# Predict price range
predictions, _ = roberta_model.predict([product_description])  # Input as a list

# Mapping predictions to price labels
label_map = {0: "Low", 1: "Medium", 2: "High"}
predicted_label = label_map[predictions[0]]  # Extract first prediction

# Display result
print(f"Product: {product_description}")
print(f"Predicted Price Range: {predicted_label}")


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Product: Brand-Borosil, Specification Ã¢â‚¬â€œ 23V ~ 5Hz;1 W Power for Faster Grilling|Makes 2 Sandwiches in minutes;Stainless Steel Mirror Finish Body; Easy to use;Easy to clean coated plates|Auto-lock latch;Larger deep ribbed grill plates for Krisp grilling|Rubber feet to avoid slipping on Countertop;Indicator Display.|Material Type: Plastic|Suitable for gifting on Diwali and other special occasions.
Predicted Price Range: High


In [None]:
from simpletransformers.classification import ClassificationModel

# Load a pre-trained BERT model (bert-base-uncased)
model = ClassificationModel('bert', 'bert-base-uncased', use_cuda=False)

# Save the model properly
model.save_model("bert_best_model")
print("Model saved successfully!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model saved successfully!


In [None]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import torch

# Ensure the correct device is selected (CUDA if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sample dataset (replace with your actual dataset)
data = [
    ["Cheap plastic water bottle", 0],  # Low price
    ["Affordable cotton t-shirt", 0],  # Low price
    ["Mid-range smartphone with 6GB RAM", 1],  # Medium price
    ["High-end gaming laptop with RTX 4070", 2],  # High price
    ["Luxury leather handbag", 2],  # High price
]


# Convert data to Pandas DataFrame
df = pd.DataFrame(data, columns=["text", "labels"])

# Create a BERT model for classification
model = ClassificationModel(
    "bert", "bert-base-uncased", num_labels=3, use_cuda=(device == "cuda")
)

# Train the model with optimized parameters
train_args = {
    "num_train_epochs": 5,  # Number of training epochs
    "train_batch_size": 8,  # Batch size (reduce if memory issue)
    "save_model_every_epoch": False,  # Avoid saving after every epoch
    "overwrite_output_dir": True,  # Allow overwriting if needed
    "output_dir": "bert_best_model",  # Specify output directory
}

# Train the model
model.train_model(df, args=train_args)

# Save the fine-tuned model
model.save_model("bert_best_model")
print("Fine-tuned model saved successfully!")

# Verify the saved model files
import os
print("Saved Model Files:", os.listdir("bert_best_model"))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0it [00:00, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

Fine-tuned model saved successfully!
Saved Model Files: ['special_tokens_map.json', 'vocab.txt', 'training_args.bin', 'tokenizer.json', 'config.json', 'tokenizer_config.json', 'model_args.json', 'model.safetensors']


In [None]:
bert_model = ClassificationModel('bert', "bert_best_model", use_cuda=False)
print("Model loaded successfully!")


Model loaded successfully!


In [None]:
import os
print(os.listdir("bert_best_model"))  # Should list 'pytorch_model.bin', 'config.json', etc.


['special_tokens_map.json', 'vocab.txt', 'training_args.bin', 'tokenizer.json', 'config.json', 'tokenizer_config.json', 'model_args.json', 'model.safetensors']


In [None]:
from simpletransformers.classification import ClassificationModel

# Load the saved BERT model
bert_model = ClassificationModel('bert', 'bert_best_model', use_cuda=False)

# Single product description (Replace with real input)
product_description = "Powerful MediaTek Helio P35 Octa Core 2.3GHz with Android 12,One UI Core 4.1|13MP+2MP Dual camera setup- True 13MP (F2.2) main camera + 2MP (F2.4) | 5MP (F2.2) front came|16.55 centimeters (6.5-inch) LCD, HD+ resolution with 720 x 1600 pixels resolution, 269 PPI with 16M color|5000mAH lithium-ion battery, 1 year manufacturer warranty for device and 6 months manufacturer warranty for in-box accessories including batteries from the date of purchase"

# Predict the price range
predictions_bert, _ = bert_model.predict([product_description])  # Pass as a list

# Mapping numerical predictions to price range labels
label_map = {0: "Low", 1: "Medium", 2: "High"}
predicted_label = label_map[predictions_bert[0]]  # Extract single prediction

# Display result
print(f"Product: {product_description}")
print(f"Predicted Price Range: {predicted_label}")


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Product: Powerful MediaTek Helio P35 Octa Core 2.3GHz with Android 12,One UI Core 4.1|13MP+2MP Dual camera setup- True 13MP (F2.2) main camera + 2MP (F2.4) | 5MP (F2.2) front came|16.55 centimeters (6.5-inch) LCD, HD+ resolution with 720 x 1600 pixels resolution, 269 PPI with 16M color|5000mAH lithium-ion battery, 1 year manufacturer warranty for device and 6 months manufacturer warranty for in-box accessories including batteries from the date of purchase
Predicted Price Range: High
