In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [36]:
# Step 1: Prepare your dataset
import random

categories = {
    100: ["iPhone 13", "iPhone 14", "iPhone case", "Apple phone"],
    200: ["USB-C Charger", "Wireless Charger", "Laptop charger"],
    300: ["MacBook Pro", "Dell XPS", "HP Laptop"],
    400: ["Apple Watch", "Smartwatch", "Fitness tracker"],
    500: ["Phone Case", "Samsung case", "Android cover"]
}

data = {"title": [], "category_id": []}

for cat_id, titles in categories.items():
    for _ in range(100):
        phrase = random.choice(titles) + " " + random.choice(["", "Pro", "Plus", "2023", "Gen 3"])
        data["title"].append(phrase)
        data["category_id"].append(cat_id)

df = pd.DataFrame(data)

print(df)
print("Head: \n", df.head())
print("Datatypes: \n", df.info())


                  title  category_id
0            iPhone 13           100
1       iPhone case Pro          100
2         iPhone 14 Pro          100
3       iPhone 13 Gen 3          100
4       iPhone 13 Gen 3          100
..                  ...          ...
495   Samsung case Plus          500
496       Samsung case           500
497  Samsung case Gen 3          500
498    Phone Case Gen 3          500
499   Samsung case 2023          500

[500 rows x 2 columns]
Head: 
              title  category_id
0       iPhone 13           100
1  iPhone case Pro          100
2    iPhone 14 Pro          100
3  iPhone 13 Gen 3          100
4  iPhone 13 Gen 3          100
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        500 non-null    object
 1   category_id  500 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 7.9+ KB
Datatypes: 
 N

In [37]:
# Step 2: Encode labels (if needed)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category_id'])  # Converts 100 → 0, etc.


In [38]:
# Step 3: Split the dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [39]:
# Convert to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_df[['title', 'label']])
test_dataset = Dataset.from_pandas(test_df[['title', 'label']])

# Step 4: Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [40]:
def tokenize(batch):
    return tokenizer(batch["title"], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Step 5: Load model
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)


Map: 100%|██████████| 400/400 [00:00<00:00, 19117.59 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 14340.97 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    import numpy as np
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": (predictions == labels).mean()}



In [42]:
# Step 6: Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

# Step 7: Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # <-- Add this line
)


# Step 8: Train
trainer.train()



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5131,0.005569,1.0
2,0.004,0.001921,1.0
3,0.002,0.001273,1.0
4,0.0015,0.001035,1.0
5,0.0013,0.000971,1.0




TrainOutput(global_step=500, training_loss=0.10440566110610962, metrics={'train_runtime': 104.8842, 'train_samples_per_second': 19.069, 'train_steps_per_second': 4.767, 'total_flos': 9250247196000.0, 'train_loss': 0.10440566110610962, 'epoch': 5.0})

In [43]:
def predict_category(title):
    model.eval()
    inputs = tokenizer(title, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()
    category_id = label_encoder.inverse_transform([predicted_class])[0]
    return category_id

print(predict_category("iPhone 14 Pro"))  # Should predict something like 100
print(predict_category("Fast USB-C Wall Charger"))  # Should predict 200
print(predict_category("Laptop"))  # Should predict 300


100
200
300
