In [5]:
# Step 1: Imports
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Example product titles by category
categories = {
    100: ["iPhone 13", "iPhone 14", "iPhone case", "Apple phone"],
    200: ["USB-C Charger", "Wireless Charger", "Laptop charger"],
    300: ["MacBook Pro", "Dell XPS", "HP Laptop"],
    400: ["Apple Watch", "Smartwatch", "Fitness tracker"],
    500: ["Phone Case", "Samsung case", "Android cover"]
}

# Generate synthetic dataset
data = {"title": [], "category_id": []}
for cat_id, titles in categories.items():
    for _ in range(100):  # 100 examples per category
        phrase = random.choice(titles) + " " + random.choice(["Pro", "Plus", "2023", "Gen 3", "Max"])
        data["title"].append(phrase)
        data["category_id"].append(cat_id)

df = pd.DataFrame(data)


In [7]:
# Load SBERT model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['title'], df['category_id'], test_size=0.2, random_state=42)

# Encode with SBERT
X_train_emb = sbert.encode(X_train.tolist(), convert_to_numpy=True)
X_test_emb = sbert.encode(X_test.tolist(), convert_to_numpy=True)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [12]:
# Train classifier on SBERT embeddings
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_emb, y_train)
import joblib

# Save classifier
joblib.dump(clf, "rf_model.pkl")


['rf_model.pkl']

In [13]:
# Predict on test set
y_pred = clf.predict(X_test_emb)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

         100       1.00      1.00      1.00        28
         200       1.00      1.00      1.00        14
         300       1.00      1.00      1.00        10
         400       1.00      1.00      1.00        24
         500       1.00      1.00      1.00        24

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [14]:
def predict_category(title):
    emb = sbert.encode([title])
    pred = clf.predict(emb)[0]
    return pred

# Example predictions
print("Prediction for 'iPhone 14 Pro':", predict_category("iPhone 14 Pro"))         # → 100
print("Prediction for 'Fast USB-C Charger':", predict_category("Fast USB-C Charger"))  # → 200
print("Prediction for 'Dell Laptop Gen 3':", predict_category("Dell Laptop Gen 3"))    # → 300


Prediction for 'iPhone 14 Pro': 100
Prediction for 'Fast USB-C Charger': 200
Prediction for 'Dell Laptop Gen 3': 300
