In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
# Load the raw dataset. The CSV is read without a header row, so the two
# column names are supplied here instead of being inferred from the file.
df = pd.read_csv(
    filepath_or_buffer="data/ecommerceDataset.csv",
    header=None,
    names=["label", "text"],
)
df.head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [16]:
# Cast both columns in one pass: the class label becomes a pandas categorical
# and the product description is forced to a Python string.
# NOTE(review): on pandas versions where the 'str' cast yields object dtype,
# missing values turn into the literal text 'nan' -- confirm the raw 'text'
# column has no nulls before relying on this.
df = df.astype({'label': 'category', 'text': 'str'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   label   50425 non-null  category
 1   text    50425 non-null  object  
dtypes: category(1), object(1)
memory usage: 443.5+ KB


In [22]:
# Features stay a one-column DataFrame (double brackets); labels are a Series.
X = df[['text']]
y = df['label']

# Fixed seed so the partition -- and every metric computed from it -- is the
# same after a kernel restart. Without it each run reshuffles the rows.
SPLIT_SEED = 42

# Step 1: hold out 20% of all rows as the final test set, stratified on label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=SPLIT_SEED
)

# Step 2: take 20% of the remaining rows as the validation set used for early
# stopping, leaving roughly 64% / 16% / 20% train / val / test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, test_size=0.20, random_state=SPLIT_SEED
)

# NOTE(review): if the CSV contains repeated product texts, identical rows can
# land in both train and test and inflate the scores -- verify, and consider
# de-duplicating before splitting.

In [28]:
# The only input column is free text, so it is registered as a CatBoost text
# feature and is also the pool's sole feature name.
text_features = ['text']
column_names = ['text']

# Small, shallow boosted model with class re-weighting requested via
# auto_class_weights.
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=3,
    auto_class_weights="Balanced",
    verbose=True,
)

# Wrap the training and validation splits in Pools so CatBoost knows which
# column holds text.
train_pool = Pool(
    data=X_train,
    label=y_train,
    text_features=text_features,
    feature_names=column_names,
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    text_features=text_features,
    feature_names=column_names,
)

# The validation pool drives both early stopping and best-iteration selection.
model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=50,
    use_best_model=True,
)

0:	learn: 1.1418975	test: 1.1377865	best: 1.1377865 (0)	total: 40ms	remaining: 3.96s
1:	learn: 0.9760046	test: 0.9685924	best: 0.9685924 (1)	total: 81.3ms	remaining: 3.98s
2:	learn: 0.8515481	test: 0.8420847	best: 0.8420847 (2)	total: 122ms	remaining: 3.95s
3:	learn: 0.7553615	test: 0.7442942	best: 0.7442942 (3)	total: 161ms	remaining: 3.88s
4:	learn: 0.6769321	test: 0.6642041	best: 0.6642041 (4)	total: 200ms	remaining: 3.81s
5:	learn: 0.6129491	test: 0.5984553	best: 0.5984553 (5)	total: 240ms	remaining: 3.75s
6:	learn: 0.5602650	test: 0.5444227	best: 0.5444227 (6)	total: 281ms	remaining: 3.73s
7:	learn: 0.5154574	test: 0.4986516	best: 0.4986516 (7)	total: 322ms	remaining: 3.7s
8:	learn: 0.4771987	test: 0.4594676	best: 0.4594676 (8)	total: 367ms	remaining: 3.71s
9:	learn: 0.4445906	test: 0.4257451	best: 0.4257451 (9)	total: 410ms	remaining: 3.69s
10:	learn: 0.4167552	test: 0.3968471	best: 0.3968471 (10)	total: 450ms	remaining: 3.64s
11:	learn: 0.3925019	test: 0.3715470	best: 0.3715470 

<catboost.core.CatBoostClassifier at 0x33132e220>

In [29]:
# Predict a class label for every row of the held-out test split.
predictions = model.predict(X_test, prediction_type="Class")

In [30]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=predictions))

# Confusion matrix: rows are true classes, columns are predicted classes, in
# the same class order as the report above.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

                        precision    recall  f1-score   support

                 Books       0.95      0.95      0.95      2364
Clothing & Accessories       0.96      0.98      0.97      1734
           Electronics       0.96      0.93      0.95      2124
             Household       0.96      0.96      0.96      3863

              accuracy                           0.96     10085
             macro avg       0.96      0.96      0.96     10085
          weighted avg       0.96      0.96      0.96     10085

[[2235   20   26   83]
 [   6 1699   13   16]
 [  54   12 1984   74]
 [  51   48   48 3716]]
