In [88]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# internal imports
import visualization as vis

## Get and explore data

In [55]:
# https://zenodo.org/records/3355823
file_name = "ecommerceDataset.csv"
data = pd.read_csv(file_name, header=None)
data.columns = ["category", "description"]

In [56]:
print("Rows, cols:", data.shape)
data.head(10)

Rows, cols: (50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
5,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
6,Household,Paper Plane Design Starry Night Vangoh Wall Ar...
7,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
8,Household,SAF 'Ganesh Modern Art Print' Painting (Synthe...
9,Household,Paintings Villa UV Textured Modern Art Print F...


Category distribution

In [57]:
data["category"].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

Description examples

In [58]:
for category in data["category"].unique():
    row_example = data[data["category"] == category].sample(n=100).iloc[0]
    print(f"CATEGORY: {row_example['category']}\nDESCRIPTION:\n{row_example['description']}\n\n")

CATEGORY: Household
DESCRIPTION:
Philips Ultron 17-Watt Wall Light (Cool White, Round) Excessive brightness can cause discomfort to your eyes. But the right light provides more comfort physical ease and relaxation when looking directly at them. Philips LED products provide the right light for all your needs.


CATEGORY: Books
DESCRIPTION:
General Knowledge Aptitude and Logical Thinking for AIIMS 


CATEGORY: Clothing & Accessories
DESCRIPTION:
AJ DEZINES Boy's Velvet Party Wear Suit Set Exclusive three piece party suit set from AJ Dezines. It is composed from good quality velvet material that makes it skin friendly. This party wear suit set comes with T-Shirt, Blazer and Trouser. Featuring blazer full sleeve, welt pocket and stylish neck with beautiful pattern. Full length trouser and made of half elastic on the back side which ensures it fits to the little and delicate waist of your child. This product are available in three different colour fawn, grey and Lemon dress for boys is supe

Handling empty values

In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     50425 non-null  object
 1   description  50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [60]:
data[data["description"].isna()]

Unnamed: 0,category,description
39330,Clothing & Accessories,


In [61]:
data.dropna(inplace=True)

## Handle duplicates

In [62]:
repeated_messages = data \
    .groupby("description", as_index=False) \
    .agg(
        n_repeats=("category", "count"),
        n_unique_categories=("category", lambda x: len(np.unique(x)))
    )

assert all(repeated_messages["n_unique_categories"] == 1), \
    f"Repeated descriptions with non-unique categories: " + \
    f"{repeated_messages[repeated_messages['n_unique_categories']] > 1}"

repeated_messages = repeated_messages[repeated_messages["n_repeats"] > 1]

print(f"Count of repeated messages (unique): {repeated_messages.shape[0]}")
print(f"Total number: {repeated_messages['n_repeats'].sum()} out of {data.shape[0]}")

repeated_messages.head(10)

Count of repeated messages (unique): 13979
Total number: 36601 out of 50424


Unnamed: 0,description,n_repeats,n_unique_categories
2,# The Silky Beans 2 KG Premium Bean Bag Filler...,4,1
3,# The Silky Beans 500 Gram Premium A-Grade for...,4,1
4,#Horror,4,1
7,(CERTIFIED REFURBISHED) HP DeskJet 2131 All-in...,2,1
8,(CERTIFIED REFURBISHED) Logitech K230 Wireless...,2,1
11,(CERTIFIED REFURBISHED) Philips BT 106 Bluetoo...,2,1
17,.42x HD Super Wide Angle Panoramic Macro Fishe...,2,1
18,0-Degree Electrical Socket Cover for Indian Pl...,2,1
20,0-Degree Women's Ankle Length Premium Cotton M...,2,1
21,1 12 4PCS Gold Metal Door Knocker Lock Doorpla...,2,1


In [63]:
data.drop_duplicates(inplace=True)
print(f"New dataset size: {data.shape}")
print(data["category"].value_counts())

New dataset size: (27802, 2)
Household                 10564
Books                      6256
Clothing & Accessories     5674
Electronics                5308
Name: category, dtype: int64


## Languages

In [68]:
# TODO: filter only English texts

## Description lengths

In [64]:
data["description_length"] = data["description"].apply(len)

In [65]:
px.histogram(data, x="description_length", title="Descriptions length distribution")

In [66]:
px.histogram(
    data[data["description_length"] <= 5000],
    x="description_length",
    title="Descriptions length distribution per category",
    color="category",
    barmode="overlay",
    histnorm="density"
)

In [67]:
print("Shortest messages, examples:\n")

for i, row in data.sort_values("description_length").iloc[:7].iterrows():
    print("{} '{}': \n{}\n".format(i, row["category"], row["description"]))

Shortest messages, examples:

31225 'Clothing & Accessories': 
BELT

25918 'Books': 
Sail

25314 'Books': 
Yes!

30433 'Books': 
On War

30411 'Books': 
On War 

25810 'Books': 
Tennis 

23015 'Books': 
Essays 



## Train test eval split

In [71]:
data_train, data_test = train_test_split(data, test_size=0.3)
data_test, data_eval = train_test_split(data_test, test_size=0.5)

In [73]:
data_train.shape, data_test.shape, data_eval.shape

((19461, 3), (4170, 3), (4171, 3))

## Modelling

In [74]:
inference_time_s = dict()
scores = dict()

### Baseline

In [78]:
count_vectorizer = CountVectorizer(max_features=100)
x_train_baseline, y_train_baseline = count_vectorizer.fit_transform(data_train["description"]), data_train["category"]
x_test_baseline, y_test_baseline = count_vectorizer.transform(data_test["description"]), data_test["category"]
x_eval_baseline, y_eval_baseline = count_vectorizer.transform(data_eval["description"]), data_eval["category"]

x_train_baseline = x_train_baseline.toarray()
x_test_baseline = x_test_baseline.toarray()
x_eval_baseline = x_eval_baseline.toarray()

x_train_baseline.shape, x_test_baseline.shape, x_eval_baseline.shape

((19461, 100), (4170, 100), (4171, 100))

In [81]:
ss = StandardScaler()
x_train_baseline = ss.fit_transform(x_train_baseline)
x_test_baseline = ss.transform(x_test_baseline)
x_eval_baseline = ss.transform(x_eval_baseline)

In [86]:
%%capture

lr = LogisticRegression()
lr.fit(x_train_baseline, y_train_baseline)
scores["baseline"] = balanced_accuracy_score(y_test_baseline, lr.predict(x_test_baseline))

In [87]:
print("Balanced accuracy score: {:.1f}%".format(scores["baseline"] * 100))

Balanced accuracy score: 77.4%


In [89]:
vis.draw_confusion_matrix(
    y_test_baseline,
    lr.predict(x_test_baseline),
    lr.classes_,
    label="recall",
    normalize="true"
)

In [90]:
vis.draw_confusion_matrix(
    y_test_baseline,
    lr.predict(x_test_baseline),
    lr.classes_,
    label="precision",
    normalize="pred"
)