In [23]:
import pandas as pd

# Read Data

In [24]:
df = pd.read_csv("ecommerce_dataset.csv", names=["category", "description"], header=None)
print(df.shape)
df.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [25]:
df.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [26]:
df.dropna(inplace=True)
df.shape

(50424, 2)

# Prepare data for fasttext

## Edit Column Name

In [27]:
df.category.replace("Clothing & Accessories", "Cothing_Accessories", inplace=True)
df.category.unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.category.replace("Clothing & Accessories", "Cothing_Accessories", inplace=True)


array(['Household', 'Books', 'Cothing_Accessories', 'Electronics'],
      dtype=object)

## add '__label__' prefix in data to meet fasttext data requirement

In [28]:
df['category'] = "__label__" + df['category'].astype(str)
df.head()

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [29]:
df['category_description'] = df['category'] + " " + df['description']
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [30]:
df['category_description'][0]

'__label__Household Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and so

# Preprocess Data

## Function of preprocess data

In [31]:
import re

def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

## process data

In [32]:
preprocess("   VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi")

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [33]:
df['category_description'] = df['category_description'].map(preprocess)

In [34]:
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


# Train Model

## Train test split

In [36]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [37]:
train.shape, test.shape

((40339, 3), (10085, 3))

In [38]:
train.head()

Unnamed: 0,category,description,category_description
45087,__label__Electronics,iVoltaa 3.5mm Braided Aux (Auxiliary) Audio Ca...,__label__electronics ivoltaa 3 5mm braided aux...
16499,__label__Household,"Gala Spin Mop Handle with Refill, White Gala B...",__label__household gala spin mop handle with r...
33457,__label__Cothing_Accessories,Mickey & Friends Girls' Regular Fit Cotton Sho...,__label__cothing_accessories mickey friends gi...
44174,__label__Electronics,HP v236w 16GB USB 2.0 Pen Drive This pen drive...,__label__electronics hp v236w 16gb usb 2 0 pen...
16014,__label__Household,Whirlpool 6.2 kg Fully-Automatic Top Loading W...,__label__household whirlpool 6 2 kg fully auto...


In [39]:
test.head()

Unnamed: 0,category,description,category_description
17756,__label__Household,"ALTON SHR20835 ABS 6-inch, 6-Function Overhead...",__label__household alton shr20835 abs 6 inch 6...
49658,__label__Electronics,Orgic India Full Body Matte RED Skin for Apple...,__label__electronics orgic india full body mat...
42376,__label__Electronics,Pasow Cat5e Ethernet Patch Cable RJ45 Computer...,__label__electronics pasow cat5e ethernet patc...
8189,__label__Household,ADA Handicraft Collapsible Portable Closet Sto...,__label__household ada handicraft collapsible ...
28153,__label__Books,"The 5 AM Club: Own Your Morning, Elevate Your ...",__label__books the 5 am club own your morning ...


## Save train and test data to train fasttext

In [41]:
train.to_csv("ecomerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecomerce.test", columns=["category_description"], index=False, header=False)

## train the model

In [43]:
import fasttext

model = fasttext.train_supervised(input="ecomerce.train")
model.test("ecomerce.test")

(10085, 0.9659890927119484, 0.9659890927119484)

In [44]:
model.predict("wintech assemble desktop pc cpu 500gb sata hdd 4gb ram intel c2d processor")

(('__label__electronics',), array([0.99881554]))

In [45]:
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000978]))

In [46]:
model.predict("ockey men's cotton shirt fabric details 80 cotton 20 polyester super combo")

(('__label__cothing_accessories',), array([1.00001001]))

In [54]:
model.get_nearest_neighbors('painting')

[(0.9971017241477966, 'vacuum'),
 (0.996876060962677, 'steam'),
 (0.99615478515625, 'temperature'),
 (0.995823323726654, 'guard'),
 (0.9957368969917297, 'extended'),
 (0.9956884980201721, 'inverter'),
 (0.9955567121505737, 'microwave'),
 (0.9954966306686401, 'juicer'),
 (0.9953467845916748, 'appliances'),
 (0.995195209980011, 'usha')]