In [6]:
import pandas as pd

df = pd.read_csv("ecommerce_dataset.csv", names=["category", "description"],header=None)
df.head(3)

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [7]:
df.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [8]:
df.shape

(50425, 2)

In [9]:
df.dropna(inplace=True)
df.shape

(50424, 2)

In [10]:
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)
df.category.unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)


array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [11]:
df['category'] = "__label__" + df['category'].astype(str)
df.head()

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [12]:
df['category_description'] = df['category'] + " " + df['description']
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [13]:
import re
def preprocess(text):
    text = re.sub('[^\w\s]', ' ', text, flags=re.MULTILINE)
    text = re.sub(' +', ' ', text, flags=re.MULTILINE)
    return text.strip().lower()

In [14]:
preprocess("hi!, my name is .. malhar,,, will you? be my friend...")

'hi my name is malhar will you be my friend'

In [15]:
df['category_description'] = df['category_description'].map(preprocess)

In [16]:
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf floral framed painting ...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf uv textured modern art ...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [17]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [18]:
test

Unnamed: 0,category,description,category_description
40359,__label__Electronics,Paper Plane Design 004_Army Army Collection La...,__label__electronics paper plane design 004_ar...
10647,__label__Household,Storite Pistole shape Metal Impulse Igniter Sp...,__label__household storite pistole shape metal...
7008,__label__Household,Fortune Premium 10 - Pieces Stainless Steel Ba...,__label__household fortune premium 10 pieces s...
50371,__label__Electronics,"Apple iPhone XR (Red, 3GB RAM, 64GB Storage, 1...",__label__electronics apple iphone xr red 3gb r...
20656,__label__Books,On Directing Film Amazon.com Review According ...,__label__books on directing film amazon com re...
...,...,...,...
43627,__label__Electronics,Samsung 27 inch (68.6 cm) Curved Bezel Less LE...,__label__electronics samsung 27 inch 68 6 cm c...
14730,__label__Household,Bajaj Majesty 1603 T 16-Litre Oven Toaster Gri...,__label__household bajaj majesty 1603 t 16 lit...
24710,__label__Books,UGC-NET/JRF/SET Education (Paper II & III) It ...,__label__books ugc net jrf set education paper...
34461,__label__Clothing_Accessories,Sojanya (Since 1958) Men's Sky Blue Cotton Lin...,__label__clothing_accessories sojanya since 19...


In [19]:
train.to_csv("ecommerce.train", columns=['category_description'], header=None, index=False)
test.to_csv("ecommerce.test", columns=['category_description'], header=None, index=False)

In [20]:
import fasttext
model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

# output ---> (test size, precision, recall)

(10082, 0.965383852410236, 0.965383852410236)

In [22]:
model.get_nearest_neighbors("sony")

[(0.9983512759208679, 'dvd'),
 (0.9982203841209412, 'antenna'),
 (0.9977090358734131, 'binocular'),
 (0.9973396062850952, 'devices'),
 (0.9967654347419739, 'whey'),
 (0.9967459440231323, 'glossy'),
 (0.9966604709625244, 'binoculars'),
 (0.9966168403625488, 'viewing'),
 (0.9965702891349792, 'charger'),
 (0.9964793920516968, 'mac')]