In [2]:
import pandas as pd

df= pd.read_csv("ecommerce_dataset.csv", names=["category", "description"], header=None)
print(df.shape)
df.head(3)

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [3]:
df.dropna(inplace=True)
df.shape

(50424, 2)

In [4]:
df.category.unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [5]:
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)

In [6]:
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [7]:
df['category'] = '__label__' + df['category'].astype(str)
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [8]:
df['category_description'] = df['category'] + ' ' + df['description']
df.head(3)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


In [9]:
import re

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
text = re.sub(r'[^\w\s\']',' ', text)
text = re.sub(' +', ' ', text)
text.strip().lower()

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [10]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower() 

In [11]:
df['category_description'] = df['category_description'].map(preprocess)
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [12]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [13]:
train.shape, test.shape

((40339, 3), (10085, 3))

In [14]:
train.to_csv("ecommerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["category_description"], index=False, header=False)

In [16]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m968.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Obtaining dependency information for pybind11>=2.2 from https://files.pythonhosted.org/packages/26/55/e776489172f576b782e616f58273e1f3de56a91004b0d20504169dd345af/pybind11-2.12.0-py3-none-any.whl.metadata
  Using cached pybind11-2.12.0-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp311-cp311-macosx_14_0_arm64.whl size=281379 sha256=98f0e121dd0ceb300ce21e597daa9e34e3feda7f60f5aecd0a7f830e73d12b02
  Stored in directory: /Users/mugilmr/Library/Caches/pip/wheels/12/89/c9/c932d04c4dd65

In [17]:
import fasttext

model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

Read 4M words
Number of words:  79305
Number of labels: 4
Progress: 100.0% words/sec/thread: 4186189 lr:  0.000000 avg.loss:  0.176434 ETA:   0h 0m 0s


(10083, 0.9727263711197064, 0.9727263711197064)

In [18]:
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99383354]))

In [19]:
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

In [20]:
model.predict("think and grow rich deluxe edition")


(('__label__books',), array([1.0000093]))

In [21]:
model.get_nearest_neighbors("painting")

[(0.9991217255592346, 'pvccolors'),
 (0.9991217255592346, '104gnotes'),
 (0.9991217255592346, 'tapequantity'),
 (0.9991217255592346, '\xa0measuring'),
 (0.9991217255592346, 'rollsmaterial'),
 (0.9991071224212646, 'cumulate'),
 (0.9991071224212646, 'cuvees'),
 (0.9991071224212646, 'sperior'),
 (0.999103844165802, '1400bl'),
 (0.999103844165802, 'osmon')]

In [22]:
model.get_nearest_neighbors("sony")

[(0.9993852376937866, 'x765w'),
 (0.999359667301178, '3lb'),
 (0.999359667301178, '20x4'),
 (0.999359667301178, 'strapped'),
 (0.999348521232605, 'konica'),
 (0.999348521232605, 'nissin'),
 (0.999348521232605, 'oloong'),
 (0.999348521232605, 'triopo'),
 (0.999348521232605, 'annoyingly'),
 (0.999348521232605, "gels'")]

In [23]:
model.get_nearest_neighbors("banglore")

[(0.0, 'of'),
 (0.0, 'a'),
 (0.0, 'for'),
 (0.0, 'in'),
 (0.0, 'is'),
 (0.0, '</s>'),
 (0.0, 'your\xa0skin\xa0feel'),
 (0.0, 'white\xa0ethnic\xa0jacket'),
 (0.0, 'world\xa0about\xa0your'),
 (0.0, '1991e')]