In [1]:
import pandas as pd

# Read the raw CSV; the file is parsed without a header row (header=None),
# so the two column names are supplied explicitly.
df = pd.read_csv(
    "D:\\Data\\NLP\\abc_text\\ecommerce_dataset.csv",
    header=None,
    names=["category", "description"],
)
print(df.shape)
df.head(n=3)

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [2]:
# Discard every row that has a missing value in any column, then show the
# resulting (rows, columns) size.
df = df.dropna()
df.shape

(50424, 2)

In [3]:
# List the distinct label values present in the `category` column.
df["category"].unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [4]:
# Rename the one label that contains spaces so it can later be used as a
# single whitespace-free label token.
# The result is assigned back to the column on purpose: calling
# `replace(..., inplace=True)` on `df.category` mutates an intermediate Series
# (chained assignment), which raises a warning on recent pandas and leaves
# `df` unchanged once Copy-on-Write is active.
df["category"] = df["category"].replace("Clothing & Accessories", "Clothing_Accessories")

In [5]:
# Re-check the distinct labels to confirm the rename took effect.
df["category"].unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [6]:
# Prepend the "__label__" marker to every category value.
# NOTE: this cell is not idempotent - running it a second time adds the
# prefix again.
df["category"] = "__label__" + df["category"]

In [7]:
# Preview the first five rows to verify the prefixed labels.
df.head(n=5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [8]:
# Build one string per row: the prefixed label, a single space, then the
# product description.
df["category_description"] = (df["category"] + ' ') + df["description"]
df.head(n=3)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


In [9]:
import re

In [10]:
def preprocess(text):
    """Normalize a string into lowercase, single-space-separated tokens.

    Steps:
      1. Replace every character that is not a word character, whitespace,
         or an apostrophe with a space.
      2. Collapse every run of whitespace (spaces, tabs, newlines, carriage
         returns) into one space.
      3. Strip leading/trailing whitespace and lowercase the result.

    All whitespace is collapsed, not just spaces, because the cleaned text is
    later written out one example per line; a tab or newline left inside the
    text would break that one-line-per-example layout.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string (empty if nothing but punctuation or
        whitespace was passed in).
    """
    # Punctuation becomes a space (not removed) so adjacent words stay apart.
    text = re.sub(r'[^\w\s\']', ' ', text)
    # `\s+` also catches tabs/newlines that step 1 deliberately keeps.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [11]:
# Clean every combined "label + description" string with preprocess() and
# store the results back in the same column (positional assignment).
df["category_description"] = [preprocess(raw_text) for raw_text in df["category_description"]]
df.head(n=5)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state is fixed so the shuffle - and therefore the files written from
# `train`/`test` and the metrics computed on them - is the same on every
# Restart & Run All.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
# Show the (rows, columns) size of each split.
(train.shape, test.shape)

((40339, 3), (10085, 3))

In [14]:
# Write only the combined "label + text" column of each split, one row per
# line, without a header row or the index column.
train[["category_description"]].to_csv("D:\\Data\\NLP\\abc_text\\ecommerce.train", header=False, index=False)
test[["category_description"]].to_csv("D:\\Data\\NLP\\abc_text\\ecommerce.test", header=False, index=False)

In [15]:
import fasttext

# Train a supervised fastText classifier on the training file, leaving every
# hyperparameter at its default.
model = fasttext.train_supervised(input="D:\\Data\\NLP\\abc_text\\ecommerce.train")
# Evaluate on the held-out file. The displayed tuple looks like
# (number of examples, precision, recall) - confirm against the fastText docs.
model.test("D:\\Data\\NLP\\abc_text\\ecommerce.test")

(10085, 0.9704511650966783, 0.9704511650966783)

In [16]:
# Spot-check the classifier on a hand-written, already lowercased and
# punctuation-free description.
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99608362]))

In [17]:
# Second spot-check, this time with an apparel-style description.
# NOTE(review): the text starts with "ockey" - possibly a truncated brand
# name; left as-is because the recorded output corresponds to this input.
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

In [18]:
# Ask the baseline model for the vocabulary words closest to "fabric";
# the output lists (score, word) pairs.
model.get_nearest_neighbors("fabric")

[(0.9995719194412231, 'trk'),
 (0.9993277192115784, 'classiness'),
 (0.9982336163520813, 'girls'),
 (0.9969492554664612, 'neutralize'),
 (0.9964252710342407, 'ruffles'),
 (0.9955223202705383, 'conjugated'),
 (0.9953688979148865, 'quads'),
 (0.9944806098937988, 'kidswear'),
 (0.9938527941703796, 'wheather'),
 (0.9938176870346069, 'amosfun')]

In [19]:
# Same neighbour query on the baseline model, for "processor".
model.get_nearest_neighbors("processor")

[(0.9958704113960266, '5580'),
 (0.9957769513130188, 'âusb'),
 (0.9957769513130188, '107mhz'),
 (0.9957769513130188, 'âmode'),
 (0.9957769513130188, 'mircophone'),
 (0.9957452416419983, 'mc32ga'),
 (0.9956132173538208, 'highlyflexible'),
 (0.995585560798645, '02s'),
 (0.995585560798645, "off'"),
 (0.995585560798645, 'phonegraphy')]

In [20]:
# Retrain on the same training file with explicit hyperparameters
# (this rebinds `model`, replacing the baseline), then score the new model
# on the same held-out file.
model = fasttext.train_supervised(
    input="D:\\Data\\NLP\\abc_text\\ecommerce.train",
    lr=0.5,
    epoch=10,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss='ova',
)
model.test("D:\\Data\\NLP\\abc_text\\ecommerce.test")

(10085, 0.9837382250867626, 0.9837382250867626)

In [21]:
# Repeat the "fabric" neighbour query against the retrained model so the
# result can be compared with the baseline's.
model.get_nearest_neighbors("fabric")

[(0.9920938611030579, 'stylish'),
 (0.9891014099121094, 'girls'),
 (0.9890385866165161, 'length'),
 (0.9849764108657837, 'look'),
 (0.9843204617500305, 'shirt'),
 (0.984307587146759, 'comfortable'),
 (0.9774779081344604, 'lace'),
 (0.9757729172706604, 'jeans'),
 (0.9749115109443665, 'neck'),
 (0.9736604690551758, 'bust')]

In [22]:
# Repeat the "processor" neighbour query against the retrained model.
model.get_nearest_neighbors("processor")

[(0.99712735414505, 'card'),
 (0.9928795695304871, 'adapter'),
 (0.9924271702766418, 'fi'),
 (0.9916213154792786, 'hp'),
 (0.9906498789787292, 'tank'),
 (0.9898119568824768, 'wi'),
 (0.9886986017227173, '23'),
 (0.9882738590240479, 'cable'),
 (0.9877035617828369, 'pc'),
 (0.9864511489868164, 'sound')]