In [61]:
import pandas as pd
import spacy
from tqdm import tqdm
tqdm.pandas()

from spacy.tokens import DocBin


In [10]:
df = pd.read_csv('./raw_data/ecommerceDataset.csv', names=['Ptype','Description'])

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50424 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Ptype        50424 non-null  object
 1   Description  50424 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [21]:
df.head(10)

Unnamed: 0,Ptype,Description
0,Clothing & Accessories,Jockey Men's Cotton Vest (Pack of 3) (Modern C...
1,Household,Clearline CLR 009 1000-Watt Soup Maker and Ble...
2,Electronics,iVoltaa 3.5mm Braided Aux (Auxiliary) Audio Ca...
3,Electronics,MikroTik Wireless Access Point RB951Ui-2HnD mi...
4,Household,"Bull 4 Socket,4 Switch,3 M Wire Extension Boar..."
5,Electronics,Systene Clear Sound High Bass Touch Two Stereo...
6,Household,DAMAC Multi Microfibre Super Soft Diwan Set (8...
7,Household,"Homesake Frost White Tealight Holder, Set of 4..."
8,Household,Home Puff Silicone Premium Shredder Vegetable ...
9,Electronics,xcluma Antenna for GSM FCT Device GSM FCT Cabl...


In [22]:
df = df.sample(frac=1).reset_index(drop=True)

In [23]:
df.head(10)

Unnamed: 0,Ptype,Description
0,Household,Rajesh LED Bulb for Automatic Sewing Machines ...
1,Electronics,Wooky Aqua Stone-10 5W Wireless Waterproof Blu...
2,Electronics,Amkette Xcite Pro USB Keyboard (Black) The new...
3,Books,24 Years UPSC IAS/ IPS Prelims Topic-wise Solv...
4,Books,Group Discussion Evam Interview Ki Tayari
5,Electronics,"APC BX600C-IN 600VA, 230V Back UPS Battery bac..."
6,Clothing & Accessories,BODYCARE Printed Girls Bloomer Pack of 6 from ...
7,Electronics,Adichai Educational Toy Real Working Science M...
8,Books,Orphan X (Evan Smoak)
9,Books,Action Shoes Men's Sneakers What one needs is ...


In [24]:
df.isna().sum()

Ptype          0
Description    1
dtype: int64

In [26]:
df = df.dropna()

In [27]:
df

Unnamed: 0,Ptype,Description
0,Household,Rajesh LED Bulb for Automatic Sewing Machines ...
1,Electronics,Wooky Aqua Stone-10 5W Wireless Waterproof Blu...
2,Electronics,Amkette Xcite Pro USB Keyboard (Black) The new...
3,Books,24 Years UPSC IAS/ IPS Prelims Topic-wise Solv...
4,Books,Group Discussion Evam Interview Ki Tayari
...,...,...
50420,Books,"Meditations (Modern Library MM) Review “Here, ..."
50421,Books,Handbook of Mechanical Engineering
50422,Clothing & Accessories,WEXFORD Men's Cotton Polo (Wex-Wfe010B) Wexfor...
50423,Electronics,Andoer V12 1080P Full HD 16X Digital Zoom Reco...


In [25]:
df.duplicated().sum()

22622

In [29]:
df = df.drop_duplicates()

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27802 entries, 0 to 50408
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Ptype        27802 non-null  object
 1   Description  27802 non-null  object
dtypes: object(2)
memory usage: 651.6+ KB


In [32]:
df = df.reset_index(drop=True)

In [33]:
df['Ptype'].unique()

array(['Household', 'Electronics', 'Books', 'Clothing & Accessories'],
      dtype=object)

In [34]:
df['Ptype'].value_counts()

Ptype
Household                 10564
Books                      6256
Clothing & Accessories     5674
Electronics                5308
Name: count, dtype: int64

In [39]:
nlp = spacy.load('en_core_web_md', exclude=['tok2vec', 'tagger', 'morphologizer', 'parser', 'attribute_ruler', 'ner'])

In [41]:
nlp.pipe_names

CPU times: total: 0 ns
Wall time: 0 ns


['lemmatizer']

In [49]:
%%time
df['docs'] = list(nlp.pipe(df['Description'], n_process=1, batch_size=2000))



CPU times: total: 391 ms
Wall time: 16.7 s


In [50]:
def lemmatize(doc):
    words = []
    for token in doc:
        if (token.is_stop != True) and (token.is_punct != True) and (token.is_space != True) and (token.is_digit != True):
            words.append(token.lemma_)
    return ' '.join(words)


In [52]:
df['clear_text'] = df['docs'].progress_apply(lemmatize)

100%|██████████| 27802/27802 [00:01<00:00, 16684.23it/s]


In [58]:
train = df[['clear_text', 'Ptype']][:20000]
dev = df[['clear_text', 'Ptype']][20000:23000]
test = df[['clear_text', 'Ptype']][23000:]

In [60]:
train.shape, dev.shape, test.shape

((20000, 2), (3000, 2), (4802, 2))

In [66]:
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)
test = test.reset_index(drop=True)

In [63]:
def create_docbin(data, outfile):
    db = DocBin()
    categories = data['Ptype'].unique()
    for i in range(data.shape[0]):    
        doc = nlp.make_doc(data['clear_text'][i])
        doc.cats = {category: 0 for category in categories}
        doc.cats[data['Ptype'][i]] = 1
        db.add(doc)
    db.to_disk(outfile)

In [67]:
create_docbin(train, "./train.spacy")
create_docbin(dev, "./dev.spacy")
create_docbin(test, "./test.spacy")


In [68]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [69]:
!python -m spacy train config.cfg --paths.train ./train.spacy  --paths.dev ./dev.spacy --output model

^C


[38;5;2m✔ Created output directory: model[0m
[38;5;4mℹ Saving to output directory: model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.19        9.78    0.10
  0     200         23.03       85.97    0.86
  0     400         13.75       89.85    0.90
  0     600          9.98       91.27    0.91
  0     800          9.14       91.68    0.92
  0    1000          8.13       92.72    0.93
  0    1200          7.10       93.20    0.93
  0    1400          6.22       93.80    0.94
  0    1600          6.78       93.85    0.94
  0    1800          6.98       94.24    0.94
  0    2000          5.11       94.56    0.95
  0    2200          6.56       94.86    0.95
  0    2400          5.17       95.16    0.95
  0    2600          4.76       95.34    0.95
  0    2800        

In [70]:
!python -m spacy evaluate model/model-best/ --output metrics.json ./test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   94.21 
SPEED               315756

[1m

                             P       R       F
Household                92.97   96.36   94.63
Electronics              93.44   91.70   92.56
Books                    94.10   91.63   92.85
Clothing & Accessories   97.57   96.01   96.78

[1m

                         ROC AUC
Household                   0.98
Electronics                 0.98
Books                       0.97
Clothing & Accessories      0.99

[38;5;2m✔ Saved results to metrics.json[0m


In [71]:
nlp = spacy.load("./model/model-best")

In [72]:
nlp.pipe_names

['textcat']

In [74]:
doc = nlp(test['clear_text'][1])

In [75]:
doc.cats

{'Household': 0.9998680353164673,
 'Electronics': 0.00013167731231078506,
 'Books': 1.5873418135470274e-07,
 'Clothing & Accessories': 9.72675877619622e-08}