In [77]:
import pandas as pd

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
import pickle
from utils import download_nltk_dependencies

In [78]:
# Fetch the NLTK resources this notebook needs; the helper comes from the local `utils` module.
# Per the log below it covers wordnet, omw-1.4, stopwords and punkt.
download_nltk_dependencies()

Downloading nltk dependencies, these are downloaded only once


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mohitlakshya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/mohitlakshya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mohitlakshya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/mohitlakshya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Load the JioMart item catalogue (columns: category, sub_category, href, items).
df = pd.read_parquet('./data/jio_mart_items.parquet')
df.head()

Unnamed: 0,category,sub_category,href,items
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Fresh Dates (Pack) (Approx 450 g - 500 g)
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Tender Coconut Cling Wrapped (1 pc) (Approx 90...
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Dates Imported (Approx 400 g - 500 g)
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Papaya (Each) (Approx. 800 g - 1600 g)
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Watermelon Kiran Big 1 pc (Approx. 2800 g - 40...


In [30]:
# List every distinct sub-category so the shop mapping can be written against it.
print(df['sub_category'].unique())

['Fruits & Vegetables' 'Premium Fruits' 'Dairy & Bakery' 'Staples'
 'Snacks & Branded Foods' 'Beverages' 'Personal Care' 'Home Care'
 'Apparel' 'Mom & Baby Care' 'Books' 'Pets' 'Kitchenware' 'Dining'
 'Furnishing' 'Home Decor' 'Furniture' 'Home Appliances'
 'Toys, Games & Fitness' 'Electrical' 'Bathroom & Laundry Accessories'
 'Disposables' 'Stationery' 'Bags & Travel Luggage'
 'Mops, Brushes & Scrubs' 'Auto Care' 'Garden & Outdoor'
 'Carpentry & work accessories' 'Pooja Needs' 'Bathroom & Laundry'
 'Industrial & Scientific Supplies' 'Building Supplies & Measuring Tools'
 'Hardware & Plumbing' 'Home Safety & Automation'
 'Kitchen & Bath Fixtures' 'Paint, Wall Treatments & Supplies'
 'Power & Hand Tools' 'Handloom & Handicraft' 'Personal Wear' 'Men'
 'Women' 'Boys' 'Girls' 'Junior Boys' 'Junior Girls' 'Infants' 'Tech'
 'Mobiles & Tablets' 'TV & Speaker' 'Computers' 'Cameras'
 'Kitchen Appliances' 'Personal Care & Grooming' 'Smart Devices' 'Gaming'
 'Accessories' 'Phones' 'Office Product

In [31]:
# Shop mapping: each list names the catalogue sub-categories sold by one shop type.
# Sub-categories that appear in none of the lists are handled by the caller.
vegetable_shop = ['Fruits & Vegetables', 'Premium Fruits']
dairy_shop = ['Dairy & Bakery']
kirana_shop = [
    'Staples',
    'Snacks & Branded Foods',
    'Bathroom & Laundry Accessories',
    'Pooja Needs',
    'Beverages',
    'Mops, Brushes & Scrubs',
    'Disposables',
]
cloth_shop = [
    'Apparel',
    'Personal Wear',
    'Men',
    'Women',
    'Boys',
    'Girls',
    'Junior Boys',
    'Junior Girls',
    'Infants',
]
pharmacy_shop = [
    'Personal Care',
    'Personal Care & Grooming',
    'Mom & Baby Care',
    'Wellness',
    'Fitness',
    'Ayush',
    'Covid Essentials',
    'Health Care Devices',
    'Treatments',
    'Mom & Baby',
]
beauty_shop = [
    'Make-Up',
    'Hair',
    'Skin Care',
    'Fragrances',
    "Men's Grooming",
    'Tools & Appliances',
]
sports_shop = ['Toys, Games & Fitness']
hardware_shop = [
    'Carpentry & work accessories',
    'Industrial & Scientific Supplies',
    'Building Supplies & Measuring Tools',
    'Hardware & Plumbing',
    'Home Safety & Automation',
    'Kitchen & Bath Fixtures',
    'Paint, Wall Treatments & Supplies',
]
electronics_shop = ['Power & Hand Tools', 'Electrical', 'Home Appliances']
mobile_shop = [
    'Tech',
    'Mobiles & Tablets',
    'TV & Speaker',
    'Computers',
    'Cameras',
    'Kitchen Appliances',
    'Smart Devices',
    'Gaming',
    'Phones',
    'Accessories',
]
stationary_shop = ['Office Products', 'Books', 'Stationery']
jewellery_shop = ['Fine Jewellery']
home_decor_shop = [
    'Furniture',
    'Garden & Outdoor',
    'Home Decor',
    'Furnishing',
    'Kitchenware',
    'Dining',
]

# Shop label -> list of sub-categories. The keys become the class labels of the classifier.
shops = {
    'vegetable_shop': vegetable_shop,
    'dairy_shop': dairy_shop,
    'kirana_shop': kirana_shop,
    'cloth_shop': cloth_shop,
    # NOTE(review): key is spelled 'pharamacy_shop'; kept as-is because it is used as a label.
    'pharamacy_shop': pharmacy_shop,
    'beauty_shop': beauty_shop,
    'sports_shop': sports_shop,
    'hardware_shop': hardware_shop,
    'electronics_shop': electronics_shop,
    'mobile_shop': mobile_shop,
    'stationary_shop': stationary_shop,
    'jewellery_shop': jewellery_shop,
    # NOTE(review): the home-decor list is registered under the 'furniture_shop' label.
    'furniture_shop': home_decor_shop,
}

In [32]:
def shop_name(x):
    """Return the shop label whose sub-category list contains ``x``.

    Looks ``x`` up in the module-level ``shops`` mapping. If several shops
    list the same sub-category, the one defined last in ``shops`` wins.
    Returns ``'other_shop'`` when no shop lists ``x``.
    """
    matching = [name for name, sub_categories in shops.items() if x in sub_categories]
    return matching[-1] if matching else 'other_shop'

# Label every row with the shop type that sells its sub-category.
df['shop'] = df['sub_category'].apply(shop_name)

In [33]:
# (rows, columns) of the frame after the `shop` column was added.
df.shape

(158172, 5)

There are 158,172 product items in total (before rows without an item description are dropped).

In [34]:
# remove rows where there is no item description
# .copy() makes the filtered frame independent of the original, so the column
# assignments that follow do not raise SettingWithCopyWarning on a slice.
df = df.loc[~df['items'].isna(), :].copy()

Cleaning Text data

1. Remove non-alphabetic characters (digits and punctuation) and lower-case the text
2. Remove stop words
3. Lemmatize each word

In [35]:
def clean_text(words):
    """Keep only ASCII letters from ``words``, lower-cased and single-space separated.

    Every character outside a-z / A-Z (digits, punctuation, whitespace) acts as
    a separator; the remaining letter runs are joined with one space each.
    """
    letters_only = re.sub('[^a-zA-Z]', " ", words)
    return " ".join(letters_only.lower().split())

def remove_stop_words(text, stop_words):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    Tokens are lower-cased before the membership check, and the surviving
    tokens are returned lower-cased, joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)

def lemmatize_words(text, lemma):
    """Lemmatize each whitespace-separated token of ``text`` with ``lemma.lemmatize``.

    ``lemma`` is any object exposing ``lemmatize(word)``; the results are
    joined back together with single spaces.
    """
    return " ".join(map(lemma.lemmatize, text.split()))

# English stop words plus filler tokens that appear in the item titles.
stop_words = stopwords.words('english') + ['approx', 'g', 'pc']
lemma = WordNetLemmatizer()

# One pass per title: clean -> drop stop words -> lemmatize.
df['clean_items'] = df['items'].apply(
    lambda item: lemmatize_words(remove_stop_words(clean_text(item), stop_words), lemma)
)

df.head()


Unnamed: 0,category,sub_category,href,items,shop,clean_items
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Fresh Dates (Pack) (Approx 450 g - 500 g),vegetable_shop,fresh date pack
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Tender Coconut Cling Wrapped (1 pc) (Approx 90...,vegetable_shop,tender coconut cling wrapped
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Dates Imported (Approx 400 g - 500 g),vegetable_shop,date imported
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Papaya (Each) (Approx. 800 g - 1600 g),vegetable_shop,papaya
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Watermelon Kiran Big 1 pc (Approx. 2800 g - 40...,vegetable_shop,watermelon kiran big


In [36]:
# Save the cleaned frame, including the derived `shop` and `clean_items` columns, to parquet.
df.to_parquet('./data/jio_mart_items_cleaned.parquet')

In [37]:
# Split into train and test: 30% of the rows are held out, and random_state pins the shuffle.
# NOTE(review): the split is not stratified although the shop classes are very uneven
# (the report below shows 23 jewellery_shop vs 8561 furniture_shop test rows) —
# consider stratify=df['shop'].
train, test = train_test_split(df, test_size=0.3, random_state=100)

In [38]:
# create tagged document for Doc2Vec
def _to_tagged_document(row):
    """Wrap one row's cleaned title as a TaggedDocument tagged with the row's shop label."""
    return TaggedDocument(words=word_tokenize(row['clean_items']), tags=[row.shop])


train_tag = train.apply(_to_tagged_document, axis=1)
test_tag = test.apply(_to_tagged_document, axis=1)

In [39]:
# creating a Doc2Vec model

# dm = 0, use the distributed bag of words (PV-DBOW) training algorithm
# vector_size = 100, dimensionality of the feature vectors
# window = 2, maximum distance between the current and predicted word within a sentence
# sample = 0, threshold above which higher-frequency words are randomly down-sampled (0 turns it off)
# min_count = 2, ignores all words with total frequency lower than this

doc = Doc2Vec(dm=0, vector_size=100, min_count=2, window=2, sample=0)
# build the vocabulary from the training documents only
doc.build_vocab(train_tag)
# shown as the cell output: total word count recorded for the training corpus
doc.corpus_total_words

1035586

In [40]:
# train model: 10 epochs over the tagged training documents;
# total_examples reuses the document count stored on the model
doc.train(train_tag, total_examples=doc.corpus_count, epochs=10)

In [41]:
# save the trained Doc2Vec model to disk
# NOTE(review): assumes the ./models directory already exists — confirm
doc.save('./models/model.doc2vec')

In [42]:
# building final vector feature for classifier
def final_vector(model, input_docs):
    """Infer a feature vector for every tagged document.

    Parameters
    ----------
    model : object exposing ``infer_vector(words)`` (the Doc2Vec model here)
    input_docs : iterable of documents with ``words`` and ``tags`` attributes

    Returns
    -------
    (targets, feature_vectors) : two tuples of equal length. ``targets`` holds
    the first tag of each document and ``feature_vectors`` the matching
    inferred vector. Both are empty tuples when ``input_docs`` is empty
    (unpacking ``zip(*[])`` would raise ValueError in that case).
    """
    targets = []
    feature_vectors = []
    for tagged in input_docs:
        targets.append(tagged.tags[0])
        feature_vectors.append(model.infer_vector(tagged.words))
    return tuple(targets), tuple(feature_vectors)

In [43]:
# Infer one feature vector per document and pair it with its shop label (labels come back first).
y_train, x_train = final_vector(doc, train_tag)
y_test, x_test = final_vector(doc, test_tag)

In [44]:
# train the classifier: fit logistic regression on the inferred training vectors,
# then predict shop labels for the held-out vectors
log_reg = LogisticRegression(n_jobs=4, C=5)
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)

In [45]:
# test the model on the held-out set
# Both metrics take (y_true, y_pred) in that order; accuracy was previously
# called with the arguments swapped, which is easy to copy into metrics that are
# not symmetric.
print(f'Testing accuracy: {accuracy_score(y_test, y_pred)}')
print(f'F1 score: {f1_score(y_test, y_pred, average="weighted")}')

Testing accuracy: 0.9443531048438093
F1 score: 0.9441358111626635


In [46]:
# Per-class precision, recall, F1 and support on the held-out set.
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

     beauty_shop       0.90      0.90      0.90      2723
      cloth_shop       0.97      0.99      0.98      7759
      dairy_shop       0.97      0.83      0.89       162
electronics_shop       0.95      0.90      0.92      1458
  furniture_shop       0.96      0.96      0.96      8561
   hardware_shop       0.94      0.85      0.89       485
  jewellery_shop       1.00      1.00      1.00        23
     kirana_shop       0.93      0.95      0.94      7229
     mobile_shop       0.97      0.97      0.97      4581
      other_shop       0.96      0.95      0.95      3637
  pharamacy_shop       0.90      0.89      0.90      5723
     sports_shop       0.94      0.94      0.94      2491
 stationary_shop       0.95      0.93      0.94      2511
  vegetable_shop       1.00      0.54      0.70        99

        accuracy                           0.94     47442
       macro avg       0.95      0.90      0.92     47442
    weighted

In [47]:
# save logistic regression model as a pickle under ./models
# NOTE(review): only unpickle this file from a trusted location — loading a pickle can execute arbitrary code
model_name = 'logistic_regr_model.pkl'
with open(f'./models/{model_name}', 'wb') as file:
    pickle.dump(log_reg, file)

In [76]:
# Classify a free-text query. The query goes through the same cleaning and
# tokenisation as the training titles, so its tokens line up with the Doc2Vec
# vocabulary even when it contains capitals, digits or punctuation.
query = 'face'
query_clean = lemmatize_words(remove_stop_words(clean_text(query), stop_words), lemma)
feature = doc.infer_vector(word_tokenize(query_clean))

log_reg.predict(feature.reshape(1, -1)).tolist()[0]

'cloth_shop'