In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

import joblib

from utils import download_nltk_dependencies, get_stop_words, clean_text, lemmatize_words, remove_stop_words, print_accuracies, print_classification_report

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [3]:
download_nltk_dependencies()

Downloading nltk dependencies, these are downloaded only once


In [4]:
df = pd.read_parquet('./data/jio_mart_items.parquet')
df.head()

Unnamed: 0,category,sub_category,href,items
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Fresh Dates (Pack) (Approx 450 g - 500 g)
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Tender Coconut Cling Wrapped (1 pc) (Approx 90...
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Dates Imported (Approx 400 g - 500 g)
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Papaya (Each) (Approx. 800 g - 1600 g)
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Watermelon Kiran Big 1 pc (Approx. 2800 g - 40...


In [5]:
print(df.sub_category.unique())

['Fruits & Vegetables' 'Premium Fruits' 'Dairy & Bakery' 'Staples'
 'Snacks & Branded Foods' 'Beverages' 'Personal Care' 'Home Care'
 'Apparel' 'Mom & Baby Care' 'Books' 'Pets' 'Kitchenware' 'Dining'
 'Furnishing' 'Home Decor' 'Furniture' 'Home Appliances'
 'Toys, Games & Fitness' 'Electrical' 'Bathroom & Laundry Accessories'
 'Disposables' 'Stationery' 'Bags & Travel Luggage'
 'Mops, Brushes & Scrubs' 'Auto Care' 'Garden & Outdoor'
 'Carpentry & work accessories' 'Pooja Needs' 'Bathroom & Laundry'
 'Industrial & Scientific Supplies' 'Building Supplies & Measuring Tools'
 'Hardware & Plumbing' 'Home Safety & Automation'
 'Kitchen & Bath Fixtures' 'Paint, Wall Treatments & Supplies'
 'Power & Hand Tools' 'Handloom & Handicraft' 'Personal Wear' 'Men'
 'Women' 'Boys' 'Girls' 'Junior Boys' 'Junior Girls' 'Infants' 'Tech'
 'Mobiles & Tablets' 'TV & Speaker' 'Computers' 'Cameras'
 'Kitchen Appliances' 'Personal Care & Grooming' 'Smart Devices' 'Gaming'
 'Accessories' 'Phones' 'Office Product

In [6]:
# create shop mapping
vegetable_shop = ['Fruits & Vegetables', 'Premium Fruits']
dairy_shop = ['Dairy & Bakery']
kirana_shop = ['Staples', 'Snacks & Branded Foods', 'Bathroom & Laundry Accessories', 'Pooja Needs', 'Beverages', 'Mops, Brushes & Scrubs', 'Disposables']
cloth_shop = ['Apparel', 'Personal Wear', 'Men', 'Women', 'Boys', 'Girls', 'Junior Boys', 'Junior Girls', 'Infants']
pharmacy_shop = ['Personal Care', 'Personal Care & Grooming', 'Mom & Baby Care', 'Wellness', 'Fitness', 'Ayush', 'Covid Essentials', 'Health Care Devices', 'Treatments', 'Mom & Baby']
beauty_shop = ['Make-Up', 'Hair', 'Skin Care', 'Fragrances', "Men's Grooming", 'Tools & Appliances']
sports_shop = ['Toys, Games & Fitness']
hardware_shop = ['Carpentry & work accessories', 'Industrial & Scientific Supplies', 'Building Supplies & Measuring Tools', 'Hardware & Plumbing', 'Home Safety & Automation', 'Kitchen & Bath Fixtures', 'Paint, Wall Treatments & Supplies']
electronics_shop = ['Power & Hand Tools', 'Electrical', 'Home Appliances']
mobile_shop = ['Tech', 'Mobiles & Tablets', 'TV & Speaker', 'Computers', 'Cameras', 'Kitchen Appliances', 'Smart Devices', 'Gaming', 'Phones', 'Accessories']
stationary_shop = ['Office Products', 'Books', 'Stationery']
jewellery_shop = ['Fine Jewellery']
home_decor_shop = ['Furniture', 'Garden & Outdoor', 'Home Decor', 'Furnishing', 'Kitchenware', 'Dining']

shops = {'vegetable_shop': vegetable_shop, 'dairy_shop': dairy_shop, 'kirana_shop': kirana_shop, 'cloth_shop': cloth_shop, 'pharamacy_shop': pharmacy_shop,
        'beauty_shop': beauty_shop, 'sports_shop': sports_shop, 'hardware_shop': hardware_shop, 'electronics_shop': electronics_shop, 'mobile_shop': mobile_shop,
        'stationary_shop': stationary_shop, 'jewellery_shop': jewellery_shop, 'furniture_shop': home_decor_shop}

In [7]:
def shop_name(x):
    shop = None
    for k, v in shops.items():
        if x in v:
            shop = k
    if shop is None:
        shop = 'other_shop' 
    return shop

df['shop'] = df['sub_category'].apply(lambda x: shop_name(x))

In [8]:
df.shape

(158172, 5)

We have total 158172 product items

In [9]:
# remove rows where there is no item description
df = df.loc[~df['items'].isna(), :]

Checking for class imbalance

In [10]:
summary = df.groupby('shop')['items'].count().reset_index()
summary['perc'] = summary['items']*100/summary['items'].sum()
summary

Unnamed: 0,shop,items,perc
0,beauty_shop,9222,5.831579
1,cloth_shop,25978,16.42732
2,dairy_shop,516,0.326295
3,electronics_shop,4912,3.106128
4,furniture_shop,28500,18.02212
5,hardware_shop,1613,1.019989
6,jewellery_shop,72,0.04553
7,kirana_shop,24024,15.191698
8,mobile_shop,15352,9.707915
9,other_shop,11850,7.493408


Cleaning Text data

1. Remove non-alphanumeric characters
2. Remove stop words
3. Lemmatize each line of text

In [11]:
stop_words = get_stop_words()
lemma = WordNetLemmatizer()

df['clean_items'] = df['items'].apply(lambda x: clean_text(x))
df['clean_items'] = df['clean_items'].apply(lambda x: remove_stop_words(x, stop_words))
df['clean_items'] = df['clean_items'].apply(lambda x: lemmatize_words(x, lemma))

df.head()


Unnamed: 0,category,sub_category,href,items,shop,clean_items
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Fresh Dates (Pack) (Approx 450 g - 500 g),vegetable_shop,fresh date pack
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Tender Coconut Cling Wrapped (1 pc) (Approx 90...,vegetable_shop,tender coconut cling wrapped
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Dates Imported (Approx 400 g - 500 g),vegetable_shop,date imported
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Papaya (Each) (Approx. 800 g - 1600 g),vegetable_shop,papaya
0,Groceries,Fruits & Vegetables,https://www.jiomart.com/c/groceries/fruits-veg...,Watermelon Kiran Big 1 pc (Approx. 2800 g - 40...,vegetable_shop,watermelon kiran big


In [12]:
# save cleaned data
df.to_parquet('./data/jio_mart_items_cleaned.parquet')

In [13]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(df['clean_items'], df['shop'], stratify=df['shop'])

In [14]:
# grid search on multiple models with different parameters

# Initialze the estimators
clf1 = RandomForestClassifier(random_state=42)
clf2 = SVC(probability=True, random_state=42)
clf3 = LogisticRegression(random_state=42)
clf4 = DecisionTreeClassifier(random_state=42)
clf5 = KNeighborsClassifier()
clf6 = MultinomialNB()

# tfidf parameters
tfidf_params = {
    'tfidf__max_df': (0.25, 0.5, 0.75)
}

classifiers = [clf1, clf2, clf3, clf4, clf5, clf6]

params = [{'classifier': [clf]} for clf in classifiers]

In [15]:
# making pipeline
# using smote to handle class imbalances

pipeline = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('smote', SMOTE(random_state=100)),
        ('classifier', clf1)
    ]
)

In [16]:
# making cv
stratified_kfold = StratifiedKFold(n_splits=3,
                                    shuffle=True,
                                    random_state=11)

In [17]:
# fit the model using grid search

gs = GridSearchCV(
    estimator = pipeline,
    param_grid=params,
    scoring=['f1_weighted', 'precision_weighted', 'recall_weighted'],
    cv=stratified_kfold,
    refit='f1_weighted'
    
)

gs.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
# find best estimator
print(gs.best_estimator_)
print(gs.best_params_)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('smote', SMOTE(random_state=100)),
                ('classifier', SVC(probability=True, random_state=42))])
{'classifier': SVC(probability=True, random_state=42)}


In [19]:
# test the model
y_pred = gs.predict(x_test)

print_accuracies(y_pred, y_test)

print_classification_report(y_pred, y_test)

Testing accuracy: 0.9812065258631593
F1 score: 0.9811910503809396
                  precision    recall  f1-score   support

     beauty_shop       0.97      0.96      0.97      2344
      cloth_shop       0.99      1.00      1.00      6459
      dairy_shop       0.94      0.95      0.94       128
electronics_shop       0.98      0.99      0.98      1215
  furniture_shop       0.98      0.99      0.99      7089
   hardware_shop       0.92      0.98      0.95       380
  jewellery_shop       1.00      1.00      1.00        18
     kirana_shop       0.99      0.97      0.98      6089
     mobile_shop       0.99      0.99      0.99      3860
      other_shop       0.99      0.99      0.99      2964
  pharamacy_shop       0.96      0.96      0.96      4821
     sports_shop       0.97      0.98      0.98      2023
 stationary_shop       0.97      0.97      0.97      2077
  vegetable_shop       0.81      0.99      0.89        68

        accuracy                           0.98     39535
    

In [20]:
# save the model
joblib.dump(gs, './models/grid_search_best_model.pkl')

['grid_search_best_model.pkl']