In [1]:
import pickle

In [2]:
import pandas as pd

# data transformation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

# binary relevance
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

# classification models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# performance metric
from sklearn.metrics import f1_score

# model pipeline
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer 

# text processing library
# from text_processing import TextProcessing

# class TextProcessor(BaseEstimator):

#     def __init__(self, text_preprocessing_model, text_column):
#         self.text_column = text_column
#         self.text_preprocessing_model = text_preprocessing_model

#     def fit(self, documents, y=None):
#         return self

#     def transform(self, x_dataset):
#         x_dataset['cleaned_text'] = x_dataset[self.text_column].apply(lambda x: self.text_preprocessing_model.clean_text(x))
#         return x_dataset
import sister

class SentenceEmbedding():

    def __init__(self, embedding_type = None):
        # if embedding_type == 'bert':
        #     self.sentence_embedding = sister.BertEmbedding(lang="en")
        # else:
        #     self.sentence_embedding = sister.MeanEmbedding(lang="en")
        if embedding_type == 'bert':
            self.sentence_embedding = sentence_embedding
        else:
            self.sentence_embedding = sentence_embedding_bert

    def get_sentence_embedding(self, sentence):
        if type(sentence) == str:
            return self.sentence_embedding(sentence)
        else:
            return [self.sentence_embedding(str(sent)) for sent in list(sentence)]

class TextVectorizer(BaseEstimator):

    def __init__(self, text_column, vectorizer_algorithm, embedding_type = None):
        self.text_column = text_column
        self.vectorizer_algorithm = vectorizer_algorithm
        self.vectorizer = None
        self.sentence_embedding = SentenceEmbedding(embedding_type) if vectorizer_algorithm == 'sentence_embeddings' else None

    def fit(self, x_dataset, y=None):
        if self.vectorizer_algorithm == 'count_vectorizer':
            pass
        elif self.vectorizer_algorithm == 'tfidf_vectorizer':
            self.vectorizer = TfidfVectorizer()
            self.vectorizer.fit(x_dataset)
        elif self.vectorizer_algorithm == 'sentence_embeddings':
            pass
        else:
            raise Exception(f'invalid vectorizer_algorithm: {vectorizer_algorithm}')
        return self

    def transform(self, x_dataset):
        if self.vectorizer_algorithm == 'count_vectorizer':
            pass
        elif self.vectorizer_algorithm == 'tfidf_vectorizer':
            x_dataset = self.vectorizer.transform(x_dataset)
        elif self.vectorizer_algorithm == 'sentence_embeddings':
            self.sentence_embedding.get_sentence_embedding(x_dataset)
        else:
            raise Exception(f'invalid vectorizer_algorithm: {vectorizer_algorithm}')
        return x_dataset

class TextClassification():

    def __init__(self, data, text_column, label_column, classification_type, model_algorithm, vectorizer_algorithm):
        
        # assigning params to instance variable
        self.data = data
        self.text_column = text_column
        self.label_column = label_column
        self.classification_type = classification_type
        self.model_algorithm = model_algorithm
        self.vectorizer_algorithm = vectorizer_algorithm
        
        # self.text_preprocessing_model = text_preprocessing_model

        # process data
        self.process_data()

        #building pipeline
        self._build_model_pipeline()

    def process_data(self,):
        
        self.data = self.data[[self.text_column, self.label_column]]
        
        # transforming target label
        if self.classification_type == 'multi-class':
            self.encoder = LabelEncoder()
            self.data[self.label_column] = self.encoder.fit_transform(self.data[self.label_column])
        else:
            self.encoder = MultiLabelBinarizer()
            self.data[self.label_column] = self.encoder.fit_transform(self.data[self.label_column])

        # splitting of training and test data
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(self.data[self.text_column], self.data[self.label_column], test_size=0.2, shuffle=True, random_state=1)

    # building end to end pipeline
    def _build_model_pipeline(self,):

        pipeline_steps = list()
        pipeline_steps.append(('text_vectorizer', TextVectorizer(self.text_column, self.vectorizer_algorithm)))
        # pipeline_steps.append(('column_transformation', self._load_column_transformer()))

        if self.model_algorithm == 'LR' or self.model_algorithm == None:
            pipeline_steps.append(('LogisticRegression', OneVsRestClassifier(LogisticRegression())))

        elif self.model_algorithm == 'NB':
            pipeline_steps.append(('MultinomialNB', OneVsRestClassifier(MultinomialNB())))

        elif self.model_algorithm == 'SVC':
            pipeline_steps.append(('LinearSVC', OneVsRestClassifier(LinearSVC())))

        elif self.model_algorithm == 'XGB':
            pipeline_steps.append(('XGBClassifier', OneVsRestClassifier(XGBClassifier(verbosity = 0))))

        self.model_pipeline = Pipeline(steps=pipeline_steps)

    # pipeline component for transforming any columns in input dataframe
    def _load_column_transformer(self,):
        return None

    # training pipeline
    def train_pipeline(self,):
        self.model_pipeline.fit(self.train_x, self.train_y)

    # evaluating pipeline
    def evaluate_pipeline(self,):
        y_pred = self.model_pipeline.predict(self.test_x)
        metric_score = f1_score(self.test_y, y_pred, average="micro")
        print(f'F1 score for {self.model_algorithm}: {metric_score}')

    # prediction using trained pipeline
    def predict(self, input_text):
        input_text = [input_text]
        prediction = self.model_pipeline.predict(input_text)
        return self.encoder.inverse_transform(prediction)


### loading model

In [3]:
model_path = '/Users/mukul4.verma/Documents/workspace/catalog_indexer/src/models/classification_pipelines.pkl'
with open(model_path, 'rb') as file:
    model = pickle.load(file)

In [4]:
model

{'New_level1_LR': <__main__.TextClassification at 0x1107ee280>,
 'New_level1_NB': <__main__.TextClassification at 0x12e80d070>,
 'New_level1_SVC': <__main__.TextClassification at 0x12e8e3880>,
 'New_level2_LR': <__main__.TextClassification at 0x12e9ae1f0>,
 'New_level2_NB': <__main__.TextClassification at 0x12f1e97c0>,
 'New_level2_SVC': <__main__.TextClassification at 0x12f4b2b50>,
 'New_level3_LR': <__main__.TextClassification at 0x12f71cd00>,
 'New_level3_NB': <__main__.TextClassification at 0x12fb909d0>,
 'New_level3_SVC': <__main__.TextClassification at 0x12ffa9a00>,
 'Valued_level2_LR': <__main__.TextClassification at 0x1302e4df0>,
 'Valued_level2_NB': <__main__.TextClassification at 0x1304553a0>,
 'Valued_level2_SVC': <__main__.TextClassification at 0x1305d4220>}

### loading catalog

In [5]:
import pandas as pd
catalog = pd.read_csv("/Users/mukul4.verma/Documents/workspace/catalog_indexer/data/jiomart/raw/catalog_cleaned.csv")
catalog.shape

(121775, 14)

In [6]:
catalog

Unnamed: 0.1,Unnamed: 0,productid,productskuid,New Product Name,Quantity,New_level1,New_level2,Valued_level2,New_level3,brandname,orig_score,aggr_score,processed_title,processed_title_final
0,0,100026392,10030996.0,24 mantra organic cloves,50 g,masala,spices & masalas,spices & masalas,whole spices,24 Mantra,0.0,0.0,24 mantra organic clove,mantra organic clove
1,1,100145952,10157693.0,3 ply face mask,10 pcs,,masks,,face masks,Netplay,0.0,0.0,3 ply face mask,ply face mask
2,2,100139524,10151265.0,command white l plastic utility hook 1 hook 2 ...,3M,home,home improvement,,,3M,0.0,0.0,command white l plastic utility hook 1 hook 2 ...,command white plastic utility hook hook strip
3,3,100000074,10000117.0,a and w diet root beer aged vanilla,355 ml,,soft drinks,soft drinks,aerated drink,A & W,0.0,0.0,a and w diet root beer age vanilla,and diet root beer age vanilla
4,4,100088833,10101294.0,armr 100 herbal blackberry antihangover drink,60 ml,,,,over the counter remedies,ARMR,0.0,0.0,armr 100 herbal blackberry antihangover drink,armr herbal blackberry antihangover drink
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121770,121770,100028200,10033308.0,wonderland foods premium chaat masala flavoure...,100 g,masala,dry fruits & nuts,dry fruits & nuts,dried fruits,Wonderland Foods,0.0,0.0,wonderland food premium chaat masala flavour p...,wonderland food premium chaat masala flavour p...
121771,121771,100003253,10003851.0,zed candy mini jawbreaker,35 g,,candies,,,JAWBREAKER Sour,0.0,0.0,zed candy mini jawbreaker,zed candy mini jawbreaker
121772,121772,100028067,10033140.0,zen tofu pouch,200 g,dummy1,dummy2,dummy2,dummy3,Zen,0.0,0.0,zen tofu pouch,zen tofu pouch
121773,121773,100147561,10159302.0,irin stainless steel agarbatti stand,,,pooja needs,pooja needs,other pooja accessories,Irin,0.0,0.0,irin stainles steel agarbatti stand,irin stainles steel agarbatti stand


### predicting missing attributes

In [7]:
from text_processing import TextProcessing
from load_config import config

In [8]:
tp = TextProcessing()

In [9]:
def get_missing_attribute(model, text, label):
    if type(text) == str and (type(label) != str or len(label) < 1 or 'dummy'  in label):
        # text = tp.clean_text(str(text))
        return model.predict(text)[0]
    return label

# def get_missing_attribute(model, text, label):
#     if (type(label) != str or len(label) < 1 or 'dummy'  in label):
#         # text = tp.clean_text(str(text))
#         return model.predict(text)[0]
#     return label

In [10]:
domain_config = config['jiomart_autosuggest']

In [29]:
# temp_catalog = catalog.iloc[60:80]
temp_catalog = catalog

In [30]:
temp_catalog.head()

Unnamed: 0.1,Unnamed: 0,productid,productskuid,New Product Name,Quantity,New_level1,New_level2,Valued_level2,New_level3,brandname,orig_score,aggr_score,processed_title,processed_title_final
0,0,100026392,10030996.0,24 mantra organic cloves,50 g,masala,spices & masalas,spices & masalas,whole spices,24 Mantra,0.0,0.0,24 mantra organic clove,mantra organic clove
1,1,100145952,10157693.0,3 ply face mask,10 pcs,,masks,,face masks,Netplay,0.0,0.0,3 ply face mask,ply face mask
2,2,100139524,10151265.0,command white l plastic utility hook 1 hook 2 ...,3M,home,home improvement,,,3M,0.0,0.0,command white l plastic utility hook 1 hook 2 ...,command white plastic utility hook hook strip
3,3,100000074,10000117.0,a and w diet root beer aged vanilla,355 ml,,soft drinks,soft drinks,aerated drink,A & W,0.0,0.0,a and w diet root beer age vanilla,and diet root beer age vanilla
4,4,100088833,10101294.0,armr 100 herbal blackberry antihangover drink,60 ml,,,,over the counter remedies,ARMR,0.0,0.0,armr 100 herbal blackberry antihangover drink,armr herbal blackberry antihangover drink


In [31]:
import swifter
for column in domain_config['columns_for_missing_attributes']:
    temp_catalog[f'{column}_processed'] = temp_catalog.swifter.apply(lambda row: get_missing_attribute(model[f'{column}_SVC'], row["processed_title_final"], row[column]), axis = 1)
    # print(column)

# model['New_level1_SVC'].predict('mantra organic clove')

Dask Apply: 100%|██████████| 16/16 [23:56<00:00, 89.79s/it] 
Dask Apply: 100%|██████████| 16/16 [11:19<00:00, 42.49s/it]
Dask Apply:   0%|          | 0/16 [8:34:21<?, ?it/s]


KeyboardInterrupt: 

In [None]:
temp_catalog.columns

In [None]:
temp_catalog.to_csv('/Users/mukul4.verma/Documents/workspace/catalog_indexer/data/jiomart/processed/processed_catalog.csv', index = False)

In [16]:
temp_catalog[['New Product Name']+sorted([column for column in temp_catalog.columns if "level" in column])]

Unnamed: 0,New Product Name,New_level1,New_level1_processed,New_level2,New_level2_processed,New_level3,New_level3_processed,Valued_level2,Valued_level2_processed
60,asian airseal turn lock green plastic jar with...,,general purpose items,,kitchen,storage container,storage container,,bathroom ware
61,asparagus 1 bunch,vegetables,vegetables,vegetables,vegetables,,100% juice,,ready to cook
62,atul surti biscuits,,biscuits,cookies,cookies,cookies,cookies,,biscuits
63,avacado approx,dummy1,vegetables,dummy2,fresh vegetables,dummy3,garlic,dummy2,fresh vegetables
64,avon haiku perfumed skin softener,,beauty,face care,face care,creams,creams,face care,face care
65,ayur sun screen lotion spf,,lotions,skin care,skin care,lotion,lotion,skin care,skin care
66,ayur sun screen lotion spf,,lotions,skin care,skin care,lotion,lotion,skin care,skin care
67,ayur tulsi face pack,,beauty products,face care,face care,,face wash,,face care
68,ayur tulsi face pack,,beauty products,face care,face care,,face wash,,face care
69,ayush whitening rock salt toothpaste,,oral care,oral care,oral care,toothpastes,toothpastes,oral care,oral care


In [61]:
tp.clean_text("24 mantra organic cloves")

'24 mantra organic clove'

In [141]:
# sorted(list(catalog['New_level1'].dropna().unique()))

['agriculture products',
 'antiperspirant',
 'apparel',
 'atta',
 'baby',
 'bakery',
 'beauty',
 'beauty products',
 'beverages',
 'biscuits',
 'bread',
 'breakfast',
 'canned food',
 'chicken',
 'chocolates',
 'cleaning',
 'cocoa',
 'coffee',
 'confectionery',
 'consumable',
 'cookies',
 'crop management products',
 'crop protection',
 'dairy',
 'dals',
 'deodorant',
 'dry fruits & nuts',
 'dummy1',
 'edta chelates',
 'feminine needs',
 'foodgrains',
 'freshline',
 'frozen vegetarian',
 'general purpose items',
 'ghee',
 'health',
 'health drinks',
 'home',
 'home care',
 'hygiene',
 'instant',
 'laundry supplies',
 'living',
 'loose items',
 'lotions',
 'makeup',
 'marka',
 'marketing material',
 'masala',
 'masalas',
 'micronutrient',
 'milk',
 'mom',
 'noodles',
 'oils',
 'oral care',
 'other plant nutrients',
 'personal hygiene',
 'pickles',
 'rice',
 'salt',
 'sauces',
 'secondary nutrients',
 'secondary plant nutrients',
 'sexual wellness',
 'shaving products',
 'snacks',
 'soap

In [17]:
# catalog[catalog['Valued_level2'] == 'rice'][['New Product Name','Valued_level2']]

In [66]:
catalog[catalog['Valued_level2'] == 'atta & flours'][['New Product Name','Valued_level2']]

Unnamed: 0,New Product Name,Valued_level2
142,conscious food organic wheat atta,atta & flours
231,fps lapsi rawa,atta & flours
295,good life mp sharbatti atta,atta & flours
296,good life mp sharbatti atta,atta & flours
496,maayi appa soda,atta & flours
...,...,...
120922,anil samba ravai,atta & flours
121019,double horse appam idiyappam pathiri roasted r...,atta & flours
121372,bhagirathi kulith pith horse gram flour ready mix,atta & flours
121668,reliance select idly rawa,atta & flours


In [26]:
print(catalog['New_level1'].value_counts().tail(30))

milk                         18
edta chelates                15
rice                         15
sugar                        14
water                        14
salt                         13
tobacco                      12
crop management products      5
micronutrient                 5
welcome gift                  5
agriculture products          5
ghee                          4
secondary plant nutrients     3
confectionery                 3
specialty nutrients           3
sexual wellness               2
other plant nutrients         2
secondary nutrients           2
crop protection               2
sweets                        2
cocoa                         2
breakfast                     1
marka                         1
marketing material            1
canned food                   1
foodgrains                    1
veg seed                      1
cookies                       1
consumable                    1
antiperspirant                1
Name: New_level1, dtype: int64


In [28]:
print(catalog['New_level1'].value_counts().head(50))

masala                   7780
snacks                   4329
oils                     2064
dummy1                   1861
home                     1612
baby                      847
instant                   667
bakery                    617
hygiene                   558
dairy                     479
health drinks             425
general purpose items     403
beauty                    280
loose items               253
vegetables                253
home care                 207
soaps                     170
biscuits                  158
sauces                    148
living                    146
deodorant                 141
wellness                  136
masalas                   114
mom                       108
dry fruits & nuts         100
apparel                    99
makeup                     97
chocolates                 96
beauty products            86
oral care                  86
cleaning                   85
health                     82
frozen vegetarian          73
coffee    