# <span style="color:green"> <U/> Model & Feature Engineering</span>

In [1]:
#import all the necessary packages.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from scipy.sparse import hstack
import re
import pickle
from sklearn.preprocessing import MinMaxScaler
import sys

In [3]:
# we use the list of stop words that are downloaded from nltk lib.
import nltk # to download only once
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Muheebpashasnr\AppData\Roaming\nltk_data...


list of stop words: {'herself', 'before', 'other', 'don', 'is', "should've", 'does', 'up', 'isn', 'their', 'because', "you're", 'y', 'she', 'hadn', 'whom', 'only', 're', 'myself', 'by', 'theirs', 'ma', "couldn't", 'to', "needn't", 'at', "didn't", 'doesn', 'her', 'off', 'between', 'here', 'needn', 'ain', 'that', 'themselves', 'being', 'me', "you'd", 'each', "mustn't", 'after', 'further', 'into', 'for', 'below', 'same', 'then', "aren't", 'his', 'an', 'under', "shouldn't", 'than', 'haven', 'weren', 'very', 'can', 'wasn', 'few', 't', 'or', "doesn't", 'are', 'ours', 'itself', 'been', "you'll", 'have', 'he', 'nor', 's', 'such', 'not', 'll', 'over', 'hasn', 'my', 'the', "haven't", 'during', "weren't", 'having', 'what', "hasn't", "isn't", 'has', 'mightn', 'through', 'we', 'shan', 'was', "you've", 'them', 'of', 'a', 'wouldn', 'if', 'ourselves', 'now', "shan't", 'couldn', 'didn', 'did', 'out', 'am', 'both', 'some', 'd', 'won', 'own', 'with', 'where', 'how', 'from', "it's", "don't", 'himself', 'y

[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# computing sentiment score for description feature
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Muheebpashasnr\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
# load preprocessed train data
df=pd.read_csv('train_preprocessed_with_clusterlabels.csv')
df.head()

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
0,Flour - Corn,foodgrains_oil_masala,organicstaples,organictattva,organicflours,per calories fat,57.0,60.0,0.05,0.0,1.0,0.0,0.0,0.027308,1
1,Veg. Burger Patty,snacks_brandedfoods,frozenveggies_snacks,yummiez,frozenvegsnacks,real good veg burger patty choice eating beefy...,125.0,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2
2,Dosa Rice/Akki,foodgrains_oil_masala,rice_riceproducts,supersaver,rawrice,rice dosa easy make premium dosa mix used make...,199.0,225.0,0.115556,0.0,0.811,0.189,0.8126,0.098396,3
3,Skimmed Milk Powder,gourmet_worldfood,cooking_bakingneeds,puramate,baking_cakedecorations,puramate skimmed milk powder makes wholesome m...,58.5,65.0,0.1,0.0,0.78,0.22,0.9723,0.028059,2
4,Kiwi Drink,gourmet_worldfood,drinks_beverages,alafresh,gourmetjuices_drinks,ala fresh kiwi fruit drink refreshing fruit dr...,10.0,10.0,0.0,0.0,0.667,0.333,0.8402,0.00378,2


In [6]:
df.columns

Index(['product', 'category', 'sub_category', 'brand', 'type', 'description',
       'sale_price', 'market_price', 'discount_%', 'negative', 'neutral',
       'positive', 'compound', 'sale_price_scaled', 'cluster_label'],
      dtype='object')

In [7]:
df.shape

(21648, 15)

- will vectorize preprocessed description text with tfidf weighted W2V

In [8]:
# loading glove vectors for w2v dimensions
with open(r'F:\Applied AI\Assignments\Self_Case_Study_1\glove_vectors','rb') as f:
    model=pickle.load(f)
    glove_words=set(model.keys()) # creating set of words in glove file

In [9]:
# tfidf vectorizer
tfidf=TfidfVectorizer()
tfidf_description=tfidf.fit_transform(df['description'])

In [10]:
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))
tfidf_words = set(tfidf.get_feature_names())

### tfidf_W2V vectorization of preprocessed description feature

In [11]:
#vectorizing train  data using tfidf-W2v
# average Word2Vec
# compute tfidf word2vec for each description.
from tqdm import tqdm
description_tfidf_w2v_vectors = []; # the avg-w2v for each description is stored in this list
for description in tqdm(df['description']): # for each description
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the description
    for word in description.split(): # for each word in description
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((description.count(word)/len(product.split())))
            tf_idf = dictionary[word]*(description.count(word)/len(description.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    description_tfidf_w2v_vectors.append(vector)

100%|██████████████████████████████████| 21648/21648 [00:16<00:00, 1286.60it/s]


In [13]:
len(description_tfidf_w2v_vectors)

21648

### Label encoding categorical features `'category', 'sub_category', 'brand', 'type'`

In [14]:
#https://stephenallwright.com/label-encode-unseen-values/
from sklearn.preprocessing import LabelEncoder

def label_encode_columns(df, columns, encoders=None):
    if encoders is None:
        encoders = {}
    
        for col in columns:
            unique_values = list(df[col].unique())
            unique_values.append('Unseen')
            le = LabelEncoder().fit(unique_values)
            df[col] = le.transform(df[[col]])
            encoders[col] = le
    
    else:
        for col in columns:
            le = encoders.get(col)
            df[col] = [x if x in le.classes_ else 'Unseen' for x in df[col]]
            df[col] = le.transform(df[[col]])

    return df, encoders

In [15]:
columns=['category', 'sub_category', 'brand', 'type']
df, encoders=label_encode_columns(df, columns, encoders=None)

In [55]:
le_for_test=encoders # saving encoders for test data

In [56]:
le_for_test

{'category': LabelEncoder(),
 'sub_category': LabelEncoder(),
 'brand': LabelEncoder(),
 'type': LabelEncoder()}

In [16]:
df.head()

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
0,Flour - Corn,7,69,1445,302,per calories fat,57.0,60.0,0.05,0.0,1.0,0.0,0.0,0.027308,1
1,Veg. Burger Patty,11,46,2138,176,real good veg burger patty choice eating beefy...,125.0,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2
2,Dosa Rice/Akki,7,77,1876,349,rice dosa easy make premium dosa mix used make...,199.0,225.0,0.115556,0.0,0.811,0.189,0.8126,0.098396,3
3,Skimmed Milk Powder,9,21,1570,30,puramate skimmed milk powder makes wholesome m...,58.5,65.0,0.1,0.0,0.78,0.22,0.9723,0.028059,2
4,Kiwi Drink,9,31,56,193,ala fresh kiwi fruit drink refreshing fruit dr...,10.0,10.0,0.0,0.0,0.667,0.333,0.8402,0.00378,2


### will build a matrix from train data to compute cosine similarity 

In [19]:
# stacking all encoded categorical features, vectorized description text and scaled sael price,discount% and sentiment scores
X_train=np.hstack((description_tfidf_w2v_vectors,df['category'].values.reshape(-1,1),df['sub_category'].values.reshape(-1,1),df['brand'].values.reshape(-1,1), \
                  df['type'].values.reshape(-1,1),df['sale_price_scaled'].values.reshape(-1,1),df['discount_%'].values.reshape(-1,1), \
                  df['negative'].values.reshape(-1,1),df['neutral'].values.reshape(-1,1),df['positive'].values.reshape(-1,1), \
                  df['compound'].values.reshape(-1,1),df['cluster_label'].values.reshape(-1,1)))

In [20]:
X_train.shape

(21648, 311)

In [21]:
#saving to pkl file
pickle.dump(X_train,open('X_train.pkl','wb'))

## Model Building

- will give attributes of query product
- query product should have `category,sub_category,brand,type,description,sale_price,market_price`
- using the above features will do encoding,vectorizing ,scale sale price to train data,compute discount_%, calculate sentiment scores, and assign to nearest cluster 

In [22]:
# function to check all features are given for query product
def missing_features(data):
    """to check any missing or blank or nan values in qury product """
    data.replace('None',np.nan,inplace=True) # replace None with Nan
    data.replace(r'^\s*$',np.nan,regex=True,inplace=True) # replace empty string with NaN
    
   # first to check whether the data has the 8 listed columns[category,sub_category,brand,type,description,sale_price,market_price]
   # to check for any missing values
    if(data.shape[1]<7) or data.isna().any().any():
        print('Please check datapoint\n')
        print('The data has missing values in'+ str(list(data.columns[data.isna().any()]))+ 'columns')
        
        return False
    else:
        return True
       
        

In [23]:
testcase=pd.read_csv('missing_column.csv')
testcase

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,9766,Food Package - Medium,Cleaning & Household,"Disposables, Garbage Bag",,50.0,50.0,"Aluminium Foil, Clingwrap",,


In [24]:
#test case of missing_features func
a=missing_features(testcase) #with msiing column
a

Please check datapoint

The data has missing values in['brand', 'rating', 'description']columns


False

In [25]:
#removing sale price
testcase['sale_price']=''
testcase

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,9766,Food Package - Medium,Cleaning & Household,"Disposables, Garbage Bag",,,50.0,"Aluminium Foil, Clingwrap",,


In [26]:
a=missing_features(testcase) #with missing column
a

Please check datapoint

The data has missing values in['brand', 'sale_price', 'rating', 'description']columns


False

In [27]:
print(type(df.loc[[1],:]))
df.loc[[1],:]

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
1,Veg. Burger Patty,11,46,2138,176,real good veg burger patty choice eating beefy...,125.0,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2


In [28]:
b=missing_features(df.loc[[1],:])
b

True

In [29]:
# function to check for sale price range 
sale_price_minima=df['sale_price'].min()-(df['sale_price'].min()*0.15) # 15 less than min 
sale_price_maxima=df['sale_price'].max()+(df['sale_price'].max()*0.15)
print(sale_price_minima,'\t',sale_price_maxima)

2.0825 	 2300.0


In [30]:
def salepricecheck(data,train_data):
    '''function to check data has sale price relavant to other produts in same brand or whole train data'''
    if data['sale_price'].values.size!=0: #https://stackoverflow.com/questions/11295609/how-can-i-check-whether-a-numpy-array-is-empty-or-not
        if data['brand'].notna().all():
            g=train_data.groupby(['brand'])['sale_price'] # grouping train data based on brand to get group of query product brand
            minimum=g.get_group((data['brand'].values[0])).min() # getting minimum sale price of query product brand in train data
            minimum = minimum-(minimum*0.15) # 15% tolerance
            
            maximum=g.get_group((data['brand'].values[0])).max()
            maximum = maximum + (maximum*0.15)
            
            if minimum <= float(data['sale_price'].values[0]) <= maximum:
                return True
            else:
                print('The sale_price of query product is not in range of other products in same brand,Check sale_price')
                return False
        else:
            if sale_price_minima <= float(data['sale_price'].values[0]) <= sale_price_maxima: # if brand is not available then will see in whole train data
                return True
            else:
                print('The sale_price of query product is not in range of train data products,please check sale_price')
                return False
    else:
        print('No sale_price for query product given')    
        return False
    

In [31]:
#test cases for salepricecheck
a=df.loc[[1],:]
a

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
1,Veg. Burger Patty,11,46,2138,176,real good veg burger patty choice eating beefy...,125.0,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2


In [32]:
out=salepricecheck(a,df)
out

True

In [33]:
# modifying a
a['brand']=np.nan
a

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
1,Veg. Burger Patty,11,46,,176,real good veg burger patty choice eating beefy...,125.0,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2


In [34]:
out=salepricecheck(a,df)
out

True

In [35]:
a['sale_price']=2301 # chamged sale price above maximum
a

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
1,Veg. Burger Patty,11,46,,176,real good veg burger patty choice eating beefy...,2301,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2


In [36]:
out=salepricecheck(a,df)
out

The sale_price of query product is not in range of train data products,please check sale_price


False

In [39]:
# changing brand to Nivea and replacing sale price lower than the brand group minima with 15% tolerance
a['brand']=2138
g=df.groupby(['brand'])['sale_price']
minima=g.get_group((2138)).min()
print(minima)
a['sale_price']=(minima)-(minima*.30)
a

95.0


Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
1,Veg. Burger Patty,11,46,2138,176,real good veg burger patty choice eating beefy...,66.5,125.0,0.0,0.0,0.611,0.389,0.9484,0.06135,2


In [40]:
out=salepricecheck(a,df)
out

The sale_price of query product is not in range of other products in same brand,Check sale_price


False

In [41]:
# function to get sentiment scores 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_scores(data):
    """retuens sentiment analysis scores for description feature"""
    sia=SentimentIntensityAnalyzer()
    
    negative=[]
    neu=[]
    pos=[]
    compound=[]
    if 'description' in data.columns:
        for value in (data['description']):
            i=sia.polarity_scores(value)['neg']
            j=sia.polarity_scores(value)['neu']
            k=sia.polarity_scores(value)['pos']
            l=sia.polarity_scores(value)['compound']
            
            negative.append(i)
            neu.append(j)
            pos.append(k)
            compound.append(l)
            
    data['negative']=negative
    data['neutral']=neu
    data['positive']=pos
    data['compound']=compound
    
    return data       

In [42]:
# function to get tfidf weighted word2vec

def get_tfidf_w2v(data):
    """retuens tfidf weighted w2v for description feature"""
    
    description_tfidf_w2v_vectors = []; # the avg-w2v for each description is stored in this list
    if 'description' in data.columns:
        for description in (data['description']):
            vector = np.zeros(300) # as word vectors are of zero length
            tf_idf_weight =0; # num of words with a valid vector in the description
            for word in description.split(): # for each word in description
                if (word in glove_words) and (word in tfidf_words):
                    vec = model[word] # getting the vector for each word
                    # here we are multiplying idf value(dictionary[word]) and the tf value((description.count(word)/len(product.split())))
                    tf_idf = dictionary[word]*(description.count(word)/len(description.split())) # getting the tfidf value for each word
                    vector += (vec * tf_idf) # calculating tfidf weighted w2v
                    tf_idf_weight += tf_idf
            if tf_idf_weight != 0:
                vector /= tf_idf_weight
            description_tfidf_w2v_vectors.append(vector)
    
    return description_tfidf_w2v_vectors
            

In [43]:
# function to preprocess categorical text features category,sub_category,brand,type

def categorical_preprocess(text):
    """to preprocess categorical features,use .apply for applying function"""
    text=text.str.replace('&','_') # replacing & with _
    text=text.str.replace(',','_') # replacing , with _
    text=text.str.replace("'",'') #replacing '' with ''(no space)
    text=text.str.replace(" ",'') # removing white spaces
    text=text.str.lower() # to lower case
    text=text.str.strip() # removing trailing and leading white space
    
    return text

In [44]:
# function to preprocess description text feature

def preprocess_description(text):
    """ Function which does preprocesiing on prodcut title feature,
        removes stopwords, replaces special character with space, converts to lower case,
    """
    preprocessed_description=[]
    for description in text:
        
        #Delete all the data which are present in the brackets
        description = re.sub(r'\([^()]*\)',' ',description)
        
        #removing urls
        description = re.sub(r'http\S+',' ',description)
        description = re.sub('[^A-Za-z]+', ' ', description) # remove all characters except a-z and A-Z and replace with white space
        # https://gist.github.com/sebleier/554280
        description = ' '.join(word for word in description.split() if word.lower() not in stop_words) # removing stop words
        description = ' '.join(word for word in description.split() if len(word)>2) # removing single letter and two letter words
        description = description.lower().strip()
        preprocessed_description.append(description)
        
    return preprocessed_description

### Function to get similar products of query product(given as dataframe)

In [45]:
#www.geeksforgeeks.org/k-means
# loading cluster centres(means) on train data
with open ('cluster_centers.pkl','rb') as f:
    cluster_centers=pickle.load(f)
    
def get_clusterlabel(X,means=cluster_centers):
    """ to get cluster label"""
    minimum=sys.maxsize # initializing minimum as maximum integer so that the distances will be less than that
    index=-1
    for i in range(len(means)):
        dis=np.linalg.norm(X - means[i]) #https://www.geeksforgeeks.org/calculate-the-euclidean-distance-using-numpy/
        
        if (dis<minimum):
            minmum=dis
            index=i
            
    return index

In [156]:
from sklearn.preprocessing import MinMaxScaler
import sys

def get_similar_products(query,train_data,X_train=X_train,num_results=11):
    """function to give similar products from train_data for query product """
    
    # query: query product
    # train_data: preprocessed train data with all features
    #X_train: matrix to compute cosine similarity
    # num_results: number of similar products to show
    
    if missing_features(query):
        
        # preprocessing categorical columns
        query[['category','sub_category','brand','type']]=query[['category','sub_category','brand','type']].apply(categorical_preprocess)
        
        # encoding categorical features category,sub_category,brand,type
        columns=['category', 'sub_category', 'brand', 'type']
        query, encoders=label_encode_columns(query, columns, le_for_test) # using encoders =encoders got by fitting on
        
        if salepricecheck(query,train_data):
            # preprocessing description
            query['description']=preprocess_description(query['description'].values)
        
            #calculating discount_%
            if 'discount_%' not in query.columns:
                query['discount_%']=(query['market_price']-query['sale_price'])/query['market_price']
        
            query=get_scores(query) # to get sentiment scores
        
            # scaling sale price
            scaler = MinMaxScaler()
            scaler.fit(train_data['sale_price'].values.reshape(-1,1))
            query['sale_price_scaled']=scaler.transform(query['sale_price'].values.reshape(-1,1))
        
            # to get cluster label
            X_q=np.hstack((query['sale_price_scaled'].values.reshape(-1,1),query['discount_%'].values.reshape(-1,1), \
                           query['negative'].values.reshape(-1,1),query['neutral'].values.reshape(-1,1), \
                           query['positive'].values.reshape(-1,1),query['compound'].values.reshape(-1,1)))
        
            query['cluster_label']=get_clusterlabel(X_q) # function to classify item to nearest means(cluster_centres)
        
            tfidf_w2v_vector=get_tfidf_w2v(query) # function to vectorize description text after preprocessing
        
            
            # stacking all values
            #https://numpy.org/doc/stable/reference/generated/numpy.concatenate.html#numpy.concatenate
            X=np.hstack((tfidf_w2v_vector,query['category'].values.reshape(-1,1),query['sub_category'].values.reshape(-1,1), \
                         query['brand'].values.reshape(-1,1),query['type'].values.reshape(-1,1), \
                         query['sale_price_scaled'].values.reshape(-1,1),query['discount_%'].values.reshape(-1,1), \
                         query['negative'].values.reshape(-1,1),query['neutral'].values.reshape(-1,1), \
                         query['positive'].values.reshape(-1,1),query['compound'].values.reshape(-1,1), \
                         query['cluster_label'].values.reshape(-1,1)))
        
            # till now we have preprocessed and vectorized query product
            # now will compute cosine similarities and suggest similar products based on cosine similarity
            cosine_sim=cosine_similarity(X_train,X)
            # np.argsort will return indices of the nearest products 
            indices = np.argsort(cosine_sim.flatten())[-num_results:-1]
            # -1 given to exclude the searched product itself from showing in recommendations as cosinine similarity will be 1 for same product
            # flipping the indices so that the product with more similarity is shown first
            # argsort will do sorting of indices from smallest to largest value
            indices=np.flip(indices)
            #psimilarity will store the similarity 
            psimilarity  = np.sort(cosine_sim.flatten())[-num_results:-1]
            psimilarity = np.flip(psimilarity)
        
            print('The searched/Queried product is:\n',query['product'].values[0])
            print('\nTop '+str(num_results-1)+' Similar products for "'+'\033[1m'+query['product'].values[0]+'\033[0m' +'" are:')
            print("="*70,'\n')
        
            data=train_data[['product','discount_%']].loc[indices]
            data['similarity']=psimilarity.tolist() # adding similarity scores as a new column to data
           
            lst=[] # list to store indices after sorting
            for ind in data.index:
                lst.append(ind)
                print(ind,":",data['product'][ind])
                print('Cosine Similarity with queried product is :',np.round(data['similarity'][ind],6))
                print('Discount %: ',np.round(data['discount_%'][ind])) # restoring discount to original scale 
                print('-'*50,'\n')
            
            return train_data.loc[lst]
    else:
        print('Please check query point for any missing or incomplete information')
        

In [2]:
# loading test data to check model performance
test=pd.read_csv('test_raw.csv')
test.drop(['discount_%'],axis=1,inplace=True)
test.head(2)

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price
0,Vittora Plastic Shoe Rack/Shoe Stand - 2 Steps...,"Kitchen, Garden & Pets",Storage & Accessories,Aristo,Racks & Holders,Virgin plastic made strong and elegant 2 step ...,699.0,923.0
1,Power Plus Disinfectant Toilet Cleaner 1 L + C...,Cleaning & Household,All Purpose Cleaners,Harpic,Toilet Cleaners,Harpic Disinfectant Toilet Cleaner Liquid - Ro...,274.0,324.0


In [158]:
test.shape

(5488, 8)

### checking for some random query products

In [3]:
query=test.loc[[3],:]
query

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price
3,Kantan Watermelon Slice,Gourmet & World Food,Chocolates & Biscuits,Fini,"Marshmallow, Candy, Jelly",Fini Fizzy Watermelon Slices containing fizzy ...,110.0,110.0


In [4]:
query.to_csv('query3.csv',index=False)

In [160]:
le_for_test['category'].classes_

array(['Unseen', 'babycare', 'bakery_cakes_dairy', 'beauty_hygiene',
       'beverages', 'cleaning_household', 'eggs_meat_fish',
       'foodgrains_oil_masala', 'fruits_vegetables', 'gourmet_worldfood',
       'kitchen_garden_pets', 'snacks_brandedfoods'], dtype='<U21')

In [161]:
query

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price
3,Kantan Watermelon Slice,Gourmet & World Food,Chocolates & Biscuits,Fini,"Marshmallow, Candy, Jelly",Fini Fizzy Watermelon Slices containing fizzy ...,110.0,110.0


In [162]:
query['category']

3    Gourmet & World Food
Name: category, dtype: object

In [163]:
# get similar products
similar_products_3=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products_3

The searched/Queried product is:
 Kantan Watermelon Slice

Top 10 Similar products for "[1mKantan Watermelon Slice[0m" are:

8558 : Kantan Cola Bottles Gum
Cosine Similarity with queried product is : 0.999986
Discount %:  0.0
-------------------------------------------------- 

13075 : Kantan Bubble Gum Balls
Cosine Similarity with queried product is : 0.999985
Discount %:  0.0
-------------------------------------------------- 

16625 : Kantan Bubble Gum Balls
Cosine Similarity with queried product is : 0.999985
Discount %:  0.0
-------------------------------------------------- 

5895 : Pucks Crunchy Jellies
Cosine Similarity with queried product is : 0.999985
Discount %:  0.0
-------------------------------------------------- 

5774 : Tennis Balls Gum
Cosine Similarity with queried product is : 0.999985
Discount %:  0.0
-------------------------------------------------- 

21486 : Kantan Tennis Balls
Cosine Similarity with queried product is : 0.999983
Discount %:  0.0
------------

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
8558,Kantan Cola Bottles Gum,9,17,660,263,fini bubblegum cola bottles supplied fini come...,110.0,110.0,0.0,0.0,0.572,0.428,0.9136,0.053841,2
13075,Kantan Bubble Gum Balls,9,17,660,263,kantan bubble gum balls gluten free percent yu...,110.0,110.0,0.0,0.0,0.427,0.573,0.9442,0.053841,2
16625,Kantan Bubble Gum Balls,9,17,660,263,kantan bubble gum balls gluten free percent yu...,40.0,40.0,0.0,0.0,0.427,0.573,0.9442,0.018798,2
5895,Pucks Crunchy Jellies,9,17,660,263,pucks crunchy jellies gluten free percent fat ...,125.0,125.0,0.0,0.0,0.326,0.674,0.9709,0.06135,2
5774,Tennis Balls Gum,9,17,660,263,tennis balls gum gluten free percent yummy per...,125.0,125.0,0.0,0.0,0.447,0.553,0.9442,0.06135,2
21486,Kantan Tennis Balls,9,17,660,263,sporty kids ones well love throw ball around c...,40.0,40.0,0.0,0.0,0.472,0.528,0.9584,0.018798,2
18589,Cola Bottles - Sugar,9,17,660,263,sin gluten free percent grasa fat percent yumm...,225.0,225.0,0.0,0.187,0.466,0.347,0.4767,0.111411,3
20781,Crazy Roller,9,17,660,263,crazy roller gluten free percent yummy percent...,110.0,110.0,0.0,0.091,0.302,0.608,0.9287,0.053841,2
970,Moments,9,17,652,257,introducing ferrero rocher moments special pra...,175.0,175.0,0.0,0.0,0.574,0.426,0.9854,0.086381,2
10965,Moments,9,17,652,257,introducing ferrero rocher moments special pra...,349.0,349.0,0.0,0.0,0.574,0.426,0.9854,0.173488,2


In [164]:
#Summary of results
#https://pandas.pydata.org/docs/user_guide/style.html
#https://stackoverflow.com/questions/59535426/can-you-change-the-caption-font-size-using-pandas-styling
#https://datascientyst.com/set-caption-customize-font-size-color-in-pandas-dataframe/
query=test.loc[[3],:]
d=pd.DataFrame({'Similar products':similar_products_3['product'].values})
d.style.set_caption('Similar products for: '+'"'+query['product'].values+'"').set_table_styles([{'selector':'caption',
                                                                                         'props':[('color','black'),
                                                                                                  ('font-weight', 'bold'),
                                                                                                  ('text-align','left'),
                                                                                                  ('background-color','skyblue'),
                                                                                                 ('font-size', '12px')]}])

Unnamed: 0,Similar products
0,Kantan Cola Bottles Gum
1,Kantan Bubble Gum Balls
2,Kantan Bubble Gum Balls
3,Pucks Crunchy Jellies
4,Tennis Balls Gum
5,Kantan Tennis Balls
6,Cola Bottles - Sugar
7,Crazy Roller
8,Moments
9,Moments


In [5]:
query=test.loc[[10],:]
query

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price
10,Stainless Steel Water Bottle - Green BB 498 2,"Kitchen, Garden & Pets",Storage & Accessories,DP,Water & Fridge Bottles,"This green coloured water bottle, made of stai...",249.0,460.0


In [6]:
query.to_csv('query10.csv',index=False)

In [7]:
query['sale_price']=2700
query.to_csv('query_salepreiceerror.csv',index=False)

In [166]:
# changing sale price to check for saleprice check
query['sale_price']=2700
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

The sale_price of query product is not in range of other products in same brand,Check sale_price


In [8]:
query['brand']=' '
query.to_csv('missingbrand.csv',index=False)

In [167]:
query['sale_price']=249
query['brand']=' ' #keepingspace in brand
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

Please check datapoint

The data has missing values in['brand']columns
Please check query point for any missing or incomplete information


In [9]:
query=test.loc[[765],:]
query.to_csv('query765.csv',index=False)

In [168]:
query=test.loc[[765],:]
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

The searched/Queried product is:
 Brahmi Bhringaraj Taila - Anti Graying

Top 10 Similar products for "[1mBrahmi Bhringaraj Taila - Anti Graying[0m" are:

16751 : Moroccan Argan Hair Serum  - Nourishing, Conditioning
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
-------------------------------------------------- 

19261 : Evening Primrose Oil - Vegetarian Capsule (500 mg)
Cosine Similarity with queried product is : 0.999998
Discount %:  0.0
-------------------------------------------------- 

5783 : Pro Keratin & Argan Oil Hair Nourishing Smooth Therapy Spray
Cosine Similarity with queried product is : 0.999998
Discount %:  0.0
-------------------------------------------------- 

15588 : Cold Pressed Rosemary Lavender Healthy Hair Oil For Thick & Strong Hair
Cosine Similarity with queried product is : 0.999998
Discount %:  0.0
-------------------------------------------------- 

3353 : 10-In-1 Bioactive Hair Oil - 10 Pure Oils in 1
Cosine Similarity with queri

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
16751,"Moroccan Argan Hair Serum - Nourishing, Condi...",3,50,1849,203,get soft glossy hair seconds light non greasy ...,675.0,675.0,0.0,0.06,0.605,0.335,0.9929,0.336687,2
19261,Evening Primrose Oil - Vegetarian Capsule (500...,3,50,1835,203,product contains evening primrose oil known he...,295.0,295.0,0.0,0.121,0.671,0.208,0.5719,0.146454,3
5783,Pro Keratin & Argan Oil Hair Nourishing Smooth...,3,50,1849,205,botanica pro keratin argan oil smooth therapy ...,499.0,499.0,0.0,0.074,0.608,0.318,0.997,0.24858,2
15588,Cold Pressed Rosemary Lavender Healthy Hair Oi...,3,50,1809,203,bring home soulflower rosemary lavender health...,300.0,400.0,0.25,0.065,0.507,0.428,0.9423,0.148957,4
3353,10-In-1 Bioactive Hair Oil - 10 Pure Oils in 1,3,50,1849,203,botanica years research brought breakthrough h...,799.0,899.0,0.111235,0.085,0.812,0.102,0.031,0.398763,1
6240,"Flaxseed Oil - Omega-3, Omega-6, Omega-9 Veget...",3,50,1835,203,flaxseed oil useful strengthening reduces vata...,235.0,235.0,0.0,0.072,0.335,0.593,0.8979,0.116418,2
16132,Garlic Oil - Vegetarian Capsule 500 mg,3,50,1835,203,product contains garlic oil known help proper ...,220.0,220.0,0.0,0.054,0.614,0.332,0.8271,0.108908,2
18939,Garlic Oil - Vegetarian Capsule 500 mg,3,50,1835,203,product contains garlic oil known help proper ...,220.0,220.0,0.0,0.054,0.614,0.332,0.8271,0.108908,2
20312,Ultralights Highlighting Kit - Coffee Collecti...,3,50,1856,202,ready give hair gorgeous vibrant highlights or...,160.0,160.0,0.0,0.0,0.59,0.41,0.9843,0.078872,2
7970,Ultralights Highlighting Kit - Coffee Collecti...,3,50,1856,202,ready give hair gorgeous vibrant highlights or...,160.0,160.0,0.0,0.0,0.59,0.41,0.9843,0.078872,2


In [169]:
d=pd.DataFrame({'Similar products':similar_products['product'].values})
d.style.set_caption('Similar products for: '+'"'+query['product'].values+'"').set_table_styles([{'selector':'caption',
                                                                                         'props':[('color','black'),
                                                                                                  ('font-weight', 'bold'),
                                                                                                  ('text-align','left'),
                                                                                                  ('background-color','skyblue'),
                                                                                                 ('font-size', '12px')]}])

Unnamed: 0,Similar products
0,"Moroccan Argan Hair Serum - Nourishing, Conditioning"
1,Evening Primrose Oil - Vegetarian Capsule (500 mg)
2,Pro Keratin & Argan Oil Hair Nourishing Smooth Therapy Spray
3,Cold Pressed Rosemary Lavender Healthy Hair Oil For Thick & Strong Hair
4,10-In-1 Bioactive Hair Oil - 10 Pure Oils in 1
5,"Flaxseed Oil - Omega-3, Omega-6, Omega-9 Vegetarian Capsule"
6,Garlic Oil - Vegetarian Capsule 500 mg
7,Garlic Oil - Vegetarian Capsule 500 mg
8,"Ultralights Highlighting Kit - Coffee Collection, Mocha Brown"
9,"Ultralights Highlighting Kit - Coffee Collection, Hazel Brown"


In [13]:
query=test.loc[[5232],:]
query.to_csv('query5232.csv',index=False)

In [170]:
query=test.loc[[5232],:]
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

The searched/Queried product is:
 Sauce - Chilli, Hot

Top 10 Similar products for "[1mSauce - Chilli, Hot[0m" are:

12205 : Sauce - Sweet Thai chilli Sauce
Cosine Similarity with queried product is : 0.999997
Discount %:  0.0
-------------------------------------------------- 

104 : Sauce - Black Bean Sauce
Cosine Similarity with queried product is : 0.999997
Discount %:  0.0
-------------------------------------------------- 

8755 : Sauce - Dark Soy
Cosine Similarity with queried product is : 0.999997
Discount %:  0.0
-------------------------------------------------- 

1340 : Sauce - Black Pepper
Cosine Similarity with queried product is : 0.999996
Discount %:  0.0
-------------------------------------------------- 

5396 : Sauce - Teriyaki
Cosine Similarity with queried product is : 0.999995
Discount %:  0.0
-------------------------------------------------- 

12480 : Ongs Marinade - Honey Soy 255 gm Bottle
Cosine Similarity with queried product is : 0.999995
Discount %:  0.0
-

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
12205,Sauce - Sweet Thai chilli Sauce,9,79,1430,395,add black bean sauce oriental recipes give aut...,240.0,240.0,0.0,0.0,0.816,0.184,0.8591,0.118921,2
104,Sauce - Black Bean Sauce,9,79,1430,395,ongs brand aims bring flavours asia easy use s...,240.0,240.0,0.0,0.0,0.723,0.277,0.9442,0.118921,2
8755,Sauce - Dark Soy,9,79,1430,395,full bodied dark reddish brown sauce rich arom...,240.0,240.0,0.0,0.0,0.749,0.251,0.8271,0.118921,2
1340,Sauce - Black Pepper,9,79,1430,395,mild spice sauce packed right amount black pep...,240.0,240.0,0.0,0.0,0.748,0.252,0.9022,0.118921,2
5396,Sauce - Teriyaki,9,79,1430,395,truly convenient oriental sauce sweet savoury ...,290.0,290.0,0.0,0.0,0.638,0.362,0.9517,0.143951,2
12480,Ongs Marinade - Honey Soy 255 gm Bottle,9,79,1430,395,ongs honey soy marinade directions add tablesp...,210.0,210.0,0.0,0.0,1.0,0.0,0.0,0.103902,1
3603,Sauce - Stir Fry,9,79,1430,395,oriental one sauce perfectly blended authentic...,240.0,240.0,0.0,0.0,0.667,0.333,0.9432,0.118921,2
15822,Sambal - Oelek,9,79,1430,395,sambal hot paste famous south east asia made m...,240.0,240.0,0.0,0.0,1.0,0.0,0.0,0.118921,1
14382,Sauce - Light Soy,9,79,1430,395,boost taste snacks,240.0,240.0,0.0,0.0,0.426,0.574,0.4019,0.118921,3
15101,Schezwan Dip,9,79,1443,395,organic nation chutneys blended right spices i...,89.0,89.0,0.0,0.0,0.64,0.36,0.959,0.043328,2


In [171]:
d=pd.DataFrame({'Similar products':similar_products['product'].values})
d.style.set_caption('Similar products for: '+'"'+query['product'].values+'"').set_table_styles([{'selector':'caption',
                                                                                         'props':[('color','black'),
                                                                                                  ('font-weight', 'bold'),
                                                                                                  ('text-align','left'),
                                                                                                  ('background-color','skyblue'),
                                                                                                 ('font-size', '12px')]}])

Unnamed: 0,Similar products
0,Sauce - Sweet Thai chilli Sauce
1,Sauce - Black Bean Sauce
2,Sauce - Dark Soy
3,Sauce - Black Pepper
4,Sauce - Teriyaki
5,Ongs Marinade - Honey Soy 255 gm Bottle
6,Sauce - Stir Fry
7,Sambal - Oelek
8,Sauce - Light Soy
9,Schezwan Dip


In [14]:
query=test.loc[[2607],:]
query

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price
2607,Instant Pasta - Tomato Salsa,Snacks & Branded Foods,"Noodle, Pasta, Vermicelli",Weikfield,Instant Pasta,Instant Pasta with Sauce Maker,22.0,25.0


In [15]:
query=test.loc[[2607],:]
query.to_csv('query2607.csv')

In [172]:
query=test.loc[[2607],:]
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

The searched/Queried product is:
 Instant Pasta - Tomato Salsa

Top 10 Similar products for "[1mInstant Pasta - Tomato Salsa[0m" are:

6692 : 1-2-3 Noodles - Chicken Flavour
Cosine Similarity with queried product is : 0.999998
Discount %:  0.0
-------------------------------------------------- 

2013 : X-press Instant Noodles - Masala Delight, Super Saver Pack
Cosine Similarity with queried product is : 0.999997
Discount %:  0.0
-------------------------------------------------- 

17915 : 1-2-3 Noodles - Pure Vegetarian
Cosine Similarity with queried product is : 0.999996
Discount %:  0.0
-------------------------------------------------- 

9277 : Organic Black Pepper/Kari Menasu
Cosine Similarity with queried product is : 0.999995
Discount %:  0.0
-------------------------------------------------- 

10133 : Mortadella - Chicken, Olive
Cosine Similarity with queried product is : 0.999994
Discount %:  0.0
-------------------------------------------------- 

5029 : A Flavouring Agent -

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
6692,1-2-3 Noodles - Chicken Flavour,11,65,2086,230,wai wai instant way mouthwatering noodles pref...,12.0,12.0,0.0,0.0,0.741,0.259,0.8442,0.004781,2
2013,"X-press Instant Noodles - Masala Delight, Supe...",11,65,2086,230,wai wai press white noodle available vegetable...,60.0,60.0,0.0,0.0,0.561,0.439,0.9686,0.02881,2
17915,1-2-3 Noodles - Pure Vegetarian,11,65,2086,230,wai wai instant way mouthwatering noodles pref...,28.0,35.0,0.2,0.0,1.0,0.0,0.0,0.012791,1
9277,Organic Black Pepper/Kari Menasu,7,59,1945,213,rich flavour large sized peppercorns stems fac...,175.0,175.0,0.0,0.0,0.734,0.266,0.7845,0.086381,2
10133,"Mortadella - Chicken, Olive",11,46,1588,174,quickee presents deliciously mouth watering ra...,195.0,195.0,0.0,0.036,0.808,0.156,0.6124,0.096393,3
5029,A Flavouring Agent - Grape,7,59,1968,213,avail wide range liquid food colours extensive...,35.0,35.0,0.0,0.0,0.787,0.213,0.7964,0.016295,3
15566,Momos - Vegetable,11,46,1549,176,made premium ingredients preservatives prasuma...,150.0,150.0,0.0,0.0,0.804,0.196,0.743,0.073865,3
17536,Hair Growth Serum With Vitamin C - Oil Free Fo...,3,60,1832,201,vitamin beard hair growth serum men unique oil...,549.0,549.0,0.0,0.011,0.494,0.495,0.9987,0.27361,2
8695,"Smoked Chicken Frankfurter - 7""",11,46,1629,174,republic chicken presents authentic range chic...,540.0,600.0,0.1,0.048,0.481,0.471,0.9485,0.269105,2
20867,Udupi Kashaya Milk Mix,7,59,2011,213,udupi ruchi udupi kashaya traditional sought b...,142.5,150.0,0.05,0.106,0.646,0.247,0.5719,0.070111,3


In [173]:
d=pd.DataFrame({'Similar products':similar_products['product'].values})
d.style.set_caption('Similar products for: '+'"'+query['product'].values+'"').set_table_styles([{'selector':'caption',
                                                                                         'props':[('color','black'),
                                                                                                  ('font-weight', 'bold'),
                                                                                                  ('text-align','left'),
                                                                                                  ('background-color','skyblue'),
                                                                                                 ('font-size', '12px')]}])

Unnamed: 0,Similar products
0,1-2-3 Noodles - Chicken Flavour
1,"X-press Instant Noodles - Masala Delight, Super Saver Pack"
2,1-2-3 Noodles - Pure Vegetarian
3,Organic Black Pepper/Kari Menasu
4,"Mortadella - Chicken, Olive"
5,A Flavouring Agent - Grape
6,Momos - Vegetable
7,Hair Growth Serum With Vitamin C - Oil Free Formula
8,"Smoked Chicken Frankfurter - 7"""
9,Udupi Kashaya Milk Mix


In [16]:
query=test.loc[[1909],:]
query.to_csv('query1909.csv')

In [174]:
query=test.loc[[1909],:]
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

The searched/Queried product is:
 Solid Rim Power Active - Lemon

Top 10 Similar products for "[1mSolid Rim Power Active - Lemon[0m" are:

12357 : Toilet Rimblock Dou Burst - Lemon
Cosine Similarity with queried product is : 0.999982
Discount %:  0.0
-------------------------------------------------- 

18041 : Solid Rim Blue Active - Fresh Flower
Cosine Similarity with queried product is : 0.999982
Discount %:  0.0
-------------------------------------------------- 

16013 : Solid Rim Colour Active Purple Water - Lavender
Cosine Similarity with queried product is : 0.999982
Discount %:  0.0
-------------------------------------------------- 

5973 : Acticlean In Cistern Toilet Block - Purple
Cosine Similarity with queried product is : 0.999982
Discount %:  0.0
-------------------------------------------------- 

18609 : Solid Rim Blue Active - Fresh Flower, Twin Pack
Cosine Similarity with queried product is : 0.999982
Discount %:  0.0
------------------------------------------------

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
12357,Toilet Rimblock Dou Burst - Lemon,5,1,247,222,function formula cleaning foam cleans toilet e...,169.0,169.0,0.0,0.108,0.57,0.322,0.9001,0.083377,2
18041,Solid Rim Blue Active - Fresh Flower,5,1,247,222,function formula cleaning foam cleans toilet e...,169.0,169.0,0.0,0.108,0.57,0.322,0.9001,0.083377,2
16013,Solid Rim Colour Active Purple Water - Lavender,5,1,247,222,function formula cleaning foam cleans toilet e...,169.0,169.0,0.0,0.108,0.57,0.322,0.9001,0.083377,2
5973,Acticlean In Cistern Toilet Block - Purple,5,1,247,222,function formula cleaning foam cleans toilet e...,249.0,249.0,0.0,0.108,0.57,0.322,0.9001,0.123426,2
18609,"Solid Rim Blue Active - Fresh Flower, Twin Pack",5,1,247,222,function formula cleaning foam cleans toilet e...,299.0,299.0,0.0,0.108,0.57,0.322,0.9001,0.148457,2
8381,Solid Rim Power Active Flower,5,1,247,222,function formula cleaning foam cleans toilet e...,399.0,399.0,0.0,0.108,0.57,0.322,0.9001,0.198518,2
1233,Solid Rim Colour Active Blue Water - Fresh Flower,5,1,247,222,function formula cleaning foam cleans toilet e...,399.0,399.0,0.0,0.108,0.57,0.322,0.9001,0.198518,2
10810,Solid Rim Power Active Lemon Trio Pack,5,1,247,222,function formula cleaning foam cleans toilet e...,399.0,399.0,0.0,0.108,0.57,0.322,0.9001,0.198518,2
15000,"Toilet Gel Cleaner - Hygiene & Shine, Flower B...",5,1,247,222,bloo total hygiene shine toilet gel removes li...,259.0,259.0,0.0,0.117,0.572,0.311,0.8316,0.128432,2
3593,Toilet & Bathroom Wipes - Sweet Tulip,5,1,247,222,fragrance lock easy open close pack keeps wipe...,159.0,159.0,0.0,0.066,0.543,0.391,0.9371,0.078371,2


In [175]:
d=pd.DataFrame({'Similar products':similar_products['product'].values})
d.style.set_caption('Similar products for: '+'"'+query['product'].values+'"').set_table_styles([{'selector':'caption',
                                                                                         'props':[('color','black'),
                                                                                                  ('font-weight', 'bold'),
                                                                                                  ('text-align','left'),
                                                                                                  ('background-color','skyblue'),
                                                                                                 ('font-size', '12px')]}])

Unnamed: 0,Similar products
0,Toilet Rimblock Dou Burst - Lemon
1,Solid Rim Blue Active - Fresh Flower
2,Solid Rim Colour Active Purple Water - Lavender
3,Acticlean In Cistern Toilet Block - Purple
4,"Solid Rim Blue Active - Fresh Flower, Twin Pack"
5,Solid Rim Power Active Flower
6,Solid Rim Colour Active Blue Water - Fresh Flower
7,Solid Rim Power Active Lemon Trio Pack
8,"Toilet Gel Cleaner - Hygiene & Shine, Flower Burst"
9,Toilet & Bathroom Wipes - Sweet Tulip


In [176]:
query=test.loc[[3103],:]
similar_products=get_similar_products(query,df,X_train=X_train,num_results=11)
similar_products

The searched/Queried product is:
 Organic - Mustard Oil

Top 10 Similar products for "[1mOrganic - Mustard Oil[0m" are:

19843 : Organic - Urad Chilka
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
-------------------------------------------------- 

4299 : Organic - Masoor Malka
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
-------------------------------------------------- 

17800 : Organic - Almond
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
-------------------------------------------------- 

9628 : Organic - Cashew/Godambi
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
-------------------------------------------------- 

18572 : Organic - Wheat Dalia
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
-------------------------------------------------- 

10222 : Organic - Sesame Oil
Cosine Similarity with queried product is : 0.999999
Discount %:  0.0
----------------

Unnamed: 0,product,category,sub_category,brand,type,description,sale_price,market_price,discount_%,negative,neutral,positive,compound,sale_price_scaled,cluster_label
19843,Organic - Urad Chilka,7,69,2004,298,turn organic brought premium fresh healthy org...,87.75,117.0,0.25,0.0,0.661,0.339,0.9468,0.042702,4
4299,Organic - Masoor Malka,7,69,2004,298,turn organic brought premium fresh healthy org...,67.5,90.0,0.25,0.0,0.745,0.255,0.7351,0.032565,4
17800,Organic - Almond,7,69,2004,299,turn organic brought premium fresh healthy org...,105.0,105.0,0.0,0.0,0.643,0.357,0.6124,0.051338,3
9628,Organic - Cashew/Godambi,7,69,2004,299,turn organic brought premium fresh healthy org...,82.5,110.0,0.25,0.0,0.643,0.357,0.6124,0.040074,3
18572,Organic - Wheat Dalia,7,69,2004,302,turn organic brought premium fresh healthy org...,52.0,52.0,0.0,0.061,0.696,0.243,0.743,0.024805,3
10222,Organic - Sesame Oil,7,69,2004,300,could used cooking place edible oils help redu...,202.5,225.0,0.1,0.133,0.545,0.321,0.7351,0.100148,2
10094,Organic - Urad Dal/Uddina Bele,7,69,2004,298,turn organic brought premium fresh healthy org...,90.0,120.0,0.25,0.0,0.821,0.179,0.6124,0.043829,3
5103,Organic -Suji,7,69,2004,302,organic rava made grinding organic rice coarse...,52.0,52.0,0.0,0.078,0.777,0.145,0.4404,0.024805,3
9891,Organic - Urad Whole,7,69,2004,298,turn organic brought premium fresh healthy org...,115.0,115.0,0.0,0.0,0.821,0.179,0.6124,0.056344,3
3405,Rajma/Capparadavare Jammu,7,69,2004,298,turn organic rajma beans mature dried beans ma...,108.0,120.0,0.1,0.0,0.797,0.203,0.7269,0.05284,3


In [177]:
d=pd.DataFrame({'Similar products':similar_products['product'].values})
d.style.set_caption('Similar products for: '+'"'+query['product'].values+'"').set_table_styles([{'selector':'caption',
                                                                                         'props':[('color','black'),
                                                                                                  ('font-weight', 'bold'),
                                                                                                  ('text-align','left'),
                                                                                                  ('background-color','skyblue'),
                                                                                                 ('font-size', '12px')]}])

Unnamed: 0,Similar products
0,Organic - Urad Chilka
1,Organic - Masoor Malka
2,Organic - Almond
3,Organic - Cashew/Godambi
4,Organic - Wheat Dalia
5,Organic - Sesame Oil
6,Organic - Urad Dal/Uddina Bele
7,Organic -Suji
8,Organic - Urad Whole
9,Rajma/Capparadavare Jammu


###  Model deployed in Streamlit.
- Streamlit app is open to public, anyone can use it

[deployment link](https://maq090-big-basket-recommender-system2-streamlit2-qxcgyh.streamlitapp.com/)

### <u/> References:

- https://www.geeksforgeeks.org/
- https://seaborn.pydata.org/generated/seaborn
- https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html
- https://docs.python.org/3/library/re.html
- https://www.towardsdatascience.com/