# Libraries Import and data files

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import pickle
import os
from similarity.cosine import Cosine


In [2]:
data_sample = 'sampledata.xlsx'
data_train = 'Train-Data.xlsx'

Preparing and Processing the data


In [3]:
df_sample = pd.read_excel(data_sample)
df_sample.head()

Unnamed: 0,Item Code,Description,Brand Name,Grammage,Sample MRP,discount,Selling Price,Barcode
0,2240,ADIDAS DYNAMIC PULSE SH GEL 250ML,Adidas,250 ml,87,0.1,78.3,480822663035
1,4890,Adidas Dynamic Pulse Shower Gel 250 Ml,Adidas,250 ml,30,0.1,27.0,553885072637
2,80330,ADIDAS EXTREME POWER AFTER SHAVE LOTION 100 ML,Adidas,100 ml,50,0.2,40.0,756588357262
3,3527,Adidas Extreme Power Deodorant 150 Ml,Adidas,150 ml,35,0.1,31.5,920738510151
4,90775,Jovees Gold 24 Carat Massage Gel 100 Gm,Jovees,100 gm,245,0.3,171.5,191041283701


In [4]:
df_train = pd.read_excel(data_train)
print(len(df_train))
df_train.head()

8000


Unnamed: 0,Item Code,Product Description,Brand Name,Grammage,MRP,Discount,Selling Price,Special Offer,Final Price,Category
0,100197,KKRWG 200 GM PLPCH SUPER DETERGENT POWDER POWE...,KKRWG,200 GM,439,0.48,228,0.47,121,D
1,100420,EEOTR 98 GM PLTUB OPTI BLACK DAZZLING SHINE,EEOTR,98 GM,58,0.06,55,0.02,53,A
2,100462,XTHVF 200 GM PLPCH WITH OPTICAL BRIGHTNER,XTHVF,200 GM,387,0.5,194,0.5,97,D
3,100473,ATYIX Sandalum Agarbatti,ATYIX,50 gm\n,889,0.31,613,0.08,564,G
4,100527,FBIMG 500M PBT COCOABUTTER LOTN COCOABUTTER & ...,FBIMG,,749,0.01,742,0.04,712,B


## Data cleaning

In [5]:
df = pd.DataFrame(df_train, columns=['Product Description', 'Brand Name', 'Grammage', 'Discount', 'Special Offer', 'Final Price'])
df.head()

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
0,KKRWG 200 GM PLPCH SUPER DETERGENT POWDER POWE...,KKRWG,200 GM,0.48,0.47,121
1,EEOTR 98 GM PLTUB OPTI BLACK DAZZLING SHINE,EEOTR,98 GM,0.06,0.02,53
2,XTHVF 200 GM PLPCH WITH OPTICAL BRIGHTNER,XTHVF,200 GM,0.5,0.5,97
3,ATYIX Sandalum Agarbatti,ATYIX,50 gm\n,0.31,0.08,564
4,FBIMG 500M PBT COCOABUTTER LOTN COCOABUTTER & ...,FBIMG,,0.01,0.04,712


Removing stopwords & stemming

In [6]:
def description_to_words(descr):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    
    text = descr
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    
    return ' '.join(words)

In [7]:
print(description_to_words(df['Product Description'][0]))

kkrwg 200 gm plpch super deterg powder power white


Create cache after processing training data

In [8]:
def preprocess_data(cache_dir=".", cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each description
        words_train = df['Product Description'].apply(description_to_words)
#         words_train = [review_to_words(review) for review in data_train]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = words_train
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train = cache_data
    
    return words_train

In [9]:
# Preprocess data
words_train = preprocess_data()

Read preprocessed data from cache file: preprocessed_data.pkl


In [10]:
len(words_train)

8000

In [11]:
words_train

0       kkrwg 200 gm plpch super deterg powder power w...
1                eeotr 98 gm pltub opti black dazzl shine
2                      xthvf 200 gm plpch optic brightner
3                                atyix sandalum agarbatti
4        fbimg 500m pbt cocoabutt lotn cocoabutt aloevera
                              ...                        
7995        keedw 30 ml cdbox anti age serum advanc vit c
7996                  kwhra 9 gm plpch tomato ketchup b w
7997                         feyay 200 gm plbot soya sauc
7998            xgqac 15 gm cdbox papaya massag cream b w
7999           qaegd 200 gm plpch deterg powder clr guard
Name: Product Description, Length: 8000, dtype: object

# Cosine Distance

In [12]:
cosine = Cosine(5)

In [13]:
words_cosine = words_train.apply(lambda s: cosine.get_profile(s)) 

In [14]:
words_cosine

0       {'kkrwg': 1, 'krwg2': 1, 'rwg20': 1, 'wg200': ...
1       {'eeotr': 1, 'eotr9': 1, 'otr98': 1, 'tr98g': ...
2       {'xthvf': 1, 'thvf2': 1, 'hvf20': 1, 'vf200': ...
3       {'atyix': 1, 'tyixs': 1, 'yixsa': 1, 'ixsan': ...
4       {'fbimg': 1, 'bimg5': 1, 'img50': 1, 'mg500': ...
                              ...                        
7995    {'keedw': 1, 'eedw3': 1, 'edw30': 1, 'dw30m': ...
7996    {'kwhra': 1, 'whra9': 1, 'hra9g': 1, 'ra9gm': ...
7997    {'feyay': 1, 'eyay2': 1, 'yay20': 1, 'ay200': ...
7998    {'xgqac': 1, 'gqac1': 1, 'qac15': 1, 'ac15g': ...
7999    {'qaegd': 1, 'aegd2': 1, 'egd20': 1, 'gd200': ...
Name: Product Description, Length: 8000, dtype: object

# Contextual Search

In [15]:
def user_input(text, brand):
    brand_boost = None
    priceThreshold = None
    words = description_to_words(text)
    if(' less ' in words):
        i = words.find(' less')
        j = words.find('rs')
        prices = re.findall("[0-9]+", words[i:j])
        words = words[:i]+words[j+2:]
        if(prices): priceThreshold = prices[0]
    if(brand):
        brand_boost = brand
    return words, brand_boost, priceThreshold

In [16]:
def get_matched_indexes(text, brand):
    y = user_input(text, brand)
    cosine_y = cosine.get_profile(y[0])
    cosine_distance = words_cosine.apply(lambda s: cosine.similarity_profiles(s, cosine_y))
    indexes = cosine_distance.nlargest(5).index
    temp_df = df.iloc[indexes, :]
    if(y[1]):
        temp_df = temp_df[temp_df['Brand Name']==y[1]]
    if(y[2]):
        temp_df = temp_df[temp_df['Final Price']<=int(y[2])]
    return temp_df

---
# Results

## 1. Search Term: Shampoos less than 300 rs

In [17]:
get_matched_indexes('Shampoos less than 300 rs', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
0,KKRWG 200 GM PLPCH SUPER DETERGENT POWDER POWE...,KKRWG,200 GM,0.48,0.47,121
1,EEOTR 98 GM PLTUB OPTI BLACK DAZZLING SHINE,EEOTR,98 GM,0.06,0.02,53
2,XTHVF 200 GM PLPCH WITH OPTICAL BRIGHTNER,XTHVF,200 GM,0.5,0.5,97


## 2. Shampoos less than 300 rs; Brand Boost: Dove

In [18]:
get_matched_indexes('Shampoos less than 300 rs', 'Dove')

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price


## 3. Adidas 

In [19]:
get_matched_indexes('Adidas', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
0,KKRWG 200 GM PLPCH SUPER DETERGENT POWDER POWE...,KKRWG,200 GM,0.48,0.47,121
1,EEOTR 98 GM PLTUB OPTI BLACK DAZZLING SHINE,EEOTR,98 GM,0.06,0.02,53
2,XTHVF 200 GM PLPCH WITH OPTICAL BRIGHTNER,XTHVF,200 GM,0.5,0.5,97
3,ATYIX Sandalum Agarbatti,ATYIX,50 gm\n,0.31,0.08,564
4,FBIMG 500M PBT COCOABUTTER LOTN COCOABUTTER & ...,FBIMG,,0.01,0.04,712


## 4. Biscuits

In [20]:
get_matched_indexes('Biscuits', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
0,KKRWG 200 GM PLPCH SUPER DETERGENT POWDER POWE...,KKRWG,200 GM,0.48,0.47,121
1,EEOTR 98 GM PLTUB OPTI BLACK DAZZLING SHINE,EEOTR,98 GM,0.06,0.02,53
2,XTHVF 200 GM PLPCH WITH OPTICAL BRIGHTNER,XTHVF,200 GM,0.5,0.5,97
3,ATYIX Sandalum Agarbatti,ATYIX,50 gm\n,0.31,0.08,564
4,FBIMG 500M PBT COCOABUTTER LOTN COCOABUTTER & ...,FBIMG,,0.01,0.04,712


## 4. Chocolate Biscuits

In [21]:
get_matched_indexes('Chocolate Biscuits', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
5916,BRNWY Chocolate Veg Cakes,BRNWY,100 gm,0.16,0.17,63
3458,ERAXX Sliced Chocolate Cake,ERAXX,150 gm,0.46,0.06,37
5777,LXGIK Chocolate Chip Cookies,LXGIK,150 gm,0.02,0.25,679
4591,DXWWL Chocolate Almond Dates,DXWWL,100 gm,0.35,0.18,322
4940,LQRFB Hide & Seek Chocolate Cr,LQRFB,100 gm,0.4,0.15,70


6. Facial Kit

In [22]:
get_matched_indexes('Facial Kit', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
2078,FEBXH 5 GM CDBOX FACIAL PACK (B/W-,FEBXH,5 GM,0.35,0.4,127
3986,JDLKJ 40 GM PLCNT FACIAL CREAM,JDLKJ,40 GM,0.04,0.18,71
6262,UFRLP 500 GM PLCAN FACIAL CREAM,UFRLP,500 GM,0.2,0.49,386
6906,FEBXH 5 GM CDBOX FACIAL CREAM (B/W-,FEBXH,5 GM,0.34,0.18,84
976,HRUYT 100GM PLJAR GOLD FACIAL GEL,HRUYT,100GM,0.36,0.38,46


## 7. Gold Massage less than 150 rs

In [23]:
get_matched_indexes('Gold Massage less than 150 rs', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
727,FEBXH 10 GM CDBOX GOLD MASSAGE GEL (B/W-,FEBXH,10 GM,0.42,0.09,8


## 8. Gold Massage with more than 25% discount

In [24]:
get_matched_indexes('Gold Massage with more than 25% discount', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
22,WCDMB 15 GM CDBOX GOLD MASSAGE GEL (B/W-,WCDMB,15 GM,0.21,0.5,228
318,DDWAI 60 GM CDBOX GOLD MASSAGE GEL (B/W-,DDWAI,60 GM,0.04,0.0,506
727,FEBXH 10 GM CDBOX GOLD MASSAGE GEL (B/W-,FEBXH,10 GM,0.42,0.09,8
859,PJRMF 25 GM CDBOX GOLD MASSAGE GEL (B/W-,PJRMF,25 GM,0.5,0.42,223
1048,IVOYT 12 GM CDBOX GOLD MASSAGE GEL (B/W-,IVOYT,12 GM,0.24,0.05,521


## 9. Gold Massage with more than 25% discount and less than 150 rs

In [25]:
get_matched_indexes('Gold Massage with more than 25% discount and less than 150 rs', None)

Unnamed: 0,Product Description,Brand Name,Grammage,Discount,Special Offer,Final Price
727,FEBXH 10 GM CDBOX GOLD MASSAGE GEL (B/W-,FEBXH,10 GM,0.42,0.09,8


---