In [3]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Importing the data
ABSOLUTE_PATH = "../../datasets/"
df = pd.read_csv(ABSOLUTE_PATH+"flipkart-data.csv")

In [5]:
#Converting the category  tree into a category array
def convert_to_arr(row):
    s = row["product_category_tree"].strip('[]').strip('""')
    s = s.split(">>")
    
    for word in s:
        word = word.strip()
    return s

df["category_array"] = df.apply(convert_to_arr,axis=1)
df.drop(["product_category_tree"] , axis=1 , inplace=True)
#Handling null description , replacing it with empty string
df.loc[df["description"].isnull() , "description"] = " "

In [6]:
#Appending the category array to description
def append_desc(row):
    s = row["category_array"]
    desc = row["description"]
    s =  ",".join(s)
    desc = "".join((s,desc))
    return desc

In [7]:
df["description"] = df.apply(append_desc,axis=1)

In [8]:
#Utitlity functions for removing ASCII characters, converting lower case, removing stop words, html and punctuation from description

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

df['cleaned'] = df['description'].apply(_removeNonAscii)

df['cleaned'] = df.cleaned.apply(func = make_lower_case)
df['cleaned'] = df.cleaned.apply(func = remove_stop_words)
df['cleaned'] = df.cleaned.apply(func=remove_punctuation)
df['cleaned'] = df.cleaned.apply(func=remove_html)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
uniq_id                    20000 non-null object
crawl_timestamp            20000 non-null object
product_url                20000 non-null object
product_name               20000 non-null object
pid                        20000 non-null object
retail_price               19922 non-null float64
discounted_price           19922 non-null float64
image                      19997 non-null object
is_FK_Advantage_product    20000 non-null bool
description                20000 non-null object
product_rating             20000 non-null object
overall_rating             20000 non-null object
brand                      14136 non-null object
product_specifications     19986 non-null object
category_array             20000 non-null object
cleaned                    20000 non-null object
dtypes: bool(1), float64(2), object(13)
memory usage: 2.3+ MB


In [10]:
#splitting the description into words

corpus = []
for words in df['cleaned']:
    corpus.append(words.split())

In [11]:
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Training our corpus with Google Pretrained Model

google_model = Word2Vec(size = 300, window=5, min_count = 2, workers = -1)
google_model.build_vocab(corpus)

#model.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)

google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)

(0, 0)

In [12]:
# Generate the average word2vec for the each book description

def vectors(x):
    
    # Creating a list for storing the vectors (description into vectors)
    global word_embeddings
    word_embeddings = []

    # Reading the each book description 
    for line in df['cleaned']:
        avgword2vec = None
        count = 0
        for word in line.split():
            if word in google_model.wv.vocab:
                count += 1
                if avgword2vec is None:
                    avgword2vec = google_model[word]
                else:
                    avgword2vec = avgword2vec + google_model[word]
                
        if avgword2vec is not None:
            avgword2vec = avgword2vec / count
        
            word_embeddings.append(avgword2vec)

In [15]:
# Recommending the Top 5 similar books

def recommendations(title):
    
    # Calling the function vectors

     #vectors(df)
    
    # finding cosine similarity for the vectors

    cosine_similarities = cosine_similarity(word_embeddings, word_embeddings)

    # taking the title and book image link and store in new data frame called books
    books = df[['product_name', 'image']]
    #Reverse mapping of the index
    indices = pd.Series(df.index, index = df['product_name']).drop_duplicates()
    print(indices)     
    idx = indices['product_name]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    book_indices = [i[0] for i in sim_scores]
    recommend = books.iloc[book_indices]
    for index, row in recommend.iterrows():

        response = requests.get(row['image'])
        img = Image.open(BytesIO(response.content))
        plt.figure()
        plt.imshow(img)
        plt.title(row['product_name'])

In [16]:
recommendations("Formal Men")

product_name
Alisha Solid Women's Cycling Shorts                                          0
FabHomeDecor Fabric Double Sofa Bed                                          1
AW Bellies                                                                   2
Alisha Solid Women's Cycling Shorts                                          3
Sicons All Purpose Arnica Dog Shampoo                                        4
Eternal Gandhi Super Series Crystal Paper Weights  with Silver Finish        5
Alisha Solid Women's Cycling Shorts                                          6
FabHomeDecor Fabric Double Sofa Bed                                          7
dilli bazaaar Bellies, Corporate Casuals, Casuals                            8
Alisha Solid Women's Cycling Shorts                                          9
Ladela Bellies                                                              10
Carrel Printed Women's                                                      11
Sicons All Purpose Tea Tree Dog Shampoo

KeyError: 'product_name'