# **Content-Based Recommender Systems**

##### Calculating Cosine Similarity in Python

In [None]:
from math import*
 
def square_rooted(x):
    """Return the Euclidean (L2) norm of ``x``, rounded to 3 decimal places.

    Parameters
    ----------
    x : iterable of numbers
        The vector components.

    Returns
    -------
    float
        Square root of the sum of squared components, rounded to three decimals.
    """
    sum_of_squares = 0
    for component in x:
        sum_of_squares += component * component
    return round(sqrt(sum_of_squares), 3)
 
def cosine_similarity(x, y):
    """Return the cosine similarity of two numeric vectors, rounded to 3 decimals.

    Parameters
    ----------
    x, y : sequence of numbers
        The two vectors. They are paired element-wise with ``zip``, so any
        extra trailing components of the longer one do not contribute to the
        dot product.

    Returns
    -------
    float
        ``dot(x, y) / (|x| * |y|)`` rounded to three decimals, or ``0.0`` when
        either vector has zero norm (the similarity is undefined there).

    Notes
    -----
    A later cell imports scikit-learn's ``cosine_similarity`` under the same
    name, which replaces this function from that point on.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    # Use the exact norms. Dividing by norms that were already rounded to three
    # decimals distorts the ratio for small-magnitude vectors (results above 1)
    # and can turn a non-zero norm into 0, causing a spurious ZeroDivisionError.
    norm_x = sqrt(sum(a * a for a in x))
    norm_y = sqrt(sum(b * b for b in y))
    denominator = norm_x * norm_y
    if denominator == 0:
        # At least one vector is all zeros (or empty): no direction to compare.
        return 0.0
    return round(numerator / float(denominator), 3)
 


In [None]:
# Example: cosine similarity of two 4-component vectors.
print(cosine_similarity([3, 45, 7, 2], [2, 54, 13, 15]))

We want to recommend similar products based on previous records. For example, if we search for casual shoes, similar shoes should be recommended to us; this can be done based on hashtags, brand, pricing, etc.

In [None]:

import pandas as pd
import numpy as np
# NOTE: this import rebinds the name `cosine_similarity`, replacing the
# hand-written function defined earlier in the notebook.
from sklearn.metrics.pairwise import cosine_similarity # performs same work as the cosine similarity we created above
from sklearn.feature_extraction.text import CountVectorizer # performs same work as the Document Term Frequency we did earlier

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Load the product data; the path is relative to the current working directory.
df = pd.read_csv('products_data.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Keep only the title and the four descriptive columns used below.
# `.copy()` makes the result an independent frame, so the column assignments in
# the following cells modify it directly instead of a slice of the loaded data
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[['title','brand.name','category','highlights','subcategory']].copy()
df.head()

In [None]:
def _first_three_brands(raw_brands):
    """Split a comma-separated string and keep at most its first three parts."""
    return raw_brands.split(',')[:3]

# Turn each brand string into a list of (up to three) brand tokens.
df['brand.name'] = df['brand.name'].map(_first_three_brands)

In [None]:
# Inspect the brand column after splitting.
df['brand.name']

In [None]:
def _split_category(raw_category):
    """Lower-case a comma-separated category string and split it into a list."""
    return raw_category.lower().split(',')

# Turn each category string into a list of lower-case entries.
df['category'] = df['category'].map(_split_category)

In [None]:
# Inspect the category column after lower-casing and splitting.
df['category']

In [None]:
def _split_subcategory(raw_subcategory):
    """Split a subcategory string on single spaces into a list of words."""
    return raw_subcategory.split(' ')

# Turn each subcategory string into a list of its space-separated words.
df['subcategory'] = df['subcategory'].map(_split_subcategory)

In [None]:
# Inspect the subcategory column after splitting into words.
df['subcategory']

In [None]:
# Normalise the tokens of both columns.
# The values are assigned to the columns directly: writing to the `row` object
# yielded by DataFrame.iterrows() is not guaranteed to write back to the frame
# (the row may be a copy), in which case the cleanup would silently be lost.

# category: lower-case every entry and remove the spaces inside it.
df['category'] = df['category'].map(
    lambda entries: [entry.lower().replace(' ', '') for entry in entries]
)
# subcategory: glue the word list into one lower-case token.
df['subcategory'] = df['subcategory'].map(lambda words: ''.join(words).lower())

In [None]:
# Inspect the category column after normalisation.
df['category']

In [None]:
# Inspect the subcategory column after normalisation.
df['subcategory']

In [None]:
pip install rake_nltk

In [None]:
import rake_nltk
from rake_nltk import Rake #for extracting significant keywords in the entire text available.
import nltk
# Download the NLTK 'stopwords' and 'punkt' resources.
# NOTE(review): presumably these are what Rake uses for stop-word removal and
# tokenisation — confirm against the installed rake_nltk version.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Build the Key_words column: the words RAKE extracts from each product's text.
#
# The text is read from 'highlights', the free-text column kept by the earlier
# column selection. The frame has no 'hashtags' column at this point, so
# reading row['hashtags'] raised KeyError.
text_column = 'highlights'

# One extractor is reused for all rows instead of re-creating it per row;
# each extract_keywords_from_text() call recomputes the word degrees for the
# text it is given.
r = Rake()

key_words = []
for plot in df[text_column]:
    r.extract_keywords_from_text(plot)

    # word -> degree mapping for the text just processed; the name stays at
    # notebook level because a later cell displays it (value of the last row)
    key_words_dict_scores = r.get_word_degrees()

    key_words.append(list(key_words_dict_scores.keys()))

# Assign the whole column at once: writing to the `row` yielded by
# DataFrame.iterrows() is not guaranteed to update the frame.
df['Key_words'] = key_words

# dropping the raw text column now that its keywords are extracted
df = df.drop(columns = [text_column])

In [None]:
# Word-degree mapping left over from the last row processed in the previous cell.
key_words_dict_scores

In [None]:
# Inspect the frame after keyword extraction.
df

In [None]:
# Use the product title as the row label.
df = df.set_index('title')
df.head()

In [None]:
# Merge every feature column into one whitespace-separated string per product.

def _as_text(value):
    """Return a cell as text: strings unchanged, token lists joined by spaces."""
    return value if isinstance(value, str) else ' '.join(value)

# Only the real feature columns are merged; the target column is excluded so a
# re-run does not fold an existing 'bag_of_words' back into itself.
feature_columns = [col for col in df.columns if col != 'bag_of_words']

# Build the column in one assignment: writing to the `row` yielded by
# DataFrame.iterrows() is not guaranteed to update the frame.
df['bag_of_words'] = [
    ' '.join(_as_text(value) for value in values)
    for values in df[feature_columns].itertuples(index = False, name = None)
]

# keep only the merged text; the title index is preserved
df = df.drop(columns = feature_columns)

In [None]:
# Preview the first rows of the frame, now reduced to the bag_of_words column.
df.head()

In [None]:
# instantiating and generating the count matrix
# One row per product (in df's row order) and one column per vocabulary term
# learned from the bag_of_words strings.
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

In [None]:
# Summary of the count matrix returned by fit_transform.
count_matrix 

In [None]:
# Dense version of the count matrix, for inspection only.
# NOTE(review): this materialises every cell, zeros included — confirm the
# data is small enough before running on a large catalogue.
c=count_matrix.todense()

In [None]:
# Inspect the dense count matrix.
c

In [None]:
# Term counts stored for the first product (row 0 of the count matrix).
print(count_matrix[0,:])

In [None]:
# generating the cosine similarity matrix
# Entry [i, j] is the similarity between the count vectors of rows i and j.
# `cosine_similarity` here is the scikit-learn function imported earlier, which
# replaced the hand-written one of the same name.
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

In [None]:
# creating a Series of the product titles (the frame's index) so each title is
# associated with its integer row position; used later to match a title to its
# row in the similarity matrix
indices = pd.Series(df.index)
indices[:20]

In [None]:
# function that takes in a product title as input and returns the top 10 recommended products
def recommendations(title, cosine_sim = cosine_sim):
    """Return the titles of the (up to) 10 products most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the reference product; must be present in ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches ``indices``.
        Defaults to the matrix computed when this function was defined.

    Returns
    -------
    list
        Titles ordered from most to least similar. The reference product
        itself is never included.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``indices``.
    """
    # getting the row position of the product that matches the title
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError("title %r not found in indices" % (title,))
    idx = matches[0]

    # Similarity of every product to the reference one, in descending order.
    # The reference row is dropped explicitly rather than assumed to sort
    # first: with ties (duplicate products) or an all-zero row it need not be
    # at position 0. A stable sort keeps the order of ties deterministic.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending = False, kind = 'mergesort')
    )

    # getting the row positions of the 10 most similar products
    top_10_indexes = list(score_series.iloc[:10].index)

    # translating the row positions back to titles (index looked up once)
    titles = df.index
    recommended_movies = [titles[i] for i in top_10_indexes]

    return recommended_movies