In [9]:
import heapq
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [10]:
# Read the semicolon-separated table of categories and their associated places.
CATEGORIES_CSV = '../data/all_categories_with_places.csv'
df = pd.read_csv(CATEGORIES_CSV, sep=';')

In [11]:
# Zones and categories (zac): the category id together with its three
# place columns, selected from df.
zac_columns = ['category_id', 'place_1', 'place_2', 'place_3']
zac = df[zac_columns]

In [12]:
# Products and categories (pac): read the semicolon-separated table that
# associates products with categories.
PRODUCTS_CSV = '../data/products_and_categories.csv'
pac = pd.read_csv(PRODUCTS_CSV, sep=';')

In [13]:
# Every place label recognised by the one-hot encoding below.
# NOTE(review): 'Dinning Room' looks like a misspelling of 'Dining Room'; it is
# kept unchanged because it presumably has to match the values stored in the
# CSV -- confirm against the data before renaming.
places = ['Office', 'Living Room', 'Dinning Room','Kitchen', 'Bedroom', 'Bathroom',
              'Yard', 'Street','Beach','Camping', 'Playroom']

# The three columns that hold a place label for each category.
place_columns = ['place_1', 'place_2', 'place_3']

# Products and zones (paz): join products to their categories' places and keep
# only the product id plus the place columns.
paz = pac.merge(zac, how='inner', on='category_id')
paz = paz[['product_id'] + place_columns]

# Categoricals with a fixed category list make get_dummies emit one column per
# (place column, place) pair, even for places that never occur in that column.
for column in place_columns:
    paz[column] = pd.Categorical(paz[column], categories=places)

# One-hot encode, then collapse the three per-column indicators of each place
# into a single indicator named after the place.
paz = pd.get_dummies(paz)
for site in places:
    dummy_columns = ['{}_{}'.format(column, site) for column in place_columns]
    paz[site] = paz[dummy_columns].max(axis=1)

# One row per product: a place is set if any of the product's rows had it.
paz = paz.groupby('product_id')[places].max().reset_index()
matrix = paz[places]

In [14]:
# Pairwise cosine similarity between the rows of the product/place matrix.
# The one-hot matrix is converted to CSR form before being handed to sklearn;
# called with a single argument, cosine_similarity compares the rows with
# each other.
A = matrix.values
A_sparse = sparse.csr_matrix(A)
similarities_matrix = cosine_similarity(X=A_sparse)


In [15]:
# Two lookup tables between paz's index labels and product ids:
#   index_to_product: paz index label -> product_id
#   product_to_index: product_id -> paz index label
index_to_product = dict(zip(paz.index, paz['product_id'].values))
product_to_index = {product: position for position, product in index_to_product.items()}

In [21]:
# Similarity of the query product to every product, as (row index, score) pairs.
query_id = 23148
query_row = similarities_matrix[product_to_index[query_id]]
product_scores = [(position, score) for position, score in enumerate(query_row)]

In [23]:
# Pick the n highest-scoring products, shuffle them, and print their ids.
n = 20
print("Getting {} most similar products for {}".format(n, query_id))

# product_scores still contains the query product's own entry. Left in, it
# competes for (and can take) one of the n slots, so the query would be
# recommended as "similar" to itself. Drop it before selecting.
query_index = product_to_index[query_id]
candidates = [(index, score) for index, score in product_scores if index != query_index]

# heapq.nlargest returns the (index, score) pairs with the n highest scores
# without sorting the whole list.
top_n_index = heapq.nlargest(n, candidates, key=lambda x: x[1])

# random.sample over the full length returns a shuffled copy, so the printed
# order does not reflect the ranking.
top_n_index = random.sample(top_n_index, k=len(top_n_index))

# Translate matrix row indices back to product ids.
top = [index_to_product[index] for index, _score in top_n_index]
print(top)
print('..done!')

Getting 20 most similar products for 23148
[6810, 5077, 3931, 5081, 7077, 7078, 3969, 3989, 7100, 7079, 7076, 3968, 3967, 3970, 5082, 7074, 3930, 4145, 5083, 7075]
..done!
