In [1]:
import numpy as np 
import pandas as pd 
import scipy 
import umap
import collections
import functools
from datetime import datetime

from recommenders.datasets import movielens
from sklearn.decomposition import NMF
from scipy import spatial

import matplotlib.pyplot as plt 
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
data = movielens.load_pandas_data(
    size='100k',
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='Title'
)

data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)
data['datetime'] = data['Timestamp'].apply(lambda x: datetime.fromtimestamp(x))
data.head()

100%|██████████| 4.81k/4.81k [00:14<00:00, 322KB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title,datetime
0,196,242,3.0,881250949,Kolya (1996),1997-12-04 21:55:49
1,63,242,3.0,875747190,Kolya (1996),1997-10-02 05:06:30
2,226,242,5.0,883888671,Kolya (1996),1998-01-04 10:37:51
3,154,242,3.0,879138235,Kolya (1996),1997-11-10 11:03:55
4,306,242,5.0,876503793,Kolya (1996),1997-10-10 23:16:33


In [3]:
print(f"# users : {data['UserId'].nunique()} | # items : {data['MovieId'].nunique()}")

# users : 943 | # items : 1682


In [4]:
item_data = data.loc[data['MovieId']==102, :]
item_data.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title,datetime
81130,5,102,3.0,875721196,"Aristocats, The (1970)",1997-10-01 21:53:16
81131,49,102,2.0,888067164,"Aristocats, The (1970)",1998-02-21 19:19:24
81132,42,102,5.0,881108873,"Aristocats, The (1970)",1997-12-03 06:27:53
81133,295,102,4.0,879518339,"Aristocats, The (1970)",1997-11-14 20:38:59
81134,363,102,4.0,891498681,"Aristocats, The (1970)",1998-04-02 12:31:21


In [5]:
itemMap = data['Title'].unique()
userMap = data['UserId'].unique()

In [37]:
unique_items = data['Title'].unique()
itemMap = pd.DataFrame({'id': np.arange(len(unique_items), dtype=np.int32), 'item': unique_items})
unique_users = data['UserId'].unique()
userMap = pd.DataFrame({'id': np.arange(len(unique_users), dtype=np.int32), 'user': unique_users})
row_ind = data['UserId'].map(dict(zip(userMap['user'],userMap['id']))).to_numpy().astype(np.int32)
col_ind = data['Title'].map(dict(zip(itemMap['item'],itemMap['id']))).to_numpy().astype(np.int32)

In [64]:
k = itemMap.loc[itemMap['item']=='Jungle Book, The (1994)', 'id'].values[0]

In [66]:
k

7

In [38]:
# movies = pd.Series(data['Title'].values, index=data['MovieId'].values).drop_duplicates()
# movies.sort_index(inplace=True)

sparse_df = data.loc[:, ['UserId', 'MovieId', 'Rating']].sort_values(['UserId', 'MovieId'])
# row_ind = sparse_df['UserId'].to_numpy()
# col_ind = sparse_df['MovieId'].to_numpy()
data_ = sparse_df['Rating'].to_numpy()
# inds = list(zip(row_ind, col_ind))
# data_arr = list(zip(data_, inds))

sparse_mat = scipy.sparse.csr_matrix((data_, (row_ind, col_ind)), dtype=np.int32)

In [39]:
model = NMF(n_components=10, init='random', random_state=123)
W = model.fit_transform(sparse_mat)
H = model.components_.T


Maximum number of iterations 200 reached. Increase it to improve convergence.



In [40]:
print(f"W shape : {W.shape} | H shape : {H.shape}")

W shape : (943, 10) | H shape : (1664, 10)


In [41]:
from sklearn import manifold, datasets, decomposition

def get_2d(arr, method='umap'):
    if method == 'umap':
        reducer = umap.UMAP()
        res = reducer.fit_transform(arr)
    elif method == 'pca':
        res = decomposition.TruncatedSVD(n_components=2).fit_transform(arr)
    elif method == 'tsne':
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        res = tsne.fit_transform(arr)
    return res

In [42]:
h_reduced = get_2d(H, 'pca')

In [43]:
plt.figure(figsize=(50, 50))
fig = go.Figure(data=[go.Scatter(x=h_reduced[:, 0], y=h_reduced[:, 1], mode='markers', text=itemMap)])
fig.show()

<Figure size 3600x3600 with 0 Axes>

In [15]:
unique_items = data['Title'].unique()
item_df = pd.DataFrame({'id': range(len(unique_items)), 'item': unique_items})
item_df

Unnamed: 0,id,item
0,0,Kolya (1996)
1,1,L.A. Confidential (1997)
2,2,Heavyweights (1994)
3,3,Legends of the Fall (1994)
4,4,Jackie Brown (1997)
...,...,...
1659,1659,Mamma Roma (1962)
1660,1660,"Eighth Day, The (1996)"
1661,1661,Girls Town (1996)
1662,1662,"Silence of the Palace, The (Saimt el Qusur) (1..."


In [16]:
row_ind = data['Title'].map(pd.Series(data=item_df['item'], index=item_df['id']))


In [15]:
def cosine_similarity(y,x):
    x = W[x, :]
    # print(x,y)
    result = 1- spatial.distance.cosine(x, y)
    # print(result)
    return float(result)

In [16]:
ref_point = 120
sim_arr = np.apply_along_axis(cosine_similarity, 1, H, x=ref_point) #.astype(np.float32)
sim_arr[ref_point] = 0


invalid value encountered in double_scalars



In [17]:
k = 10 
inds = np.argpartition(sim_arr, -k)[-k:]
inds

array([ 864, 1362, 1357, 1259, 1365,  471,  595,  696, 1374,    0])

In [18]:
topK = sim_arr[inds]
topK

array([0.95624274, 0.98084814, 0.97030963, 0.98212082, 0.98084814,
       0.95819977, 0.95707013, 0.97512666, 0.96535484, 1.        ])

In [19]:
trace1 = go.Scatter(x=h_reduced[inds, 0], y=h_reduced[inds, 1], mode='markers')
trace2 = go.Scatter(x=[h_reduced[ref_point, 0]], y=[h_reduced[ref_point, 1]], mode='markers')

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(trace1)
fig.add_trace(trace2,secondary_y=True)
fig['layout'].update(height = 600, width = 800, title = "Customer Similarity",xaxis=dict(
      tickangle=-90
    ))
fig.show()

In [26]:
arr_120 = np.array([i[0] for i in sparse_mat.getcol(12).toarray()])
user_120 = np.where(arr_120 != 0)[0]
user_120

array([  1,   6,   7,  10,  11,  13,  14,  16,  18,  24,  28,  29,  42,
        43,  49,  58,  59,  60,  62,  64,  69,  72,  73,  76,  84,  90,
        92,  94,  99, 106, 109, 110, 115, 117, 119, 121, 130, 132, 135,
       138, 144, 145, 151, 156, 174, 175, 177, 178, 180, 186, 194, 201,
       204, 207, 213, 214, 216, 218, 221, 222, 226, 233, 234, 239, 246,
       249, 250, 251, 253, 256, 259, 264, 267, 268, 271, 272, 276, 279,
       280, 288, 291, 293, 297, 299, 301, 303, 305, 308, 311, 314, 315,
       318, 322, 327, 328, 329, 332, 334, 339, 342, 343, 344, 345, 346,
       347, 352, 361, 363, 370, 372, 373, 374, 378, 379, 380, 385, 387,
       391, 393, 394, 397, 398, 399, 402, 405, 406, 409, 416, 417, 421,
       425, 429, 430, 433, 435, 437, 442, 443, 445, 447, 450, 453, 454,
       455, 456, 457, 458, 464, 465, 468, 472, 474, 478, 480, 483, 487,
       491, 493, 497, 498, 499, 503, 506, 514, 521, 522, 524, 527, 532,
       533, 537, 538, 542, 543, 548, 551, 556, 557, 559, 560, 56

In [39]:
def get_users(item_id):
    arr_i = np.array([i[0] for i in sparse_mat.getcol(item_id).toarray()])
    user_i = np.where(arr_i != 0)[0].tolist()
    return user_i

niegh_user_arr = {i: get_users(i) for i in inds}

In [40]:
def dict_update(d1, d2):
    for k,v in d2.items():
        try:
            d1[k].append(v)
        except:
            d1[k] = [v]
    return d1

def get_rating(user_dict):
    rating_dict = {}
    rating_sum_dict = {}
    for u, arr in user_dict.items():
        rating_dict[u] = np.array([i[0] for i in sparse_mat.getcol(u).toarray()])[arr]
        rating_zip_dict = dict(zip(arr, rating_dict[u]))
        rating_sum_dict = dict_update(rating_sum_dict, rating_zip_dict)
    rating_mean_dict = {i: sum(arr)/len(arr) for i,arr in rating_sum_dict.items()}
    rating_max_dict = {i: max(arr) for i, arr in rating_sum_dict.items()}
    rating_min_dict = {i: min(arr) for i, arr in rating_sum_dict.items()}
    return rating_dict, rating_mean_dict, rating_max_dict, rating_min_dict

In [41]:
user_arr = functools.reduce(lambda x,y: x+y, list(niegh_user_arr.values()), [])
user_set = functools.reduce(lambda x,y: set(y) | x, list(niegh_user_arr.values()), set([]))
count_arr = np.bincount(np.array(user_arr))
item_cnt = [count_arr[i] for i in user_set]

In [42]:
a, b, c, d = get_rating(niegh_user_arr)

In [43]:
user_items = get_users(120)

In [45]:
neigh_item_df = pd.DataFrame({'item id': list(user_set), 'item': itemMap[list(user_set)], 'item_counts': item_cnt})
neigh_item_df['rating_mean'] = neigh_item_df['item id'].map(b)
neigh_item_df['rating_max'] = neigh_item_df['item id'].map(c)
neigh_item_df['rating_min'] = neigh_item_df['item id'].map(d)
np.in1d(list(user_set), user_items)
neigh_item_df.head()

Unnamed: 0,item id,item,item_counts,rating_mean,rating_max,rating_min
0,6,"Hunt for Red October, The (1990)",1,2.0,2,2
1,7,"Jungle Book, The (1994)",2,3.0,4,2
2,10,Men in Black (1997),1,4.0,4,4
3,12,Star Trek: First Contact (1996),1,5.0,5,5
4,13,"To Wong Foo, Thanks for Everything! Julie Newm...",2,2.5,4,1


In [46]:
neigh_item_enc = W[list(user_set), :]
W2d = get_2d(W)
neigh_item_2d = W2d[list(user_set), :]

In [50]:
fig = go.Figure(data=[go.Scatter(x=neigh_item_2d[:, 0], y=neigh_item_2d[:, 1], mode='markers', text=userMap[np.array(list(user_set))-1])])
fig.show()

In [51]:
#get user history
def item_history(item_id, date=None):
    if date:
        cols = ['UserId', 'MovieId', 'Rating', date]
    else:
        cols = ['UserId', 'MovieId', 'Rating']
    user_data = data.loc[data['MovieId']==item_id, cols]
    return user_data

In [52]:
hist_12 = item_history(12, 'Timestamp')
hist_12.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
43580,272,12,5.0,879455254
43581,62,12,4.0,879373613
43582,90,12,5.0,891383241
43583,151,12,5.0,879524368
43584,84,12,5.0,883452874


In [53]:
fig = go.Figure(data=[go.Histogram(x=hist_12['Rating'])])
fig.show()

---

_Item Visualization_ : 
- get items feature scatter plot
- get top@K neighbor items
- get stats of neighbor items based on their users, 
- get item summary stats

In [96]:
class ItemVis(object):
	def __init__(self, df, user_col='UserId', item_col='ItemId', rating_col='rating'):
		self.df = df
		self.user_col = user_col
		self.item_col = item_col 
		self.rating_col = rating_col
		self.sparse_mat, self.userMap, self.itemMap = self.get_sparse_mat(self.df, self.user_col, self.item_col, self.rating_col)
		self.user_enc, self.item_enc = self.__nmf()


	def get_sparse_mat(self, df, user_col, item_col, rating_col):
		unique_items = df[item_col].unique()
		itemMap = pd.DataFrame({'id': np.arange(len(unique_items), dtype=np.int32), 'item': unique_items})
		unique_users = df[user_col].unique()
		userMap = pd.DataFrame({'id': np.arange(len(unique_users), dtype=np.int32), 'user': unique_users})
		row_ind = df[user_col].map(dict(zip(userMap['user'],userMap['id']))).to_numpy().astype(np.int32)
		col_ind = df[item_col].map(dict(zip(itemMap['item'],itemMap['id']))).to_numpy().astype(np.int32)
		data_ = df[rating_col].to_numpy()
		sparse_mat = scipy.sparse.csr_matrix((data_, (row_ind, col_ind)), dtype=np.int32)
		return sparse_mat, userMap, itemMap

	def __nmf(self):
		model = NMF(n_components=10, init='random', random_state=123)
		user = model.fit_transform(self.sparse_mat)
		item = model.components_.T
		return user, item

	def get_2d(self, arr):
		reducer = umap.UMAP()
		res = reducer.fit_transform(arr)
		return res

	def get_item_neigh(self, item_id, k=10):

		def __consine_similarity(y,x):
			x = self.item_enc[x, :]
			result = 1- spatial.distance.cosine(x, y)
			return float(result)

		sim_arr = np.apply_along_axis(__consine_similarity, 1, self.item_enc, x=item_id)
		sim_arr[item_id] = 0

		neighs = np.argpartition(sim_arr, -k)[-k:]
		neigh_sims = sim_arr[neighs]
		neigh_dict = dict(zip(neighs, neigh_sims))
		return neigh_dict
	
	def get_item_users(self, item_id):
		arr_i = np.array([i[0] for i in sparse_mat.getcol(item_id).toarray()])
		user_i = np.where(arr_i != 0)[0].tolist()
		return user_i

	def __dict_update(self, d1, d2):
		for k,v in d2.items():
			try:
				d1[k].append(v)
			except:
				d1[k] = [v]
		return d1

	def get_rating(self, item_list):
		item_dict = {u: self.get_item_users(u) for u in item_list}
		rating_dict = {}
		rating_sum_dict = {}
		for u, arr in item_dict.items():
			rating_dict[u] = np.array([i[0] for i in sparse_mat.getcol(u).toarray()])[arr]
			rating_zip_dict = dict(zip(arr, rating_dict[u]))
			rating_sum_dict = self.__dict_update(rating_sum_dict, rating_zip_dict)
		rating_mean_dict = {i: round(sum(arr)/len(arr), 3) for i,arr in rating_sum_dict.items()}
		rating_max_dict = {i: max(arr) for i, arr in rating_sum_dict.items()}
		rating_min_dict = {i: min(arr) for i, arr in rating_sum_dict.items()}
		return rating_dict, rating_mean_dict, rating_max_dict, rating_min_dict

	def get_item_counts(self, item_list):
		niegh_user_arr = {u: self.get_item_users(u) for u in item_list}
		user_arr = functools.reduce(lambda x,y: x+y, list(niegh_user_arr.values()), [])
		user_set = functools.reduce(lambda x,y: set(y) | x, list(niegh_user_arr.values()), set([]))
		count_arr = np.bincount(np.array(user_arr))
		item_cnt = [count_arr[i] for i in user_set]
		return list(user_set), item_cnt

	def get_neigh_items(self, item, k=10):
		item_id = self.itemMap.loc[self.itemMap['item']==item, 'id'].values
		if len(item_id) > 0:
			item_id == item_id[0]
			neigh_dict = self.get_item_neigh(item_id, k)
			neigh_item_dict, neigh_rating_mean_dict, neigh_rating_max_dict, neigh_rating_min_dict = self.get_rating(list(neigh_dict.keys()))
			item_list, item_count = self.get_item_counts(list(neigh_dict.keys()))
			item_reverse_map = dict(zip(self.itemMap['id'], self.itemMap['item']))
			neigh_item_df = pd.DataFrame({'item id': item_list, 'item': [item_reverse_map[i] for i in item_list], 'count': item_count})
			neigh_item_df['mean rating'] = neigh_item_df['item id'].map(neigh_rating_mean_dict)
			neigh_item_df['max rating'] = neigh_item_df['item id'].map(neigh_rating_max_dict)
			neigh_item_df['min rating'] = neigh_item_df['item id'].map(neigh_rating_min_dict)
		else:
			neigh_item_df = pd.DataFrame()
		return neigh_item_df

	def get_item_history(self, item_id, date=None):
		if date:
			cols = [self.user_col, self.item_col, self.rating_col, date]
		else:
			cols = [self.user_col, self.item_col, self.rating_col]
		user_data = data.loc[data[self.item_col]==item_id, cols]
		return user_data

	


In [97]:
itemVis = ItemVis(data, user_col='UserId', item_col='Title', rating_col='Rating')

In [101]:
item = 'Jungle Book, The (1994)'

In [102]:
item_101_hist = itemVis.get_item_history(item)
item_101_hist.head()

Unnamed: 0,UserId,Title,Rating
1055,253,"Jungle Book, The (1994)",5.0
1056,38,"Jungle Book, The (1994)",5.0
1057,286,"Jungle Book, The (1994)",5.0
1058,210,"Jungle Book, The (1994)",4.0
1059,298,"Jungle Book, The (1994)",4.0


In [103]:
neigh_101_df = itemVis.get_neigh_items(item)
neigh_101_df.head()

Unnamed: 0,item id,item,count,mean rating,max rating,min rating
0,512,Carrie (1976),1,2.0,2,2
1,516,Koyaanisqatsi (1983),2,3.0,3,3
2,517,William Shakespeare's Romeo and Juliet (1996),3,1.333,2,1
3,518,"Thousand Acres, A (1997)",1,5.0,5,5
4,7,"Jungle Book, The (1994)",1,5.0,5,5


In [None]:
[i[1]['item id'] for i in neigh_101_df.iterrows()]

In [82]:
import pandas as pd

In [4]:
df = pd.read_csv('../data/frappe.csv')
df.columns

Index(['user\titem\tcnt\tdaytime\tweekday\tisweekend\thomework\tcost\tweather\tcountry\tcity'], dtype='object')

In [104]:
import numpy as np 

In [105]:
arr1 = np.array([
    [1, 2],
    [4, 6],
    [6, 2]
])
arr2 = np.array([5, 3])

In [107]:
np.vstack((arr1, arr2))

array([[1, 2],
       [4, 6],
       [6, 2],
       [5, 3]])