In [None]:
! pip install umap-learn
! pip install plotly==5.7.0

In [1]:
import numpy as np 
import pandas as pd 
import scipy 
import umap
import collections
import functools
from datetime import datetime

from recommenders.datasets import movielens
from sklearn.decomposition import NMF
from scipy import spatial

import matplotlib.pyplot as plt 
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [2]:
data = movielens.load_pandas_df(
    size='100k',
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='Title'
)

data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)
data['datetime'] = data['Timestamp'].apply(lambda x: datetime.fromtimestamp(x))
data.head()

100%|██████████| 4.81k/4.81k [00:09<00:00, 518KB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title,datetime
0,196,242,3.0,881250949,Kolya (1996),1997-12-04 21:55:49
1,63,242,3.0,875747190,Kolya (1996),1997-10-02 05:06:30
2,226,242,5.0,883888671,Kolya (1996),1998-01-04 10:37:51
3,154,242,3.0,879138235,Kolya (1996),1997-11-10 11:03:55
4,306,242,5.0,876503793,Kolya (1996),1997-10-10 23:16:33


In [3]:
# data.to_csv('../data/movielens.csv', index=False)

In [4]:
# data['UserId'].unique()

In [5]:
print(f"# users : {data['UserId'].nunique()} | # items : {data['MovieId'].nunique()}")

# users : 943 | # items : 1682


In [6]:
user_data = data.loc[data['UserId'] == 196, :]
user_data.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title,datetime
0,196,242,3.0,881250949,Kolya (1996),1997-12-04 21:55:49
1518,196,257,2.0,881251577,Men in Black (1997),1997-12-04 22:06:17
13157,196,111,4.0,881251793,"Truth About Cats & Dogs, The (1996)",1997-12-04 22:09:53
13622,196,25,4.0,881251955,"Birdcage, The (1996)",1997-12-04 22:12:35
14668,196,382,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",1997-12-04 22:10:43


In [7]:
user_data['datetime'].min()

Timestamp('1997-12-04 21:55:49')

In [8]:
itemMap = data['Title'].unique()
userMap = data['UserId'].unique()

In [9]:
movies = pd.Series(data['Title'].values, index=data['MovieId'].values).drop_duplicates()
movies.sort_index(inplace=True)

sparse_df = data.loc[:, ['UserId', 'MovieId', 'Rating']].sort_values(['UserId', 'MovieId'])
row_ind = sparse_df['UserId'].to_numpy()
col_ind = sparse_df['MovieId'].to_numpy()
data_ = sparse_df['Rating'].to_numpy()
# inds = list(zip(row_ind, col_ind))
# data_arr = list(zip(data_, inds))

sparse_mat = scipy.sparse.csr_matrix((data_, (row_ind, col_ind)), dtype=np.int32)

In [10]:
model = NMF(n_components=10, init='random', random_state=123)
W = model.fit_transform(sparse_mat)
H = model.components_.T



In [11]:
print(f"W shape : {W.shape} | H shape : {H.shape}")

W shape : (944, 10) | H shape : (1683, 10)


In [12]:
def get_2d(arr):
    reducer = umap.UMAP()
    res = reducer.fit_transform(arr)
    return res

In [13]:
w_reduced = get_2d(W)

In [14]:
w_reduced.shape

(944, 2)

In [15]:
fig = go.Figure(data=[go.Scatter(x=w_reduced[:, 0], y=w_reduced[:, 1], mode='markers')])
fig.show()

In [16]:
def cosine_similarity(y,x):
    x = W[x, :]
    # print(x,y)
    result = 1- spatial.distance.cosine(x, y)
    # print(result)
    return float(result)


In [17]:
ref_point = 12
sim_arr = np.apply_along_axis(cosine_similarity, 1, W, x=ref_point) #.astype(np.float32)
sim_arr[ref_point] = 0


invalid value encountered in double_scalars



In [18]:
k = 10 
inds = np.argpartition(sim_arr, -k)[-k:]
inds

array([371, 781, 175, 538, 610, 530,  65, 719, 575,   0])

In [19]:
topK = sim_arr[inds]
topK

array([0.96671171, 0.96922194, 0.96941333, 0.97102941, 0.97573818,
       0.97526049, 0.9750163 , 0.97274461, 0.97273098, 1.        ])

In [20]:
trace1 = go.Scatter(x=w_reduced[inds, 0], y=w_reduced[inds, 1], mode='markers')
trace2 = go.Scatter(x=[w_reduced[ref_point, 0]], y=[w_reduced[ref_point, 1]], mode='markers')

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(trace1)
fig.add_trace(trace2,secondary_y=True)
fig['layout'].update(height = 600, width = 800, title = "Customer Similarity",xaxis=dict(
      tickangle=-90
    ))
fig.show()


In [21]:
arr_12 = sparse_mat.getrow(12).toarray()[0]
item_12 = np.where(arr_12 != 0)[0]
item_12

array([  4,  15,  28,  50,  69,  71,  82,  88,  96,  97,  98, 127, 132,
       133, 143, 157, 159, 161, 168, 170, 172, 174, 191, 195, 196, 200,
       202, 203, 204, 215, 216, 228, 238, 242, 276, 282, 300, 318, 328,
       381, 392, 402, 416, 471, 480, 591, 684, 708, 735, 753, 754])

In [23]:
def get_items(user_id):
    arr_i = sparse_mat.getrow(user_id).toarray()[0]
    item_i = np.where(arr_i != 0)[0].tolist()
    return item_i

niegh_item_arr = {u: get_items(u) for u in inds}
# niegh_item_arr = np.array([get_items(u) for u in inds])
# niegh_item_arr

In [24]:
def dict_update(d1, d2):
    for k,v in d2.items():
        try:
            d1[k].append(v)
        except:
            d1[k] = [v]
    return d1

def get_rating(item_dict):
    rating_dict = {}
    rating_sum_dict = {}
    for u, arr in item_dict.items():
        rating_dict[u] = sparse_mat.getrow(u).toarray()[0][arr]
        rating_zip_dict = dict(zip(arr, rating_dict[u]))
        rating_sum_dict = dict_update(rating_sum_dict, rating_zip_dict)
    rating_mean_dict = {i: sum(arr)/len(arr) for i,arr in rating_sum_dict.items()}
    rating_max_dict = {i: max(arr) for i, arr in rating_sum_dict.items()}
    rating_min_dict = {i: min(arr) for i, arr in rating_sum_dict.items()}
    return rating_dict, rating_mean_dict, rating_max_dict, rating_min_dict

In [25]:
item_arr = functools.reduce(lambda x,y: x+y, list(niegh_item_arr.values()), [])
item_set = functools.reduce(lambda x,y: set(y) | x, list(niegh_item_arr.values()), set([]))
count_arr = np.bincount(np.array(item_arr))
item_cnt = [count_arr[i] for i in item_set]


In [26]:
a, b, c, d = get_rating(niegh_item_arr)

In [27]:
user_items = get_items(ref_point)

In [28]:
neigh_item_df = pd.DataFrame({'item id': list(item_set), 'item': itemMap[list(item_set)], 'item_counts': item_cnt, 'already_watched': np.in1d(list(item_set), user_items)})
neigh_item_df['rating_mean'] = neigh_item_df['item id'].map(b)
neigh_item_df['rating_max'] = neigh_item_df['item id'].map(c)
neigh_item_df['rating_min'] = neigh_item_df['item id'].map(d)
np.in1d(list(item_set), user_items)
neigh_item_df.head()

Unnamed: 0,item id,item,item_counts,already_watched,rating_mean,rating_max,rating_min
0,1,L.A. Confidential (1997),3,False,3.666667,4,3
1,514,SubUrbia (1997),1,False,4.0,4,4
2,4,Jackie Brown (1997),1,True,3.0,3,3
3,516,Koyaanisqatsi (1983),1,False,3.0,3,3
4,7,"Jungle Book, The (1994)",3,False,1.666667,2,1


In [29]:
neigh_item_enc = H[list(item_set), :]
H2d = get_2d(H)
neigh_item_2d = H2d[list(item_set), :]

In [30]:
fig = go.Figure(data=[go.Scatter(x=neigh_item_2d[:, 0], y=neigh_item_2d[:, 1], mode='markers', text=itemMap[list(item_set)])])
fig.show()

In [34]:
#get user history
def user_history(user_id, date=None):
    if date:
        cols = ['UserId', 'MovieId', 'Rating', date]
    else:
        cols = ['UserId', 'MovieId', 'Rating']
    user_data = data.loc[data['UserId']==user_id, cols]
    return user_data

In [35]:
hist_12 = user_history(12, 'Timestamp')
hist_12.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
34,12,242,5.0,879960826
2742,12,392,4.0,879959025
5052,12,98,5.0,879959068
5556,12,88,5.0,879960826
6941,12,4,5.0,879960826


In [38]:
fig = go.Figure(data=[go.Histogram(x=hist_12['Rating'])])
fig.show()

___

_User Visualization_ : 

- [x] user feature scatter-plot
- [x] get top@K neighbors of a user
- [x] vis user and its' neighbors in a scatter-plot
- [x] get top@N popular items among user neighbor
- [x] get user item history 

- users classify according to #items purchased
- hist plot on ratings (overall & user level and item level)
- 

In [51]:
class UserVis(object):
    def __init__(self, df, user_col='UserId', item_col='ItemId', rating_col='rating'):
        self.df = df
        self.user_col = user_col
        self.item_col = item_col 
        self.rating_col = rating_col
        self.sparse_mat, self.userMap, self.itemMap = self.get_sparse_mat(self.df, self.user_col, self.item_col, self.rating_col)
        self.user_enc, self.item_enc = self.__nmf()


    def get_sparse_mat(self, df, user_col, item_col, rating_col):
        row_ind = df[user_col].to_numpy()
        col_ind = df[item_col].to_numpy()
        data_ = df[rating_col].to_numpy()
        sparse_mat = scipy.sparse.csr_matrix((data_, (row_ind, col_ind)), dtype=np.int32)
        userMap = df[user_col].unique()
        itemMap = df[item_col].unique()
        return sparse_mat, userMap, itemMap

    def __nmf(self):
        model = NMF(n_components=10, init='random', random_state=123)
        user = model.fit_transform(self.sparse_mat)
        item = model.components_.T
        return user, item

    def get_2d(self, arr):
        reducer = umap.UMAP()
        res = reducer.fit_transform(arr)
        return res

    def get_user_neigh(self, user_id, k=10):

        def __consine_similarity(y,x):
            x = self.user_enc[x, :]
            result = 1- spatial.distance.cosine(x, y)
            return float(result)

        sim_arr = np.apply_along_axis(__consine_similarity, 1, self.user_enc, x=user_id)
        sim_arr[user_id] = 0

        neighs = np.argpartition(sim_arr, -k)[-k:]
        neigh_sims = sim_arr[neighs]
        neigh_dict = dict(zip(neighs, neigh_sims))
        return neigh_dict
    
    def get_user_items(self, user_id):
        arr_i = self.sparse_mat.getrow(user_id).toarray()[0]
        item_i = np.where(arr_i != 0)[0].tolist()
        return item_i

    def __dict_update(self, d1, d2):
        for k,v in d2.items():
            try:
                d1[k].append(v)
            except:
                d1[k] = [v]
        return d1

    def get_rating(self, user_list):
        item_dict = {u: self.get_user_items(u) for u in user_list}
        rating_dict = {}
        rating_sum_dict = {}
        for u, arr in item_dict.items():
            rating_dict[u] = sparse_mat.getrow(u).toarray()[0][arr]
            rating_zip_dict = dict(zip(arr, rating_dict[u]))
            rating_sum_dict = self.__dict_update(rating_sum_dict, rating_zip_dict)
        rating_mean_dict = {i: sum(arr)/len(arr) for i,arr in rating_sum_dict.items()}
        rating_max_dict = {i: max(arr) for i, arr in rating_sum_dict.items()}
        rating_min_dict = {i: min(arr) for i, arr in rating_sum_dict.items()}
        return rating_dict, rating_mean_dict, rating_max_dict, rating_min_dict

    def get_item_counts(self, user_list):
        niegh_item_arr = {u: self.get_user_items(u) for u in user_list}
        item_arr = functools.reduce(lambda x,y: x+y, list(niegh_item_arr.values()), [])
        item_set = functools.reduce(lambda x,y: set(y) | x, list(niegh_item_arr.values()), set([]))
        count_arr = np.bincount(np.array(item_arr))
        item_cnt = [count_arr[i] for i in item_set]
        return list(item_set),item_cnt

    def get_neigh_items(self, user_id):
        neigh_dict = self.get_user_neigh(user_id)
        user_items = self.get_user_items(user_id)
        neigh_item_dict, neigh_rating_mean_dict, neigh_rating_max_dict, neigh_rating_min_dict = self.get_rating(list(neigh_dict.keys()))
        item_list, item_count = self.get_item_counts(list(neigh_dict.keys()))
        neigh_item_df = pd.DataFrame({'item id': item_list, 'item': self.itemMap[np.array(item_list)-1], 'count': item_count, 'already_watched': np.in1d(item_list, user_items)})
        neigh_item_df['mean rating'] = neigh_item_df['item id'].map(neigh_rating_mean_dict)
        neigh_item_df['max rating'] = neigh_item_df['item id'].map(neigh_rating_max_dict)
        neigh_item_df['min rating'] = neigh_item_df['item id'].map(neigh_rating_min_dict)
        return neigh_item_df

    def get_user_history(self, user_id, date=None):
        if date:
            cols = [self.user_col, self.item_col, self.rating_col, date]
        else:
            cols = [self.user_col, self.item_col, self.rating_col]
        user_data = data.loc[data[self.user_col]==user_id, cols]
        return user_data

    


In [52]:
userVis = UserVis(data, user_col='UserId', item_col='MovieId', rating_col='Rating')


Maximum number of iterations 200 reached. Increase it to improve convergence.



In [41]:
user_ft, item_ft = userVis.user_enc, userVis.item_enc

In [42]:
user_ft2d, item_ft2d = userVis.get_2d(user_ft), userVis.get_2d(item_ft)

In [43]:
fig = go.Figure(data=[go.Scatter(x=user_ft2d[:, 0], y=user_ft2d[:, 1], mode='markers')])
fig.show()

In [47]:
user_101_hist = userVis.get_user_history(101)
user_101_hist.head()

Unnamed: 0,UserId,MovieId,Rating
1538,101,257,4.0
1870,101,222,3.0
3123,101,118,3.0
3479,101,1,3.0
4031,101,546,4.0


In [53]:
neigh_101_df = userVis.get_neigh_items(101)
neigh_101_df.head()

181 181



invalid value encountered in double_scalars



Unnamed: 0,item id,item,count,already_watched,mean rating,max rating,min rating
0,1,302,7,True,4.142857,5,3
1,515,1428,1,False,4.0,4,4
2,1028,633,3,True,3.666667,5,3
3,7,465,6,True,3.333333,5,2
4,9,86,6,False,3.166667,5,1
