In [496]:
import numpy as np
import pandas as pd
import time
from collections import defaultdict
import seaborn as sns
import warnings
from IPython.display import clear_output

warnings.filterwarnings('ignore')

np.random.seed(1)

% matplotlib inline

In [498]:
ratings

Unnamed: 0,user,item,rating
0,A1JH5J1KQAUBMP,B00005U0JX,True
1,A1RSXP7MB772E3,B001DHXT1G,True
2,AJGP5XYKKBGBG,0792840054,True
3,A2SQJPUCZNHMZE,B005LAIHSG,True
4,A3QVAKVRAH657N,B00005K3OU,False
5,A2YM3KILJ1G0YJ,B005LAIGPA,True
6,A35K06H07L5POJ,B00009QG7O,False
7,AK5PXHJI808E6,B002EP8FF6,False
8,A2J57EQ9KPGXR9,6301963725,True
9,A2UQK3DAZ8NO2T,0792843576,True


In [497]:
ratings = pd.read_csv('../data/reviews_sample_100.csv').drop(['Unnamed: 0', 'reviewTime'], axis = 1)
ratings.columns = ['item', 'user', 'rating']
ratings = ratings[['user', 'item', 'rating']]
ratings['rating'] = ratings['rating'].astype(int)
ratings['rating'] = (ratings['rating'] >= 4).astype(bool)
ratings.head()

Unnamed: 0,user,item,rating
0,A1JH5J1KQAUBMP,B00005U0JX,True
1,A1RSXP7MB772E3,B001DHXT1G,True
2,AJGP5XYKKBGBG,0792840054,True
3,A2SQJPUCZNHMZE,B005LAIHSG,True
4,A3QVAKVRAH657N,B00005K3OU,False


In [494]:
class PLSI():
    
    def __init__(self, n_factors = 5, n_iters = 5,
                 verbose = False, user = 'user', item = 'item', rating = 'rating'):
        self.n_items = 0
        self.n_users = 0
        self.n_latent_factors = n_factors
        self.n_iters = n_iters
        
        self.verbose = verbose
        
        self.user = user
        self.item = item
        self.rating = rating
        
    def _make_bool_matrix(self):
        self.data.loc[:,self.rating] = self.data[self.rating].astype(bool)
        data_matrix = self.data.pivot_table(index=self.user, columns=self.item).fillna(False)
        return data_matrix
        
    def _train_initialize(self, data):
        
        self.data = data[data[self.rating] > 0]
        self.data_zero = data.copy()
        self.data_matrix = self._make_bool_matrix()
        print('num users:', self.data_matrix.shape[0])
        print('num items:', self.data_matrix.shape[1])
        
        print('proportion positive:', len(self.data) / len(self.data_zero))

        self.user_array = np.array(self.data_matrix.index)
        self.item_array = np.array(self.data_matrix.columns.levels[-1])
        
        self.n_users = len(self.user_array)
        self.n_items = len(self.item_array)
        self.n_impl_ratings = self.data[self.rating].sum()
        
        self.items_dict = defaultdict(list)
        for index, value in enumerate(self.item_array):
            self.items_dict[value] = index
            
        self.users_dict = defaultdict(list)
        for index, value in enumerate(self.user_array):
            self.users_dict[value] = index
            
        print('')
            
    def _param_initialize(self):
        self.prob_z_given_user = np.random.rand(self.n_users, self.n_latent_factors)
        self.prob_item_given_z = np.random.rand(self.n_latent_factors, self.n_items)
        self.prob_z_given_user_item = np.random.rand(self.n_latent_factors)
        
        user_group = self.data.groupby(self.user)
        self.count_user = user_group[self.rating].sum()
        self.prob_user = self.count_user / self.n_impl_ratings
        
        item_group = self.data.groupby(self.item)
        self.count_item = item_group[self.rating].sum()
    
    def _update_params(self):
        
        self.prob_item_given_user = np.dot(self.prob_z_given_user, self.prob_item_given_z)
        
        for z in range(self.n_latent_factors):
            #print('\n===========Z={}==========='.format(z))
            
            #print('\n=======E-step=======')
            
            prob_z_given_user_item = np.dot(self.prob_z_given_user[:,z].reshape(1, -1).T,\
                                            self.prob_item_given_z[z,:].reshape(1, -1)) / \
                                            self.prob_item_given_user
            
            data_matrix_z = self.data_matrix * prob_z_given_user_item
            
            #print('\n=======LOOP 1=======')
            prob_z_given_user_num_array = data_matrix_z.sum(axis = 1)
            prob_z_given_user_den_array = self.data_matrix.sum(axis = 1)
            
            prob_z_given_user_array = prob_z_given_user_num_array.values / prob_z_given_user_den_array
            self.prob_z_given_user[:,z] = prob_z_given_user_array
            
            #print('\n=======LOOP 2=======')
            prob_item_given_z_den = data_matrix_z.sum().sum()
            prob_item_given_z_num_array = data_matrix_z.sum(axis = 0)
            prob_item_given_z_array = prob_item_given_z_num_array / prob_item_given_z_den
            self.prob_item_given_z[z, :] = prob_item_given_z_array
            #clear_output()

    def _calc_log_likelihood(self):

        summand_1 = (self.data_matrix * np.log(self.prob_item_given_user)).sum().sum()
        prob_user_array = (self.data_matrix.sum(axis = 1) / self.data_matrix.sum().sum()).values
        count_user_array = self.data_matrix.sum(axis = 1)
        
        summand_2 = (count_user_array * np.log(prob_user_array)).sum()
        log_likelihood = summand_1 + summand_2
        
        return log_likelihood
    
    def _calc_joint_matrix(self):
        
        total_instances = self.data_matrix.sum().sum()
        
        prob_user_array = (self.data_matrix.sum(axis = 1) / self.data_matrix.sum().sum()).values.reshape(1, -1).T
        self.prob_joint_user_item = self.prob_item_given_user * prob_user_array * total_instances
        return self.prob_joint_user_item
        
    def fit(self, data):
        self._train_initialize(data)
        self._param_initialize()
        
        for i in range(self.n_iters):
            self._update_params()
        
            if self.verbose:
                print('\n==================ITER {}=================='.format(i+1))
                
                log_l = self._calc_log_likelihood()
                print('log-likelihood:', log_l)
    
    def predict_proba(self, data):
        pred_list = list()
        for row_index in range(len(data)):
            row = data.iloc[row_index, :]
            user = row[self.user]
            item = row[self.item]
            #rating = row[self.rating]
            
            item_index = self.items_dict[item]
            user_index = self.users_dict[user]
            
            if isinstance(item_index, list) or isinstance(user_index, list):
                rating_pred = 0
            else:
                rating_pred = self.prob_item_given_user[user_index][item_index]
            pred_list.append(rating_pred)
            
        pred_array = np.array(pred_list)
        
        return pred_array
    
    def recommend_top_k(self, user, k = 10):
        user_index = self.users_dict[user]
        item_list = self.data_matrix.columns.levels[-1]
        probas = self.prob_item_given_user[user_index,:]
        indices = np.argsort(probas)[::-1]
        return item_list[indices], probas[indices]
    
    def calc_precision(self, data, k = 10):
        user_array = data[self.user].unique()
        
        precision_list = list()
        i = 0
        for user in user_array:
            print(i, user)
            i += 1
            recommendations, probas = self.recommend_top_k(user, k)
            
            precision_list_user = list()
            for recommendation in recommendations:
                query = data[(data[self.user] == user) & (data[self.item] == recommendation)]
                if len(query) > 0:
                    precision_list_user.append(query[self.rating].values[0])
                if len(precision_list_user) >= k:
                    break
            if len(precision_list_user):
                precision_list.append(sum(precision_list_user) / len(precision_list_user))
                print(sum(precision_list_user), len(precision_list_user))
        if len(precision_list):
            return sum(precision_list) / len(precision_list)
        else:
            return None
        
ratings_train = ratings[:10000]

plsi = PLSI(n_factors = 20, n_iters = 10, verbose = True)
plsi.fit(ratings_train)

num users: 3648
num items: 3880
proportion positive: 0.7162


log-likelihood: -45123.022721525434

log-likelihood: -112047.76612560177

log-likelihood: -109593.3690855049

log-likelihood: -105966.92804931925

log-likelihood: -102254.8721617948

log-likelihood: -99636.0206777163

log-likelihood: -98110.30033766363

log-likelihood: -97251.43083084497

log-likelihood: -96761.15744349209

log-likelihood: -96453.69897886063


In [495]:
plsi.calc_precision(ratings[10000:10300])

0 A36EW68H08UOCS
2 2
1 A2GA2SO3YK6CXI
1 1
2 A2IG2FG7L85RMU
3 APZC5PRJVM0PY
4 A30N4CJVI6U4XE
1 1
5 A3LX5ELWX6HYK6
6 A1F6O5CPQWVZCR
7 A2W3KOTHFGVNMT
8 A25CMSI6S7S63C
0 1
9 A1KAS2W1TD8UOU
10 AOYTQFRLD8VLP
1 1
11 ANAYSRE3LX8GZ
12 ANCOMAI0I7LVG
2 3
13 A2IL4AJIXXXLEM
14 A29764MQVYWKDB
0 1
15 A3S78KBAE6O5FH
1 1
16 A3UUGZRTBS81YN
17 ALWB64XOXNMDP
1 1
18 A2SQJPUCZNHMZE
1 1
19 A3M333GYMKIG8A
20 A1E9QU27DMFRGS
21 A30T2B7RFWGYNU
22 A3I7EHVQ4AAKFV
23 A4G51W9PC7TJL
24 A2KBEW2GOSLS3X
1 1
25 A1NSUJJCG73IRF
1 1
26 A1BN9LZB9575P3
27 AD0J5KK4WQXNS
1 1
28 A6VXZ1EEPRTLV
29 AWG2O9C42XW5G
1 1
30 A3062ECWSS2TRC
31 A1R47IFMN86BJN
32 A1VHK9A4VLJTHC
1 1
33 A1JH5J1KQAUBMP
1 1
34 AW4MT9WP95TVD
1 1
35 A3GKPMHV4U64IY
36 A1PPIFR51T81IK
1 1
37 A3SLA4ADDSYZJ2
38 A28AIG15GAA581
1 1
39 A1R2URG8OHNEDS
1 1
40 A228JH8V9Y700G
1 1
41 A3LZGLA88K0LA0
2 3
42 A2UUYORVIZ00UH
43 A2CU4L3OK00M23
1 1
44 A3KKOJ6T4DCBDT
45 A1EAMAQ8BSI0LY
1 1
46 A3HLUNEJUT3MSP
1 1
47 A11PTCZ2FM2547
1 1
48 A2OMDT29QMLAWQ
1 1
49 A31U2QT7SAL7K
50 A1CLHLW9PF

0.7342995169082126

In [490]:
plsi.calc_precision(df_6, k=5)

0 q
3 5
1 r
3 5
2 s
3 5
3 t
1 5
4 u
3 5
5 v
2 5
6 w


0.49999999999999994

In [484]:
np.random.seed(2)

df_6 = pd.DataFrame([[1,1,1,0,0,0],
                [1,1,1,0,0,0],
                [1,1,1,0,0,0],
                [0,0,0,0,1,0],
                [0,1,0,1,1,0],
                [0,0,0,1,1,0],
                [0,0,0,0,0,0]],
                columns = list('abcdef'), index = list('qrstuvw')).reset_index()
df_6 = df_6.melt(value_vars=list('abcdef'), id_vars = 'index')
df_6.columns = ['user', 'item', 'rating']
df_6['rating'] = df_6['rating'].astype(bool)
#df_6 = df_6.sample(38, replace=False)

plsi = PLSI(n_factors = 2, n_iters = 8)
plsi.fit(df_6)

np.round(plsi.prob_item_given_user, 2)

num users: 6
num items: 5
proportion positive: 0.35714285714285715



array([[ 0.3 ,  0.31,  0.3 ,  0.04,  0.05],
       [ 0.33,  0.32,  0.32,  0.01,  0.01],
       [ 0.33,  0.32,  0.32,  0.01,  0.01],
       [ 0.01,  0.18,  0.02,  0.31,  0.47],
       [ 0.02,  0.19,  0.03,  0.3 ,  0.45],
       [ 0.01,  0.18,  0.02,  0.31,  0.47]])

In [485]:
plsi.recommend_top_k('t')

(Index(['e', 'd', 'b', 'c', 'a'], dtype='object', name='item'),
 array([ 0.47112396,  0.31406748,  0.18338243,  0.02109381,  0.01033233]))