In [1]:
import numpy as np
import pandas as pd
from time import time

from recsys.collaborative_deep_learning import DeepCollab
import recsys.evaluate as ev

from keras.layers import Dense, Input, Concatenate
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, TensorBoard

Using TensorFlow backend.


In [2]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [3]:
full_ratings = pd.read_csv('data/unprocessed/ratings.csv')

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
user_features = pd.read_csv('data/user_features_final.csv')
users = user_features['user_id'].tolist()

In [7]:
train_users, test_users = train_test_split(users, test_size=0.3)

In [8]:
data_users = pd.read_csv('data/train_ratings_set.csv')
train_data_users = data_users[data_users['user_id'].isin(train_users)]
test_data_users = data_users[data_users['user_id'].isin(test_users)]

In [9]:
train_data_users.to_csv('data/backups/autoencoder_train.csv', index=False)
test_data_users.to_csv('data/backups/autoencoder_test.csv', index=False)

In [4]:
train_data_users = pd.read_csv('data/backups/autoencoder_train.csv')
train_data_users['rating'] = train_data_users.apply(lambda x: x['rating'] / 5, axis=1)

train_data_users = train_data_users.pivot(index='user_id', columns='book_id')

train_data_users.fillna(0, inplace=True)
train_data_users = np.asarray(train_data_users)
train_data_users.shape

(37393, 10000)

In [16]:
train_binary_data_users = pd.read_csv('data/backups/autoencoder_train.csv')
train_binary_data_users['rating'] = train_binary_data_users.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)
train_binary_data_users = train_binary_data_users.pivot(index='user_id', columns='book_id')
train_binary_data_users.fillna(0, inplace=True)
train_binary_data_users = np.asarray(train_binary_data_users)
train_binary_data_users.shape

(37393, 10000)

In [6]:
temp = pd.read_csv('data/backups/autoencoder_train.csv')
train_users = list(set(temp['user_id'].tolist()))
denoised_train_data_users = full_ratings[full_ratings['user_id'].isin(train_users)]
denoised_train_data_users['rating'] = denoised_train_data_users.apply(lambda x: x['rating'] / 5, axis=1)
denoised_train_data_users = denoised_train_data_users.pivot(index='user_id', columns='book_id')
denoised_train_data_users.fillna(0, inplace=True)
denoised_train_data_users = np.asarray(denoised_train_data_users)
denoised_train_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(37393, 10000)

In [17]:
denoised_train_binary_data_users = full_ratings[full_ratings['user_id'].isin(train_users)]
denoised_train_binary_data_users['rating'] = denoised_train_binary_data_users.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)

denoised_train_binary_data_users = denoised_train_binary_data_users.pivot(index='user_id', columns='book_id')
denoised_train_binary_data_users.fillna(0, inplace=True)
denoised_train_binary_data_users = np.asarray(denoised_train_binary_data_users)
denoised_train_binary_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(37393, 10000)

# Generic Denoising Autoencoder
## 5 Star
## Relu Activation
Relu activation for final layer

In [14]:
model = DeepCollab(batch_size=128, hidden_layers=1, user_features=False)
model.fit(train_data_users, denoised_train_data_users)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_3 (Dense)              (None, 10000)             10250000  
Total params: 120,501,024
Trainable params: 120,501,024
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 29914 samples, validate on 7479 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [15]:
model2 = DeepCollab(batch_size=128, hidden_layers=3, user_features=False)
model2.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_6 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_7 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dense_8 (Dense)              (None, 10000)             10250000  
Total params: 121,551,136
Trainable params: 121,551,136
Non-trainable params: 0
______________________________________________________________

In [16]:
model3 = DeepCollab(batch_size=128, hidden_layers=5, user_features=False)
model3.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_10 (Dense)             (None, 1024)              10241024  
_________________________________________________________________
dense_11 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_13 (Dense)             (None, 512)               131584    
_________________________________________________________________
dense_14 (Dense)             (None, 1024)              525312    
__________

In [17]:
model3.autoencoder.save('models/autoencoder_full_noise_5layers.h5')

In [40]:
predictions = model3.predict(test_data_users, test_user_features)



In [55]:
predictions[0]

array([4881435. , 5282606.5, 2000445.1, ...,       0. ,       0. ,
             0. ], dtype=float32)

These predictions explain the low accuracy. We should experiment with the activation function and scaling the ratings.

# Relu with Scaled Ratings

In [16]:
model = DeepCollab(batch_size=64, hidden_layers=1, user_features=False)
model.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_8 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_9 (Dense)              (None, 10000)             10250000  
Total params: 120,501,024
Trainable params: 120,501,024
Non-trainable params: 0
_________________________________________________________________
Train on 29914 samples, validate on 7479 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


# Softmax Activation
Activation changed for final layer in DeelCollab class

In [8]:
train_data_users = pd.read_csv('data/backups/autoencoder_train.csv', header=0)

In [9]:
train_users = list(set(train_data_users['user_id'].tolist()))

In [10]:
train_data_users['rating'] = train_data_users.apply(lambda x: x['rating']/5, axis=1)

In [11]:
train_data_users = train_data_users.pivot(index='user_id', columns='book_id')
train_data_users.fillna(0, inplace=True)
train_data_users = np.asarray(train_data_users)
train_data_users.shape

(37393, 10000)

In [12]:
full_ratings = pd.read_csv('data/unprocessed/ratings.csv', header=0)
denoised_train_data_users = full_ratings[full_ratings['user_id'].isin(train_users)].copy()

denoised_train_data_users['rating'] = denoised_train_data_users.apply(lambda x: x['rating']/5, axis=1)
denoised_train_data_users = denoised_train_data_users.pivot(index='user_id', columns='book_id')
denoised_train_data_users.fillna(0, inplace=True)
denoised_train_data_users = np.asarray(denoised_train_data_users)
denoised_train_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(37393, 10000)

In [13]:
model = DeepCollab(batch_size=64, hidden_layers=1, user_features=False)
model.fit(train_data_users, denoised_train_data_users)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_3 (Dense)              (None, 10000)             10250000  
Total params: 120,501,024
Trainable params: 120,501,024
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 29914 samples, validate on 7479 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [14]:
model = DeepCollab(batch_size=64, hidden_layers=1, user_features=False, epochs=20, earlystopping=False)
model.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_6 (Dense)              (None, 10000)             10250000  
Total params: 120,501,024
Trainable params: 120,501,024
Non-trainable params: 0
_________________________________________________________________
Train on 29914 samples, validate on 7479 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoc

## Binary Training

In [18]:
train_data_users = data_users[data_users['user_id'].isin(train_users)]

train_data_users['rating'] = train_data_users.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)
train_data_users = train_data_users.pivot(index='user_id', columns='book_id')
train_data_users.fillna(0, inplace=True)
train_data_users = np.asarray(train_data_users)
train_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(37393, 10000)

In [19]:
denoised_train_data_users = full_ratings[full_ratings['user_id'].isin(train_users)]
denoised_train_data_users['rating'] = denoised_train_data_users.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)
denoised_train_data_users = denoised_train_data_users.pivot(index='user_id', columns='book_id')
denoised_train_data_users.fillna(0, inplace=True)
denoised_train_data_users = np.asarray(denoised_train_data_users)
denoised_train_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(37393, 10000)

In [20]:
model = DeepCollab(batch_size=128, hidden_layers=1, user_features=False)
model.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 10000)             100010000 
_________________________________________________________________
dense_17 (Dense)             (None, 1024)              10241024  
_________________________________________________________________
dense_18 (Dense)             (None, 10000)             10250000  
Total params: 120,501,024
Trainable params: 120,501,024
Non-trainable params: 0
_________________________________________________________________
Train on 29914 samples, validate on 7479 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [21]:
model2 = DeepCollab(batch_size=128, hidden_layers=3, user_features=False)
model2.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_19 (Dense)             (None, 10000)             100010000 
_________________________________________________________________
dense_20 (Dense)             (None, 1024)              10241024  
_________________________________________________________________
dense_21 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_22 (Dense)             (None, 1024)              525312    
_________________________________________________________________
dense_23 (Dense)             (None, 10000)             10250000  
Total params: 121,551,136
Trainable params: 121,551,136
Non-trainable params: 0
______________________________________________________________

In [22]:
model3 = DeepCollab(batch_size=128, hidden_layers=5, user_features=False)
model3.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_24 (Dense)             (None, 10000)             100010000 
_________________________________________________________________
dense_25 (Dense)             (None, 1024)              10241024  
_________________________________________________________________
dense_26 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_27 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_28 (Dense)             (None, 512)               131584    
_________________________________________________________________
dense_29 (Dense)             (None, 1024)              525312    
__________

In [24]:
model3.autoencoder.save('models/autoencoder_5layers_128batch.h5')

In [26]:
train_data_users.dump('data/backups/autoencoder_train_binary.csv')
denoised_train_data_users.dump('data/backups/autoencoder_train_denoised_binary.csv')

In [25]:
train_data_users = np.load('data/backups/autoencoder_train_binary.csv')
denoised_train_data_users = np.load('data/backups/autoencoder_train_denoised_binary.csv')

In [5]:
model4 = DeepCollab(batch_size=64, hidden_layers=5, user_features=False)
model4.fit(train_data_users, denoised_train_data_users)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_3 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 512)               131584    
_________________________________________________________________
dens

In [6]:
model4 = DeepCollab(batch_size=256, hidden_layers=5, user_features=False)
model4.fit(train_data_users, denoised_train_data_users)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
dense_8 (Dense)              (None, 10000)             100010000 
_________________________________________________________________
dense_9 (Dense)              (None, 1024)              10241024  
_________________________________________________________________
dense_10 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_11 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_12 (Dense)             (None, 512)               131584    
_________________________________________________________________
dense_13 (Dense)             (None, 1024)              525312    
__________

# User Inputs (CDAE)
## 5 Star

In [17]:
temp = pd.read_csv('data/backups/autoencoder_train.csv', header=0)
train_users = list(set(temp['user_id'].tolist()))

In [10]:
user_features = pd.read_csv('data/user_features_final.csv')
train_user_features = user_features[user_features['user_id'].isin(list(train_users))].copy()

In [19]:
train_user_features.shape

(37393, 27)

In [11]:
train_user_features['avg_rating'] = train_user_features.apply(lambda x: x['avg_rating']/5, axis=1)
train_user_features.drop('user_id', axis=1, inplace=True)

In [12]:
train_user_features = np.asarray(train_user_features)

In [38]:
train_data_users = train_data_users.pivot(index='user_id', columns='book_id')
train_data_users.fillna(0, inplace=True)
train_data_users = np.asarray(train_data_users)
train_data_users.shape

(37393, 10000)

In [16]:
denoised_train_data_users = full_ratings[full_ratings['user_id'].isin(train_users)]
denoised_train_data_users = denoised_train_data_users.pivot(index='user_id', columns='book_id')
denoised_train_data_users.fillna(0, inplace=True)
denoised_train_data_users = np.asarray(denoised_train_data_users)
denoised_train_data_users.shape

(37393, 10000)

In [22]:
model = DeepCollab(batch_size=64, hidden_layers=1, user_features=True)
model.fit(train_data_users, denoised_train_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_10 (Dense)                (None, 10000)        100010000   input_4[0][0]                    
__________________________________________________________________________________________________
dense_11 (Dense)                (None, 26)           702         input_5[0][0]                    
__________________________________________________________________________________________________
concatenat

In [23]:
model1 = DeepCollab(batch_size=64, hidden_layers=3, user_features=True)
model1.fit(train_data_users, denoised_train_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_14 (Dense)                (None, 10000)        100010000   input_6[0][0]                    
__________________________________________________________________________________________________
dense_15 (Dense)                (None, 26)           702         input_7[0][0]                    
__________________________________________________________________________________________________
concatenat

In [24]:
model2 = DeepCollab(batch_size=64, hidden_layers=5, user_features=True)
model2.fit(train_data_users, denoised_train_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_20 (Dense)                (None, 10000)        100010000   input_8[0][0]                    
__________________________________________________________________________________________________
dense_21 (Dense)                (None, 26)           702         input_9[0][0]                    
__________________________________________________________________________________________________
concatenat

In [13]:
model3 = DeepCollab(batch_size=64, hidden_layers=5, user_features=True, nodes=2048)
model3.fit(train_data_users, denoised_train_data_users, train_user_features)

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 10000)        100010000   input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 26)           702         input_2[0][0]                    
_____________________________________

In [14]:
model4 = DeepCollab(batch_size=32, hidden_layers=5, user_features=True, nodes=2048)
model4.fit(train_data_users, denoised_train_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 10000)        100010000   input_3[0][0]                    
__________________________________________________________________________________________________
dense_10 (Dense)                (None, 26)           702         input_4[0][0]                    
__________________________________________________________________________________________________
concatenat

In [15]:
model4.autoencoder.save('models/autoencoder_userfeatures_5star.h5')

## Binary

In [18]:
model = DeepCollab(batch_size=32, hidden_layers=5, user_features=True, nodes=2048)
model.fit(train_binary_data_users, denoised_train_binary_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_17 (Dense)                (None, 10000)        100010000   input_5[0][0]                    
__________________________________________________________________________________________________
dense_18 (Dense)                (None, 26)           702         input_6[0][0]                    
__________________________________________________________________________________________________
concatenat

In [19]:
model1 = DeepCollab(batch_size=64, hidden_layers=5, user_features=True, nodes=2048)
model1.fit(train_binary_data_users, denoised_train_binary_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
dense_25 (Dense)                (None, 10000)        100010000   input_7[0][0]                    
__________________________________________________________________________________________________
dense_26 (Dense)                (None, 26)           702         input_8[0][0]                    
__________________________________________________________________________________________________
concatenat

In [20]:
model2 = DeepCollab(batch_size=128, hidden_layers=5, user_features=True, nodes=2048)
model2.fit(train_binary_data_users, denoised_train_binary_data_users, train_user_features)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 26)           0                                            
__________________________________________________________________________________________________
dense_33 (Dense)                (None, 10000)        100010000   input_9[0][0]                    
__________________________________________________________________________________________________
dense_34 (Dense)                (None, 26)           702         input_10[0][0]                   
__________________________________________________________________________________________________
concatenat

In [39]:
model2.autoencoder.save('models/cdae_binary.h5')

# Testing Best Models

In [42]:
test_data_users = pd.read_csv('data/backups/autoencoder_test.csv', header=0)
test_users = list(set(test_data_users['user_id'].tolist()))
test_data_users['rating'] = test_data_users.apply(lambda x: x['rating']/5, axis=1)
test_data_users = test_data_users.pivot(index='user_id', columns='book_id')
test_data_users.fillna(0, inplace=True)
test_data_users = np.asarray(test_data_users)
test_data_users.shape

(16026, 10000)

In [43]:
denoised_test_data_users = full_ratings[full_ratings['user_id'].isin(test_users)]
denoised_test_data_users['rating'] = denoised_test_data_users.apply(lambda x: x['rating']/5, axis=1)
denoised_test_data_users = denoised_test_data_users.pivot(index='user_id', columns='book_id')
denoised_test_data_users.fillna(0, inplace=True)
denoised_test_data_users = np.asarray(denoised_test_data_users)
denoised_test_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(16026, 10000)

In [23]:
user_features = pd.read_csv('data/user_features_final.csv')
test_user_features = user_features[user_features['user_id'].isin(list(test_users))].copy()
test_user_features['avg_rating'] = test_user_features.apply(lambda x: x['avg_rating']/5, axis=1)
test_user_features.drop('user_id', axis=1, inplace=True)
test_user_features = np.asarray(test_user_features)
test_user_features.shape

(16026, 26)

In [49]:
test_binary_data_users = pd.read_csv('data/backups/autoencoder_test.csv', header=0)

test_binary_data_users['rating'] = test_binary_data_users.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)
test_binary_data_users = test_binary_data_users.pivot(index='user_id', columns='book_id')
test_binary_data_users.fillna(0, inplace=True)
test_binary_data_users = np.asarray(test_binary_data_users)
test_binary_data_users.shape

(16026, 10000)

In [50]:
denoised_test_binary_data_users = full_ratings[full_ratings['user_id'].isin(test_users)]
denoised_test_binary_data_users['rating'] = denoised_test_binary_data_users.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)
denoised_test_binary_data_users = denoised_test_binary_data_users.pivot(index='user_id', columns='book_id')
denoised_test_binary_data_users.fillna(0, inplace=True)
denoised_test_binary_data_users = np.asarray(denoised_test_binary_data_users)
denoised_test_binary_data_users.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(16026, 10000)

### 5 Star

In [44]:
predictions = model4.predict(test_data_users, test_user_features)



In [45]:
predictions.dump('data/backups/cdae_5star_predictions')

In [46]:
predictions.shape

(16026, 10000)

In [47]:
train_ratings = pd.read_csv('data/backups/autoencoder_train.csv', header=0)
full_ratings = pd.read_csv('data/unprocessed/ratings.csv', header=0)

pred_dfs = []

for index, user in enumerate(test_users):
    train_books = train_ratings[train_ratings['user_id'] == user]['book_id'].tolist()
    all_books = full_ratings[full_ratings['user_id'] == user]['book_id'].tolist()
    test_books = [book for book in all_books if book not in train_books]
    idx = [b - 1 for b in test_books]
    filter_books = full_ratings[full_ratings['user_id'] == user]
    test_book_ratings = filter_books[filter_books['book_id'].isin(test_books)]['rating'].tolist()

    #print(len(test_book_ratings))
    raw_predictions = predictions[index]
    raw_predictions = raw_predictions[idx]
    #print(raw_predictions.shape)
    df = pd.DataFrame({'book_id': test_books, 
                       'rating': test_book_ratings,

                       'pred_proba': raw_predictions})
    df['user_id'] = user
    pred_dfs.append(df)
    
evaluate_df = pd.concat(pred_dfs, ignore_index=True)

evaluate_df.head()

Unnamed: 0,book_id,pred_proba,rating,user_id
0,1158,0.566804,3,32771
1,2411,0.330137,4,32771
2,148,0.869025,4,32771
3,195,0.113886,4,32771
4,291,0.20099,5,32771


In [48]:
evaluate_df.to_csv('cdae_5star_evaluate.csv', index=False)

In [2]:
star_eval = pd.read_csv('data/cdae_5star_evaluate.csv')

In [7]:
# Reverse the original scaling
star_eval['scaled_raw'] = star_eval.apply(lambda x: x['pred_proba'] * 5, axis=1)

min_p = star_eval['scaled_raw'].min()
max_p = star_eval['scaled_raw'].max()

# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
def scale_prediction(x, min_p, max_p):
    raw = (x - min_p) / (max_p - min_p)
    return np.rint(raw * 4) + 1

star_eval['prediction'] = star_eval.apply(lambda x: scale_prediction(x['scaled_raw'], min_p, max_p), axis=1)
star_eval['prediction'].value_counts()

1.0    1410663
2.0     306733
3.0      69737
4.0       2328
5.0          2
Name: prediction, dtype: int64

In [8]:
cfm, p, r = ev.replay_5star_results(star_eval)

In [9]:
cfm

array([[ 27808,   8031,   1362,     17,      0],
       [ 85429,  18544,   3210,     67,      0],
       [335845,  64964,  11421,    294,      0],
       [512056, 106961,  22811,    708,      0],
       [449525, 108233,  30933,   1242,      2]])

In [10]:
print(p, r)

0.48065991265173286 0.03268187160058632


In [12]:
aps, mAP, binary_mAP, binary_aps, skipped = ev.top_n_5star_results(star_eval, 10)

  recall = tps / tps[-1]


0 users evaluated
500 users evaluated
1000 users evaluated
1500 users evaluated
2000 users evaluated
2500 users evaluated
3000 users evaluated
3500 users evaluated
4000 users evaluated
4500 users evaluated
5000 users evaluated
5500 users evaluated
6000 users evaluated
6500 users evaluated
7000 users evaluated
7500 users evaluated
8000 users evaluated
8500 users evaluated
9000 users evaluated
9500 users evaluated
10000 users evaluated
10500 users evaluated
11000 users evaluated
11500 users evaluated
12000 users evaluated
12500 users evaluated
13000 users evaluated
13500 users evaluated
14000 users evaluated
14500 users evaluated
15000 users evaluated
15500 users evaluated
16000 users evaluated


In [13]:
print(mAP, binary_mAP, skipped)

0.04473979782852864 0.6709846499438413 0


## Binary

In [51]:
predictions = model2.predict(test_data_users, test_user_features)
predictions.dump('data/backups/cdae_binary_predictions')



In [52]:
train_ratings = pd.read_csv('data/backups/autoencoder_train.csv', header=0)
full_ratings = pd.read_csv('data/unprocessed/ratings.csv', header=0)

pred_dfs = []

for index, user in enumerate(test_users):
    train_books = train_ratings[train_ratings['user_id'] == user]['book_id'].tolist()
    all_books = full_ratings[full_ratings['user_id'] == user]['book_id'].tolist()
    test_books = [book for book in all_books if book not in train_books]
    idx = [b - 1 for b in test_books]
    filter_books = full_ratings[full_ratings['user_id'] == user]
    test_book_ratings = filter_books[filter_books['book_id'].isin(test_books)]['rating'].tolist()

    #print(len(test_book_ratings))
    raw_predictions = predictions[index]
    raw_predictions = raw_predictions[idx]
    #print(raw_predictions.shape)
    df = pd.DataFrame({'book_id': test_books, 
                       'rating': test_book_ratings,
                       'pred_proba': raw_predictions})
    df['user_id'] = user
    pred_dfs.append(df)
    
evaluate_df = pd.concat(pred_dfs, ignore_index=True)

evaluate_df.head()

Unnamed: 0,book_id,pred_proba,rating,user_id
0,1158,0.508821,3,32771
1,2411,0.014855,4,32771
2,148,1.192314,4,32771
3,195,0.153827,4,32771
4,291,0.31718,5,32771


In [53]:
evaluate_df.to_csv('cdae_binary_evaluate.csv', index=False)

In [14]:
binary_eval = pd.read_csv('data/cdae_binary_evaluate.csv')
binary_eval.head()

Unnamed: 0,book_id,pred_proba,rating,user_id
0,1158,0.508822,3,32771
1,2411,0.014855,4,32771
2,148,1.192314,4,32771
3,195,0.153827,4,32771
4,291,0.31718,5,32771


In [15]:
binary_eval['binary_rating'] = binary_eval.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)

In [18]:
min_p = binary_eval['pred_proba'].min()
max_p = binary_eval['pred_proba'].max()

def scale_prediction_binary(x, min_p, max_p):
    raw = (x - min_p) / (max_p - min_p)
    return np.rint(raw)

binary_eval['prediction'] = binary_eval.apply(lambda x: scale_prediction_binary(x['pred_proba'], min_p, max_p), axis=1)
binary_eval['prediction'].value_counts()

0.0    1731176
1.0      58287
Name: prediction, dtype: int64

In [19]:
cfm, p, r = ev.replay_binary_results(binary_eval)

In [20]:
cfm

array([[ 545269,   11723],
       [1185907,   46564]])

In [21]:
print(p, r)

0.7988745346303635 0.03778101066881087


In [24]:
mAP, skipped = ev.top_n_binary_results(binary_eval, 10)

0 users evaluated


  recall = tps / tps[-1]


500 users evaluated
1000 users evaluated
1500 users evaluated
2000 users evaluated
2500 users evaluated
3000 users evaluated
3500 users evaluated
4000 users evaluated
4500 users evaluated
5000 users evaluated
5500 users evaluated
6000 users evaluated
6500 users evaluated
7000 users evaluated
7500 users evaluated
8000 users evaluated
8500 users evaluated
9000 users evaluated
9500 users evaluated
10000 users evaluated
10500 users evaluated
11000 users evaluated
11500 users evaluated
12000 users evaluated
12500 users evaluated
13000 users evaluated
13500 users evaluated
14000 users evaluated
14500 users evaluated
15000 users evaluated
15500 users evaluated
16000 users evaluated


In [25]:
mAP

0.6628229127667541

# Cold Start

In [54]:
cold_start_features = np.zeros((1, 26))
cold_start_ratings = np.zeros((1, 10000))

In [55]:
cold_start_features.shape

(1, 26)

In [56]:
cold_start = model4.predict(cold_start_ratings, cold_start_features)



In [57]:
cold_start

array([[4.567486  , 4.439316  , 2.8567214 , ..., 0.        , 0.06534243,
        0.        ]], dtype=float32)

In [58]:
cold_start.dump('data/backups/cold_start_5star_cdae')

In [26]:
coldstart_5star = np.load('data/backups/cold_start_5star_cdae')

In [27]:
cold_start_users = pd.read_csv('data/cold_start_ratings_set.csv')

In [40]:
pred_dfs = []
users = list(set(cold_start_users['user_id'].tolist()))

for index, user in enumerate(users):
    
    all_books = cold_start_users[cold_start_users['user_id'] == user]['book_id'].tolist()
    idx = [b - 1 for b in all_books]
    filter_books = cold_start_users[cold_start_users['user_id'] == user]
    test_book_ratings = filter_books[filter_books['book_id'].isin(all_books)]['rating'].tolist()

    #print(len(test_book_ratings))

    raw_predictions = coldstart_5star[:,idx]

    #print(raw_predictions.shape)
    df = pd.DataFrame({'book_id': all_books, 
                       'rating': test_book_ratings,
                       'pred_proba': raw_predictions[0]})
    df['user_id'] = user
    pred_dfs.append(df)
    
evaluate_df = pd.concat(pred_dfs, ignore_index=True)

evaluate_df.head()

Unnamed: 0,book_id,pred_proba,rating,user_id
0,258,1.03308,5,1
1,268,0.872772,3,1
2,5556,0.0,3,1
3,3638,0.051928,3,1
4,1796,0.136489,5,1


In [39]:
coldstart_5star[:, [1, 2, 3]]

array([[4.439316 , 2.8567214, 4.2510595]], dtype=float32)

In [41]:
evaluate_df['scaled_raw'] = star_eval.apply(lambda x: x['pred_proba'] * 5, axis=1)

min_p = evaluate_df['scaled_raw'].min()
max_p = evaluate_df['scaled_raw'].max()

# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
def scale_prediction(x, min_p, max_p):
    raw = (x - min_p) / (max_p - min_p)
    return np.rint(raw * 4) + 1

evaluate_df['prediction'] = evaluate_df.apply(lambda x: scale_prediction(x['scaled_raw'], min_p, max_p), axis=1)
evaluate_df['prediction'].value_counts()

1.0    322
2.0    113
3.0     47
4.0     20
5.0      5
Name: prediction, dtype: int64

In [44]:
cfm, p, r = ev.replay_5star_results(evaluate_df)
cfm

array([[ 35,  10,   2,   7,   2],
       [ 22,  11,   7,   2,   0],
       [ 79,  28,  13,   1,   1],
       [114,  36,  12,   5,   0],
       [ 72,  28,  13,   5,   2]])

In [45]:
print(p, r)

0.26364917741227656 0.1301775147928994


In [46]:
aps, mAP, binary_mAP, binary_aps, skipped = ev.top_n_5star_results(evaluate_df, 10)

  recall = tps / tps[-1]


0 users evaluated


In [47]:
mAP

0.0

In [48]:
binary_mAP

0.66

In [59]:
cold_start2 = model2.predict(cold_start_ratings, cold_start_features)
cold_start2



array([[4.131628  , 4.276492  , 1.8483028 , ..., 0.04341072, 0.08637094,
        0.03407854]], dtype=float32)

In [60]:
cold_start2.dump('data/backups/cold_start_binary_cdae')

In [49]:
cold_start_binary = np.load('data/backups/cold_start_binary_cdae')

pred_dfs = []
users = list(set(cold_start_users['user_id'].tolist()))

for index, user in enumerate(users):
    
    all_books = cold_start_users[cold_start_users['user_id'] == user]['book_id'].tolist()
    idx = [b - 1 for b in all_books]
    filter_books = cold_start_users[cold_start_users['user_id'] == user]
    test_book_ratings = filter_books[filter_books['book_id'].isin(all_books)]['rating'].tolist()

    #print(len(test_book_ratings))

    raw_predictions = cold_start_binary[:,idx]

    #print(raw_predictions.shape)
    df = pd.DataFrame({'book_id': all_books, 
                       'rating': test_book_ratings,
                       'pred_proba': raw_predictions[0]})
    df['user_id'] = user
    pred_dfs.append(df)
    
evaluate_df = pd.concat(pred_dfs, ignore_index=True)

evaluate_df.head()

Unnamed: 0,book_id,pred_proba,rating,user_id
0,258,0.937875,5,1
1,268,0.951219,3,1
2,5556,0.049214,3,1
3,3638,0.148281,3,1
4,1796,0.218136,5,1


In [50]:
evaluate_df['binary_rating'] = evaluate_df.apply(lambda x: 1 if x['rating'] > 3 else 0, axis=1)

min_p = evaluate_df['pred_proba'].min()
max_p = evaluate_df['pred_proba'].max()

def scale_prediction_binary(x, min_p, max_p):
    raw = (x - min_p) / (max_p - min_p)
    return np.rint(raw)

evaluate_df['prediction'] = evaluate_df.apply(lambda x: scale_prediction_binary(x['pred_proba'], min_p, max_p), axis=1)
evaluate_df['prediction'].value_counts()

0.0    443
1.0     64
Name: prediction, dtype: int64

In [51]:
cfm, p, r = ev.replay_binary_results(evaluate_df)
cfm

array([[203,  17],
       [240,  47]])

In [52]:
print(p, r)

0.734375 0.16376306620209058


In [53]:
mAP, skipped = ev.top_n_binary_results(evaluate_df, 10)

0 users evaluated


  recall = tps / tps[-1]


In [54]:
mAP

0.52