# Book Recommender System in TensorFlow

In [25]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [26]:
# --- Training hyper-parameters ---------------------------------------------
learning_rate = 0.3   # step size handed to the optimizer
epochs = 10           # number of full passes over the user-item matrix
batch_size = 25       # approximate number of user rows per training batch

# --- Values not read by any active code in the visible notebook ------------
display_step = 10
k = 900

In [27]:
# The connection string is read from the environment so that credentials never
# live in the notebook (or in its version history), e.g.
#   export BOOKS_DATABASE_URL='postgresql://<user>:<password>@localhost:5432/ece651'
# NOTE(review): the password that used to be hard-coded here should be rotated.
database_url = os.environ.get('BOOKS_DATABASE_URL')
if not database_url:
    raise RuntimeError(
        'Set the BOOKS_DATABASE_URL environment variable to the PostgreSQL '
        'connection string before running this notebook.'
    )

sql_reviews = 'SELECT user_id, book_id, rating, date_created FROM public."Reviews"'

sql_books = 'SELECT book_id FROM public."Books"'

engine = create_engine(database_url)

df = pd.read_sql(sql_reviews, engine)
df_books = pd.read_sql(sql_books, engine)

# Books that do not appear in any review.
i1 = df_books.set_index('book_id').index
i2 = df.set_index('book_id').index
books = df_books[~i1.isin(i2)]

# Give every unreviewed book one placeholder row (user 0, rating 0, date 0) so
# that it still shows up as a column once the reviews are pivoted.
rows, column = books.shape
empty_array = np.zeros((rows, 1))
unrated_books = np.hstack((empty_array, books.values, empty_array, empty_array))
unrated_books = pd.DataFrame(unrated_books)
unrated_books.columns = ['user_id', 'book_id', 'rating', 'date_created']

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df, unrated_books], ignore_index=True)
df.shape

(1052, 4)

### Read the dataset and split it into a training set and a test set

In [28]:
# The review timestamp is split off as `y`; only user, book and rating remain
# in `df`.
y = df.date_created
df = df.drop('date_created', axis=1)

df.columns = ['user', 'book', 'rating']

# A fixed seed makes the 80/20 split identical on every Restart & Run All.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=split_seed
)

# NOTE(review): train_data / test_data are not read by any later cell visible
# in this notebook; the model below is fitted on the full user-item matrix.
train_data = X_train
test_data = X_test

num_books = df.book.nunique()
num_users = df.user.nunique()

print("USERS: {} BOOKS: {}".format(num_users, num_books))

USERS: 148 BOOKS: 866


### Prepare the three columns (user, book and rating): min-max normalize each to [0, 1]

In [29]:
# Normalize in [0, 1]

def _min_max_scale(values):
    """Min-max scale a 1-D array to [0, 1].

    Parameters
    ----------
    values : array-like
        Raw column values; converted to float before scaling.

    Returns
    -------
    scaled : numpy.ndarray
        1-D array of the scaled values, in the same order as ``values``.
    minimum : float
        Smallest raw value.
    value_range : float
        Largest raw value minus ``minimum``.

    The raw values can be recovered (up to floating-point rounding) with
    ``scaled * value_range + minimum``.
    """
    values = np.asarray(values, dtype=float)
    minimum = values.min()
    value_range = values.max() - minimum
    scaled = preprocessing.MinMaxScaler().fit_transform(values.reshape(-1, 1))
    # Flatten to 1-D so the result is assigned to the column positionally
    # instead of being aligned on a DataFrame index.
    return scaled.ravel(), minimum, value_range


# The minimum and range of each column are kept so that later cells can map
# the normalized ids back to the original ones.
user_scaled, user_min, user_range = _min_max_scale(df['user'].values)
df['user'] = user_scaled

book_scaled, book_min, book_range = _min_max_scale(df['book'].values)
df['book'] = book_scaled

rating_scaled, rating_min, rating_range = _min_max_scale(df['rating'].values)
df['rating'] = rating_scaled

### Convert the DataFrame into a user-item matrix

In [30]:
# One row per user, one column per book; (user, book) pairs that have no
# rating are filled with 0.
matrix = df.pivot(index='user', columns='book', values='rating').fillna(0)

### Users and items, ordered as they are in the matrix

In [31]:
# Remember the row (user) and column (book) labels before the labelled frame
# is replaced by its bare array of values; position i in each list labels
# row / column i of the array.
users, books = matrix.index.tolist(), matrix.columns.tolist()

matrix = matrix.values

print("Matrix shape: {}".format(matrix.shape))

Matrix shape: (148, 866)


### Network Parameters

In [32]:
# Layer sizes: the input/output width is the number of books, squeezed through
# two progressively smaller hidden layers.
num_input = num_books   # num of items
num_hidden_1 = 10       # 1st layer num features
num_hidden_2 = 5        # 2nd layer num features (the latent dim)

# One row per user, one column per book.
X = tf.placeholder(tf.float64, [None, num_input])


def _gaussian_variable(*shape):
    """Return a float64 tf.Variable of the given shape, initialised from tf.random_normal."""
    return tf.Variable(tf.random_normal(list(shape), dtype=tf.float64))


# Weight matrices, keyed by the layer that uses them.
weights = {
    'encoder_h1': _gaussian_variable(num_input, num_hidden_1),
    'encoder_h2': _gaussian_variable(num_hidden_1, num_hidden_2),
    'decoder_h1': _gaussian_variable(num_hidden_2, num_hidden_1),
    'decoder_h2': _gaussian_variable(num_hidden_1, num_input),
}

# Bias vectors, one per layer, sized to that layer's output width.
biases = {
    'encoder_b1': _gaussian_variable(num_hidden_1),
    'encoder_b2': _gaussian_variable(num_hidden_2),
    'decoder_b1': _gaussian_variable(num_hidden_1),
    'decoder_b2': _gaussian_variable(num_input),
}

### Building the encoder

In [33]:
def encoder(x):
    """Pass ``x`` through the two encoder layers and return the second layer's output.

    Each layer computes ``sigmoid(input @ W + b)`` using the matching entries
    of the module-level ``weights`` and ``biases`` dictionaries.
    """
    activation = x
    for weight_key, bias_key in (('encoder_h1', 'encoder_b1'),
                                 ('encoder_h2', 'encoder_b2')):
        pre_activation = tf.add(tf.matmul(activation, weights[weight_key]), biases[bias_key])
        activation = tf.nn.sigmoid(pre_activation)
    return activation

### Building the decoder

In [34]:
def decoder(x):
    """Pass ``x`` through the two decoder layers and return the second layer's output.

    Each layer computes ``sigmoid(input @ W + b)`` using the matching entries
    of the module-level ``weights`` and ``biases`` dictionaries, so every
    output value lies in (0, 1).
    """
    activation = x
    for weight_key, bias_key in (('decoder_h1', 'decoder_b1'),
                                 ('decoder_h2', 'decoder_b2')):
        pre_activation = tf.add(tf.matmul(activation, weights[weight_key]), biases[bias_key])
        activation = tf.nn.sigmoid(pre_activation)
    return activation

### Construct model

In [35]:
# Chain the two halves: the placeholder X goes through the encoder, and the
# encoder's output goes through the decoder.
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

### Prediction

In [36]:
# The decoder output is used as the model's prediction.
y_pred = decoder_op

### Targets are the input data.

In [37]:
# The target is the input placeholder itself: the prediction is compared
# against the very rows that were fed in.
y_true = X

### Define loss and optimizer, minimize the squared error

In [38]:
# Mean squared error between the targets and the predictions.
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)

# RMSProp training op that minimises that loss.
rmsprop = tf.train.RMSPropOptimizer(learning_rate)
optimizer = rmsprop.minimize(loss)

# Empty frame that the training cell fills with the predicted ratings.
predictions = pd.DataFrame()

### Define evaluation metrics

In [39]:
# Integer placeholders for a precision metric: eval_x is passed as the labels
# and eval_y as the predictions.
# NOTE(review): neither placeholder is fed anywhere in the visible notebook.
eval_x = tf.placeholder(tf.int32)
eval_y = tf.placeholder(tf.int32)

precision_ops = tf.metrics.precision(labels=eval_x, predictions=eval_y)
pre, pre_op = precision_ops

### Initialize the variables

In [40]:
# Initialiser ops for global and local variables; both are run at the start of
# the training session.
# NOTE(review): the local variables are presumably the counters behind
# tf.metrics.precision — confirm.
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

### Train the Model

In [41]:
# Split the user rows into batches of roughly `batch_size` rows. At least one
# batch is always used, so a matrix with fewer than `batch_size` rows neither
# makes np.array_split fail nor causes a division by zero below. The batches
# get their own name so the global `matrix` stays an intact 2-D array even if
# this cell is interrupted and re-run.
num_batches = max(1, matrix.shape[0] // batch_size)
batches = np.array_split(matrix, num_batches)

with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    for i in range(epochs):

        avg_cost = 0

        for batch in batches:
            # One optimizer step on this batch; `batch_loss` is the loss for that batch.
            _, batch_loss = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += batch_loss

        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

    print("Predictions...")

    # Run every user row through the trained network in one go; this has to
    # happen while the session (and therefore the trained weights) is open.
    preds = session.run(decoder_op, feed_dict={X: matrix})

# Everything below works on plain NumPy / pandas data and needs no session.

# Reshape the (user x book) prediction array into long form: one row per
# (user position, book position, predicted rating). DataFrame.append was
# removed in pandas 2.0, so the frame is built directly from the array.
predictions = pd.DataFrame(preds)
predictions = predictions.stack().reset_index(name='rating')
predictions.columns = ['user', 'book', 'rating']

# Replace the positional indices with the labels of the corresponding matrix
# row / column.
predictions['user'] = predictions['user'].map(lambda value: users[value])
predictions['book'] = predictions['book'].map(lambda value: books[value])

print(predictions)
print(predictions.shape)

# Per user, highest predicted rating first.
recs = predictions
recs = recs.sort_values(['user', 'rating'], ascending=[True, False])

# NOTE(review): `user` and `book` still hold the min-max-normalized values at
# this point, so prediction.csv contains normalized ids rather than the
# original ones — confirm that this is what consumers of the file expect.
recs.to_csv('prediction.csv', sep=',', index=False, header=False)

Epoch: 1 Loss: 0.3260860204696655
Epoch: 2 Loss: 0.32541964650154115
Epoch: 3 Loss: 0.3245575726032257
Epoch: 4 Loss: 0.32344475388526917
Epoch: 5 Loss: 0.3220130860805511
Epoch: 6 Loss: 0.3201813757419586
Epoch: 7 Loss: 0.3178592026233673
Epoch: 8 Loss: 0.31496134400367737
Epoch: 9 Loss: 0.3114425539970398
Epoch: 10 Loss: 0.307342404127121
Predictions...
        user      book    rating
0        0.0  0.000000  0.091016
1        0.0  0.000407  0.087472
2        0.0  0.000815  0.329781
3        0.0  0.005295  0.512410
4        0.0  0.007739  0.910046
5        0.0  0.008554  0.348876
6        0.0  0.008961  0.238703
7        0.0  0.015886  0.811999
8        0.0  0.016293  0.091394
9        0.0  0.016701  0.569199
10       0.0  0.017515  0.093598
11       0.0  0.018330  0.229532
12       0.0  0.018737  0.517796
13       0.0  0.019145  0.137648
14       0.0  0.019552  0.159122
15       0.0  0.019959  0.327069
16       0.0  0.020367  0.195748
17       0.0  0.020774  0.688759
18       0.0  0

In [42]:
# Undo the min-max scaling so `user` and `book` hold the original ids again.
# The scale / unscale round trip leaves tiny floating-point errors, which
# would make exact comparisons such as `recs['user'] == 2380` unreliable, so
# the recovered ids are rounded to the nearest whole number.
# Assumes ids are whole numbers (later cells compare against integer ids and
# already round book ids) — TODO confirm.
# NOTE: this cell rewrites `recs` in place; run it once per training run,
# re-running it on its own would apply the un-scaling twice.
recs['user'] = (recs['user'] * user_range + user_min).round()
recs['book'] = (recs['book'] * book_range + book_min).round()

recs.sort_values(['user', 'rating'], ascending=[True, False])

Unnamed: 0,user,book,rating
320,0.0,487.0,0.990523
489,0.0,816.0,0.990183
117,0.0,193.0,0.984451
31,0.0,88.0,0.984270
559,0.0,951.0,0.974646
594,0.0,1003.0,0.974327
663,0.0,1156.0,0.974144
477,0.0,801.0,0.973073
561,0.0,953.0,0.968619
279,0.0,442.0,0.966386


In [43]:
# All rows of `recs` that belong to user 2380.
recs[recs['user'] == 2380]

Unnamed: 0,user,book,rating
64573,2380.0,816.0,0.989200
64404,2380.0,487.0,0.989169
64115,2380.0,88.0,0.983133
64201,2380.0,193.0,0.982176
64561,2380.0,801.0,0.979874
64643,2380.0,951.0,0.977487
64678,2380.0,1003.0,0.975958
64747,2380.0,1156.0,0.974727
64645,2380.0,953.0,0.974411
64363,2380.0,442.0,0.971110


In [44]:
# Shape of the `book` column restricted to user 2380's rows.
recs.loc[recs['user'] == 2380, 'book'].shape

(866,)

In [45]:
# Report every expected book id that is missing from user 2380's rows.
user_2380_top = recs.loc[recs['user'] == 2380]

expected_2380_book_ids = [382.0, 670.0, 662.0, 375.0, 677.0]

# Rounded book ids of user 2380, computed once instead of once per expected id.
recommended_2380_book_ids = user_2380_top['book'].values.round()
for x in expected_2380_book_ids:
    if x in recommended_2380_book_ids:
        continue
    print(f'Couldn\'t find {x} for user 2380')

In [46]:
# All rows of `recs` that belong to user 1.
recs[recs['user'] == 1]

Unnamed: 0,user,book,rating
1355,1.0,816.0,0.990124
1186,1.0,487.0,0.988206
1343,1.0,801.0,0.984925
897,1.0,88.0,0.982526
1425,1.0,951.0,0.982147
983,1.0,193.0,0.980974
1427,1.0,953.0,0.980554
1460,1.0,1003.0,0.979420
1529,1.0,1156.0,0.975811
1414,1.0,939.0,0.974586


In [47]:
# Shape of the `book` column restricted to user 1's rows.
recs.loc[recs['user'] == 1, 'book'].shape

(866,)

In [48]:
# Report every expected book id that is missing from user 1's rows.
user_1_top = recs.loc[recs['user'] == 1]

expected_1_book_ids = [1387.0, 1374.0, 1420.0, 1526.0, 1308.0, 1384.0, 1210.0, 1385.0]

# Rounded book ids of user 1, computed once instead of once per expected id.
recommended_1_book_ids = user_1_top['book'].values.round()
for x in expected_1_book_ids:
    if x in recommended_1_book_ids:
        continue
    print(f'Couldn\'t find {x} for user 1')