# Book Recommender System in Tensorflow

In [131]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [132]:
k = 900

epochs = 100
display_step = 10

learning_rate = 0.3

batch_size = 25

In [133]:
sql_reviews = 'SELECT user_id, book_id, rating, date_created FROM public."Reviews"'

sql_books = 'SELECT book_id FROM public."Books"'

engine = create_engine('postgresql://ece651_ml:TVL3MV0mguz0DOhLbbm2@localhost:5432/ece651')

df = pd.pandas.read_sql(sql_reviews, engine)
df_books = pd.pandas.read_sql(sql_books, engine)

In [134]:
df

Unnamed: 0,user_id,book_id,rating,date_created
0,2292,360,5,2019-03-11 12:26:14.930797
1,2293,360,5,2019-03-11 12:26:14.930797
2,2294,360,5,2019-03-11 12:26:14.930797
3,2297,655,4,2019-03-11 12:26:14.930797
4,2295,360,5,2019-03-11 12:26:14.930797
5,2307,777,5,2019-03-11 12:26:14.930797
6,2296,360,4,2019-03-11 12:26:14.930797
7,2297,360,4,2019-03-11 12:26:14.930797
8,2298,360,5,2019-03-11 12:26:14.930797
9,2318,780,5,2019-03-11 12:26:14.930797


In [135]:
i1 = df_books.set_index('book_id').index
i2 = df.set_index('book_id').index
books = df_books[~i1.isin(i2)]

In [136]:
books

Unnamed: 0,book_id
0,21
1,22
2,23
3,34
4,40
5,42
6,43
7,60
8,61
9,62


In [137]:
rows, column = books.shape
empty_array = np.zeros((rows, 1))
unrated_books = np.hstack((empty_array, books.values, empty_array, empty_array))
unrated_books = pd.DataFrame(unrated_books)
unrated_books.columns = ['user_id', 'book_id', 'rating', 'date_created']

df = df.append(unrated_books, ignore_index=True)
df.shape

(1052, 4)

In [138]:
df

Unnamed: 0,user_id,book_id,rating,date_created
0,2292.0,360.0,5.0,2019-03-11 12:26:14.930797
1,2293.0,360.0,5.0,2019-03-11 12:26:14.930797
2,2294.0,360.0,5.0,2019-03-11 12:26:14.930797
3,2297.0,655.0,4.0,2019-03-11 12:26:14.930797
4,2295.0,360.0,5.0,2019-03-11 12:26:14.930797
5,2307.0,777.0,5.0,2019-03-11 12:26:14.930797
6,2296.0,360.0,4.0,2019-03-11 12:26:14.930797
7,2297.0,360.0,4.0,2019-03-11 12:26:14.930797
8,2298.0,360.0,5.0,2019-03-11 12:26:14.930797
9,2318.0,780.0,5.0,2019-03-11 12:26:14.930797


### Reading Dataset and splitting it in a training set and a test set

In [139]:
y = df.date_created
df = df.drop('date_created', axis=1)

df.columns = ['user', 'book', 'rating']

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

train_data = X_train
test_data = X_test

In [140]:
train_data

Unnamed: 0,user,book,rating
564,0.0,512.0,0.0
640,0.0,688.0,0.0
773,0.0,977.0,0.0
488,0.0,424.0,0.0
480,0.0,415.0,0.0
331,0.0,168.0,0.0
822,0.0,1080.0,0.0
500,0.0,437.0,0.0
482,0.0,417.0,0.0
226,1.0,1526.0,5.0


In [141]:
num_books = df.book.nunique()
num_users = df.user.nunique()

print("USERS: {} BOOKS: {}".format(num_users, num_books))

USERS: 149 BOOKS: 866


### Loading training set with three columns: user, book and ratings

In [142]:
# Normalize in [0, 1]

u = df['user'].values.astype(float)

user_min = u.min()
user_range = u.max() - u.min()

In [143]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(u.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['user'] = df_normalized

In [144]:
df

Unnamed: 0,user,book,rating
0,0.904856,360.0,5.0
1,0.905251,360.0,5.0
2,0.905645,360.0,5.0
3,0.906830,655.0,4.0
4,0.906040,360.0,5.0
5,0.910778,777.0,5.0
6,0.906435,360.0,4.0
7,0.906830,360.0,4.0
8,0.907225,360.0,5.0
9,0.915120,780.0,5.0


In [145]:
b = df['book'].values.astype(float)

book_min = b.min()
book_range = b.max() - b.min()

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(b.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['book'] = df_normalized
df

Unnamed: 0,user,book,rating
0,0.904856,0.138086,5.0
1,0.905251,0.138086,5.0
2,0.905645,0.138086,5.0
3,0.906830,0.258248,4.0
4,0.906040,0.138086,5.0
5,0.910778,0.307943,5.0
6,0.906435,0.138086,4.0
7,0.906830,0.138086,4.0
8,0.907225,0.138086,5.0
9,0.915120,0.309165,5.0


In [146]:
r = df['rating'].values.astype(float)

rating_min = r.min()
rating_range = r.max() - r.min()

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['rating'] = df_normalized
df

Unnamed: 0,user,book,rating
0,0.904856,0.138086,1.0
1,0.905251,0.138086,1.0
2,0.905645,0.138086,1.0
3,0.906830,0.258248,0.8
4,0.906040,0.138086,1.0
5,0.910778,0.307943,1.0
6,0.906435,0.138086,0.8
7,0.906830,0.138086,0.8
8,0.907225,0.138086,1.0
9,0.915120,0.309165,1.0


### Convert DataFrame in user-item matrix

In [147]:
matrix = df.pivot(index='user', columns='book', values='rating')
matrix.fillna(0, inplace=True)

In [148]:
matrix.shape

(149, 866)

In [149]:
for user in range(0, 148):
    for book in range(0, 865):
        if matrix.iloc[user, book] != 0.0:
            print(user,book, matrix.iloc[user,book])

1 693 1.0
1 708 0.8
1 734 1.0
1 735 1.0
1 736 1.0
1 738 1.0
1 743 1.0
1 746 1.0
2 202 1.0
2 399 1.0
2 400 1.0
3 202 1.0
3 399 1.0
3 400 1.0
4 202 1.0
4 399 1.0
4 400 1.0
5 202 1.0
5 399 1.0
5 400 1.0
6 202 0.8
6 399 0.8
6 400 0.8
7 202 0.8
7 399 0.8
7 400 0.8
8 202 1.0
8 399 1.0
8 400 1.0
9 455 1.0
9 456 1.0
9 466 1.0
9 467 1.0
9 469 1.0
9 473 1.0
10 455 1.0
10 457 1.0
10 458 1.0
11 455 0.6000000000000001
11 456 0.8
11 471 1.0
11 472 1.0
12 455 1.0
13 455 1.0
14 455 1.0
15 455 1.0
16 455 1.0
16 458 1.0
17 456 1.0
18 456 1.0
18 458 1.0
19 456 1.0
20 456 1.0
21 456 1.0
22 456 1.0
23 458 1.0
23 459 1.0
24 458 1.0
25 458 1.0
26 458 1.0
27 458 1.0
27 465 1.0
28 504 0.8
29 466 0.4
29 471 0.6000000000000001
30 466 0.8
31 466 1.0
32 466 1.0
32 468 1.0
32 515 1.0
33 466 0.8
34 466 1.0
35 466 1.0
36 468 1.0
37 468 0.8
38 468 1.0
39 468 0.6000000000000001
40 468 0.8
41 468 0.8
41 469 0.8
42 468 0.8
42 469 0.6000000000000001
43 469 1.0
44 469 1.0
45 469 0.8
46 469 1.0
47 469 1.0
47 470 1.0
48 490 

### Users and items ordered as they are in matrix

In [150]:
users = matrix.index.tolist()
books = matrix.columns.tolist()

matrix = matrix.values

print("Matrix shape: {}".format(matrix.shape))

Matrix shape: (149, 866)


### Network Parameters

In [151]:
num_input = num_books   # num of items
num_hidden_1 = 10       # 1st layer num features
num_hidden_2 = 5        # 2nd layer num features (the latent dim)

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

### Building the encoder

In [152]:
def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

### Building the decoder

In [153]:
def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

### Construct model

In [154]:
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

### Prediction

In [155]:
y_pred = decoder_op

### Targets are the input data.

In [156]:
y_true = X

### Define loss and optimizer, minimize the squared error

In [157]:
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

predictions = pd.DataFrame()

### Define evaluation metrics

In [158]:
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

### Initialize the variables

In [159]:
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

### Train the Model

In [160]:
with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    num_batches = int(matrix.shape[0] / batch_size)
    matrix = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

        # if i % display_step == 0 or i == 1:
        #     print('Step %i: Minibatch Loss: %f' % (i, l))

    print("Predictions...")

    matrix = np.concatenate(matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: matrix})

    # print(matrix)
    # print(preds)
    
    predictions = predictions.append(pd.DataFrame(preds))

    predictions = predictions.stack().reset_index(name='rating')
    predictions.columns = ['user', 'book', 'rating']
    predictions['user'] = predictions['user'].map(lambda value: users[value])
    predictions['book'] = predictions['book'].map(lambda value: books[value])

    print(predictions)
    print(predictions.shape)
    
    keys = ['user', 'book']
    i1 = predictions.set_index(keys).index
    i2 = df.set_index(keys).index

    recs = predictions
    recs = recs.sort_values(['user', 'rating'], ascending=[True, False])
    recs = recs.groupby('user').head(k)
    recs.to_csv('prediction.csv', sep=',', index=False, header=False)

Epoch: 1 Loss: 0.36583629846572874
Epoch: 2 Loss: 0.3649981677532196
Epoch: 3 Loss: 0.3639024317264557
Epoch: 4 Loss: 0.3624683678150177
Epoch: 5 Loss: 0.36059013605117796
Epoch: 6 Loss: 0.35812965631484983
Epoch: 7 Loss: 0.35490967631340026
Epoch: 8 Loss: 0.3507173717021942
Epoch: 9 Loss: 0.34533209800720216
Epoch: 10 Loss: 0.3386188566684723
Epoch: 11 Loss: 0.3306865692138672
Epoch: 12 Loss: 0.321924215555191
Epoch: 13 Loss: 0.3126800417900085
Epoch: 14 Loss: 0.3029721021652222
Epoch: 15 Loss: 0.2926967978477478
Epoch: 16 Loss: 0.2822574138641357
Epoch: 17 Loss: 0.2723504066467285
Epoch: 18 Loss: 0.2618617057800293
Epoch: 19 Loss: 0.24895212352275847
Epoch: 20 Loss: 0.23211152851581573
Predictions...
        user      book    rating
0        0.0  0.000000  0.457678
1        0.0  0.000407  0.313247
2        0.0  0.000815  0.106197
3        0.0  0.005295  0.115287
4        0.0  0.007739  0.270109
5        0.0  0.008554  0.405806
6        0.0  0.008961  0.311619
7        0.0  0.015886  

In [161]:
recs['user'] = recs['user'] * user_range + user_min
recs['book'] = recs['book'] * book_range + book_min
print(user_min)
recs.sort_values(['user', 'rating'], ascending=[True, False])

0.0


Unnamed: 0,user,book,rating
830,0.0,2222.0,0.962578
628,0.0,1080.0,0.962555
786,0.0,1750.0,0.958704
825,0.0,2152.0,0.950810
809,0.0,2090.0,0.944616
27,0.0,83.0,0.937709
745,0.0,1510.0,0.935604
151,0.0,303.0,0.933667
776,0.0,1693.0,0.930339
381,0.0,612.0,0.927263


In [162]:
recs.loc[recs['user'] == 2380]

Unnamed: 0,user,book,rating
64914,2380.0,2222.0,0.944172
64235,2380.0,303.0,0.940229
64712,2380.0,1080.0,0.938972
64870,2380.0,1750.0,0.936113
64829,2380.0,1510.0,0.934473
64909,2380.0,2152.0,0.933772
64893,2380.0,2090.0,0.931682
64465,2380.0,612.0,0.927003
64907,2380.0,2150.0,0.926271
64860,2380.0,1693.0,0.924538


In [163]:
recs.loc[recs['user'] == 2380]['book'].shape

(866,)

In [164]:
user_2380_top = recs.loc[recs['user'] == 2380].head(20)

expected_2380_book_ids = [382,670,662,375,677];
for x in expected_2380_book_ids:
    if x not in user_2380_top['book'].values.round(): 
        print(f'Couldn\'t find {x} for user 2380')

Couldn't find 382 for user 2380
Couldn't find 670 for user 2380
Couldn't find 662 for user 2380
Couldn't find 375 for user 2380
Couldn't find 677 for user 2380


In [23]:
recs.loc[recs['user'] == 1]

Unnamed: 0,user,book,rating
1391,1.0,913.0,0.989935
1589,1.0,1363.0,0.984772
966,1.0,170.0,0.983759
1520,1.0,1141.0,0.982608
1731,1.0,2476.0,0.978708
1486,1.0,1062.0,0.976282
1204,1.0,512.0,0.975342
1286,1.0,682.0,0.972632
1250,1.0,619.0,0.964075
1713,1.0,2343.0,0.963863


In [24]:
recs.loc[recs['user'] == 1]['book'].shape

(866,)

In [25]:
user_1_top = recs.loc[recs['user'] == 1]

expected_1_book_ids = [1387,1374,1420,1526,1308,1384,1210,1385];
for x in expected_1_book_ids:
    if x not in user_1_top['book'].values.round(): 
        print(f'Couldn\'t find {x} for user 1')