# Book Recommender System in TensorFlow

In [172]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [173]:
# --- Run configuration ---------------------------------------------------
debug = False   # gates the optional inspection output in the `if debug:` cells
k = 900         # not referenced by any other cell visible in this notebook

# --- Training settings ---------------------------------------------------
learning_rate = 0.2              # step size handed to the optimizer
epochs, display_step = 500, 10   # passes over the data / intended logging interval
batch_size = 15                  # target number of matrix rows per mini-batch

In [174]:
# Queries for the review rows (with their timestamp) and the full book catalogue.
sql_reviews = 'SELECT user_id, book_id, rating, date_created FROM public."Reviews"'

sql_books = 'SELECT book_id FROM public."Books"'

# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook. Expected form:
#   postgresql://<user>:<password>@<host>:<port>/<database>
database_url = os.environ.get('ECE651_DATABASE_URL')
if not database_url:
    raise RuntimeError(
        'Set the ECE651_DATABASE_URL environment variable to the PostgreSQL '
        'connection URL before running this notebook.'
    )

engine = create_engine(database_url)

# pd.read_sql is the public entry point (pd.pandas.read_sql only worked by accident).
df = pd.read_sql(sql_reviews, engine)
df_books = pd.read_sql(sql_books, engine)

In [175]:
if debug:
    # Jupyter only echoes a cell's last *top-level* expression; a bare `df`
    # nested inside the `if` shows nothing, so print it explicitly.
    print(df)

In [176]:
# Keep only the catalogue books whose id never occurs in the reviews frame.
i1 = pd.Index(df_books['book_id'])
i2 = pd.Index(df['book_id'])
has_review = i1.isin(i2)
books = df_books[~has_review]

In [177]:
if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(books)

In [178]:
# Build one placeholder row per never-reviewed book so every catalogue book
# is represented in the ratings frame. user_id, rating and date_created are
# all filled with 0 for these rows.
rows, column = books.shape
empty_array = np.zeros((rows, 1))
unrated_books = pd.DataFrame(
    np.hstack((empty_array, books.values, empty_array, empty_array)),
    columns=['user_id', 'book_id', 'rating', 'date_created'],
)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same frame on every version.
df = pd.concat([df, unrated_books], ignore_index=True)
df.shape

(1052, 4)

In [179]:
if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(df)

### Reading the dataset and splitting it into a training set and a test set

In [180]:
# The review timestamp is split off as `y`; the three remaining columns get short names.
y = df.date_created
df = df.drop('date_created', axis=1)

df.columns = ['user', 'book', 'rating']

# 80/20 split. A fixed random_state makes the partition identical on every
# run, so a Restart & Run All reproduces the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

train_data = X_train
test_data = X_test

In [181]:
if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(train_data)

In [182]:
# Distinct user and book ids present in the ratings frame.
num_users = df['user'].nunique()
num_books = df['book'].nunique()

print(f"USERS: {num_users} BOOKS: {num_books}")

USERS: 149 BOOKS: 866


### Normalizing the three columns (user, book and rating) to [0, 1]

In [183]:
# Normalize in [0, 1]
# Record the smallest raw user id and the id span first; both are needed to
# map the scaled ids back to real ids when the results are written out.

u = df['user'].values.astype(float)

user_min = u.min()
user_range = u.max() - user_min

In [184]:
# Rescale the user ids to [0, 1]. The scaler wants a 2-D input, hence the
# single-column view of `u`.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(u[:, np.newaxis])
df_normalized = pd.DataFrame(x_scaled)
df['user'] = df_normalized[0]

In [185]:
if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(df)

In [186]:
# Record the smallest raw book id and the id span; both are used later to
# turn scaled book ids back into real ids.
b = df['book'].values.astype(float)

book_min = b.min()
book_range = b.max() - b.min()

# Rescale the book ids to [0, 1] (the scaler expects a 2-D column).
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(b.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['book'] = df_normalized

if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(df)

In [187]:
# Record the smallest raw rating and the rating span before scaling.
r = df['rating'].values.astype(float)

rating_min = r.min()
rating_range = r.max() - r.min()

# Rescale the ratings to [0, 1] (the scaler expects a 2-D column).
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['rating'] = df_normalized

if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(df)

### Convert the DataFrame into a user-item matrix

In [188]:
# One row per user, one column per book; (user, book) pairs without a rating become 0.
matrix = df.pivot(index='user', columns='book', values='rating').fillna(0)

In [189]:
# Echo the (users, books) dimensions of the pivoted matrix.
matrix.shape

(149, 866)

In [190]:
if debug:
    # Print every non-zero cell as "row column value". The loop bounds come
    # from the matrix itself, so the last user row and last book column are
    # inspected too (hard-coded 148 / 865 skipped them).
    n_users, n_books = matrix.shape
    for user in range(n_users):
        for book in range(n_books):
            if matrix.iloc[user, book] != 0.0:
                print(user, book, matrix.iloc[user, book])

### Users and items ordered as they are in matrix

In [191]:
# Remember the row and column labels before dropping down to a bare ndarray:
# position i in `users` / `books` corresponds to row / column i of `matrix`.
books = matrix.columns.tolist()
users = matrix.index.tolist()

matrix = matrix.values

print(f"Matrix shape: {matrix.shape}")

Matrix shape: (149, 866)


### Network Parameters

In [192]:
num_input = num_books   # width of the input/output layer: one unit per book
num_hidden_1 = 10       # units in the first hidden layer
num_hidden_2 = 5        # units in the second hidden layer (the latent code)

# One row of the user-item matrix per example.
X = tf.placeholder(tf.float64, [None, num_input])


def _random_variable(shape):
    """Create a float64 variable initialised with tf.random_normal."""
    return tf.Variable(tf.random_normal(shape, dtype=tf.float64))


# Layer sizes go num_input -> num_hidden_1 -> num_hidden_2 -> num_hidden_1 -> num_input.
weights = {
    name: _random_variable(shape)
    for name, shape in [
        ('encoder_h1', [num_input, num_hidden_1]),
        ('encoder_h2', [num_hidden_1, num_hidden_2]),
        ('decoder_h1', [num_hidden_2, num_hidden_1]),
        ('decoder_h2', [num_hidden_1, num_input]),
    ]
}

biases = {
    name: _random_variable(shape)
    for name, shape in [
        ('encoder_b1', [num_hidden_1]),
        ('encoder_b2', [num_hidden_2]),
        ('decoder_b1', [num_hidden_1]),
        ('decoder_b2', [num_input]),
    ]
}

### Building the encoder

In [193]:
def encoder(x):
    """Run `x` through the two encoder layers and return the latent code.

    Each layer is a matmul with the matching `encoder_h*` weight, plus the
    matching `encoder_b*` bias, followed by a sigmoid. The weights and biases
    are read from the module-level `weights` and `biases` dicts.
    """
    hidden = x
    for w_key, b_key in (('encoder_h1', 'encoder_b1'), ('encoder_h2', 'encoder_b2')):
        pre_activation = tf.add(tf.matmul(hidden, weights[w_key]), biases[b_key])
        hidden = tf.nn.sigmoid(pre_activation)
    return hidden

### Building the decoder

In [194]:
def decoder(x):
    """Run the latent code `x` through the two decoder layers.

    Each layer is a matmul with the matching `decoder_h*` weight, plus the
    matching `decoder_b*` bias, followed by a sigmoid. The weights and biases
    are read from the module-level `weights` and `biases` dicts.
    """
    hidden = x
    for w_key, b_key in (('decoder_h1', 'decoder_b1'), ('decoder_h2', 'decoder_b2')):
        pre_activation = tf.add(tf.matmul(hidden, weights[w_key]), biases[b_key])
        hidden = tf.nn.sigmoid(pre_activation)
    return hidden

### Construct model

In [195]:
# Chain the two halves: the encoder's output is fed straight into the decoder.
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

### Prediction

In [196]:
# The decoder output is the model's prediction.
y_pred = decoder_op

### Targets are the input data.

In [197]:
# The target tensor is the input placeholder itself.
y_true = X

### Define loss and optimizer, minimize the squared error

In [198]:
# Mean squared error between the input rows and their reconstruction.
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)

# RMSProp training op that minimises that loss.
rmsprop = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
optimizer = rmsprop.minimize(loss)

# Empty frame that the training cell fills with the model's output.
predictions = pd.DataFrame()

### Define evaluation metrics

In [199]:
# Two int32 placeholders of unspecified shape; `eval_x` is wired in as the
# labels and `eval_y` as the predictions of a streaming precision metric.
eval_x, eval_y = (tf.placeholder(tf.int32) for _ in range(2))
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

### Initialize the variables

In [200]:
# Ops that initialise the graph's global and local variables; the training
# cell runs both at the start of its session.
# NOTE(review): the local initializer is presumably needed for the
# tf.metrics.precision counters - confirm.
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

### Train the Model

In [201]:
with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    # Split the user rows into chunks of roughly `batch_size` rows. At least
    # one batch is always used, so a matrix with fewer rows than `batch_size`
    # no longer causes a zero-section split or a division by zero below.
    # The batches live in their own name; `matrix` itself is left untouched.
    num_batches = max(1, matrix.shape[0] // batch_size)
    batches = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in batches:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        # Mean of the per-batch losses for this epoch.
        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

    print("Predictions...")

    # Reconstruct every user's row with the trained network.
    preds = session.run(decoder_op, feed_dict={X: matrix})

# Everything below only needs the plain ndarray, so it runs after the session
# has been closed.

# Reshape the (users x books) output into one row per (user, book) pair.
# The frame is built directly from `preds`: DataFrame.append was removed in
# pandas 2.0, and appending to the previous `predictions` made this cell
# non-re-runnable.
predictions = pd.DataFrame(preds).stack().reset_index(name='rating')
predictions.columns = ['user', 'book', 'rating']

# Replace positional row/column numbers with the scaled ids they stand for.
predictions['user'] = predictions['user'].map(lambda value: users[value])
predictions['book'] = predictions['book'].map(lambda value: books[value])

print(f'Prediction Shape: {predictions.shape}')
if debug:
    print(predictions)

# All predictions are kept as recommendations; nothing is filtered out.
recs = predictions

Epoch: 1 Loss: 0.3485371702247196
Epoch: 2 Loss: 0.3477831648455726
Epoch: 3 Loss: 0.3465774688455794
Epoch: 4 Loss: 0.34465524223115707
Epoch: 5 Loss: 0.34160438511106705
Epoch: 6 Loss: 0.3367960618601905
Epoch: 7 Loss: 0.32930441697438556
Epoch: 8 Loss: 0.3179060187604692
Epoch: 9 Loss: 0.30171798004044426
Epoch: 10 Loss: 0.2821054723527696
Epoch: 11 Loss: 0.2608022838830948
Epoch: 12 Loss: 0.2330661184257931
Epoch: 13 Loss: 0.18925084173679352
Epoch: 14 Loss: 0.1267265205581983
Epoch: 15 Loss: 0.0661082412633631
Epoch: 16 Loss: 0.03705949294898245
Epoch: 17 Loss: 0.02280322007007069
Epoch: 18 Loss: 0.01500691877057155
Epoch: 19 Loss: 0.011917385790083144
Epoch: 20 Loss: 0.009301632901446687
Epoch: 21 Loss: 0.0057995235288722646
Epoch: 22 Loss: 0.003933653908057345
Epoch: 23 Loss: 0.002889335621148348
Epoch: 24 Loss: 0.0026650927029550076
Epoch: 25 Loss: 0.0026395761200951207
Epoch: 26 Loss: 0.0026274845262782443
Epoch: 27 Loss: 0.002617068640473816
Epoch: 28 Loss: 0.0020886011318200

Epoch: 232 Loss: 0.0010274108563963738
Epoch: 233 Loss: 0.0009983008785638958
Epoch: 234 Loss: 0.000992006641657402
Epoch: 235 Loss: 0.0010556866246689525
Epoch: 236 Loss: 0.0010301431385515672
Epoch: 237 Loss: 0.0010617154572779934
Epoch: 238 Loss: 0.0010134587549449254
Epoch: 239 Loss: 0.0010338466316978964
Epoch: 240 Loss: 0.0009897133018562777
Epoch: 241 Loss: 0.0009849896499266226
Epoch: 242 Loss: 0.001047258949256502
Epoch: 243 Loss: 0.001081418161953075
Epoch: 244 Loss: 0.001009420634040402
Epoch: 245 Loss: 0.0009764033052811606
Epoch: 246 Loss: 0.0009595119465504669
Epoch: 247 Loss: 0.0009657514423856305
Epoch: 248 Loss: 0.0009385020789017694
Epoch: 249 Loss: 0.0009168698937476923
Epoch: 250 Loss: 0.0009668883010615698
Epoch: 251 Loss: 0.0008982579877030932
Epoch: 252 Loss: 0.0008747180885014435
Epoch: 253 Loss: 0.0009455379913561046
Epoch: 254 Loss: 0.0009270661701318912
Epoch: 255 Loss: 0.0008310917958927652
Epoch: 256 Loss: 0.0008143720139438907
Epoch: 257 Loss: 0.0008009459

Epoch: 447 Loss: 0.00047351711772434
Epoch: 448 Loss: 0.0004927858260796509
Epoch: 449 Loss: 0.0005284090907985552
Epoch: 450 Loss: 0.0005374652084558167
Epoch: 451 Loss: 0.0005009089460751662
Epoch: 452 Loss: 0.00048301436375671375
Epoch: 453 Loss: 0.000463651054208943
Epoch: 454 Loss: 0.000522301233205427
Epoch: 455 Loss: 0.0006579363366149159
Epoch: 456 Loss: 0.0005389596190070733
Epoch: 457 Loss: 0.0005061087010997451
Epoch: 458 Loss: 0.0005563853887401314
Epoch: 459 Loss: 0.000554494483771527
Epoch: 460 Loss: 0.00047850931231449876
Epoch: 461 Loss: 0.000502747651706967
Epoch: 462 Loss: 0.0004825830110348761
Epoch: 463 Loss: 0.0004779156847184317
Epoch: 464 Loss: 0.0005539507449915012
Epoch: 465 Loss: 0.0004837093519098643
Epoch: 466 Loss: 0.0004656783017304002
Epoch: 467 Loss: 0.0004596986432766749
Epoch: 468 Loss: 0.0004950357364982159
Epoch: 469 Loss: 0.0005480162524488858
Epoch: 470 Loss: 0.0005025819765352127
Epoch: 471 Loss: 0.0004617454962701433
Epoch: 472 Loss: 0.0004680577

Write Out CSV
============

In [202]:
# Map the [0, 1]-scaled ids back to database ids.
# * The result is derived from `predictions` and bound to a new frame, so
#   re-running this cell does not denormalise already-denormalised values
#   and `predictions` itself stays scaled.
# * Ids are whole numbers, so the values are rounded; this removes
#   floating-point residue that would break the `recs['user'] == <id>`
#   comparisons used further down.
recs = predictions.assign(
    user=(predictions['user'] * user_range + user_min).round(),
    book=(predictions['book'] * book_range + book_min).round(),
)
print(f'User Min: {user_min}')

# Best-rated books first within each user; later cells rely on this order.
recs = recs.sort_values(['user', 'rating'], ascending=[True, False])
recs.to_csv('prediction.csv', sep=',', index=False, header=True)

User Min: 0.0


Write Out SQL INSERT
==================

In [210]:
# The number of recommendations to save for each user
save_X_recommendations = 100

# Script preamble: drops any previous table/indexes and recreates the table.
sql_preamble = '''
-- Created uing the Jupyter notebook

\\set ON_ERROR_STOP on
SET CLIENT_ENCODING TO 'utf8';

drop table if exists "Recommendations";
drop index if exists "Recommendations_pkey";
drop index if exists "Recommendations_book_id_fk";
drop index if exists "Recommendations_user_id_fk";

CREATE TABLE public."Recommendations" (
    user_id integer NOT NULL,
    book_id integer NOT NULL,
    likelihood numeric NOT NULL
);\n\n\n'''

# Script epilogue: constraints and grants, added after the rows are inserted.
sql_epilogue = '''
ALTER TABLE ONLY public."Recommendations"
    ADD CONSTRAINT "Recommendations_pkey" PRIMARY KEY (user_id, book_id);
ALTER TABLE ONLY public."Recommendations"
    ADD CONSTRAINT Recommendations_book_id_fk FOREIGN KEY (book_id) REFERENCES public."Books"(book_id);
ALTER TABLE ONLY public."Recommendations"
    ADD CONSTRAINT Recommendations_user_id_fk FOREIGN KEY (user_id) REFERENCES public."Users"(user_id);

GRANT ALL ON TABLE public."Recommendations" TO ece651_ml;
GRANT ALL ON TABLE public."Recommendations" TO ece651_web;
GRANT ALL ON TABLE public."Recommendations" TO ece651_scraper;

\\unset ON_ERROR_STOP
'''

insert_prefix = 'INSERT INTO public."Recommendations" (user_id, book_id, likelihood) VALUES '

with open('../../db/create_tables_insert_data/Recommendations.sql', 'w') as file:
    file.write(sql_preamble)

    # head() keeps the first `save_X_recommendations` rows of each user in
    # the frame's current row order; ids are rounded to integers for the SQL.
    top_recs = recs.groupby('user').head(save_X_recommendations)
    file.writelines(
        f'{insert_prefix}({int(round(row.user))}, {int(round(row.book))}, {row.rating});\n'
        for row in top_recs.itertuples()
    )

    file.write(sql_epilogue)

In [204]:
if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(recs.loc[recs['user'] == 2380])

In [205]:
# Number of recommendation rows held for user 2380 (row and column picked in one .loc call).
recs.loc[recs['user'] == 2380, 'book'].shape

(866,)

In [206]:
# Spot check: report each expected book id that is absent from the first 20
# rows of `recs` for user 2380.
expected_2380_book_ids = [382, 670, 662, 375, 677]
user_2380_top = recs.loc[recs['user'] == 2380].head(20)

top_2380_book_ids = user_2380_top['book'].values.round()
for book_id in expected_2380_book_ids:
    if book_id in top_2380_book_ids:
        continue
    print(f"Couldn't find {book_id} for user 2380")

Couldn't find 677 for user 2380


In [207]:
if debug:
    # A bare expression nested in an `if` is not echoed by Jupyter; print it explicitly.
    print(recs.loc[recs['user'] == 1])

In [208]:
# Number of recommendation rows held for user 1 (row and column picked in one .loc call).
recs.loc[recs['user'] == 1, 'book'].shape

(866,)

In [209]:
# Spot check: report each expected book id that is absent from the first 20
# rows of `recs` for user 1.
expected_1_book_ids = [1387, 1374, 1420, 1526, 1308, 1384, 1210, 1385]
user_1_top = recs.loc[recs['user'] == 1].head(20)

top_1_book_ids = user_1_top['book'].values.round()
for book_id in expected_1_book_ids:
    if book_id in top_1_book_ids:
        continue
    print(f"Couldn't find {book_id} for user 1")

Couldn't find 1374 for user 1
Couldn't find 1420 for user 1
Couldn't find 1526 for user 1
Couldn't find 1384 for user 1
Couldn't find 1385 for user 1
