In [77]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [78]:
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

In [79]:
k = 10

epochs = 10
display_step = 10

learning_rate = 0.3

batch_size = 250

sample_data = "./ratings.dat"

In [80]:
# Reading dataset

df = pd.read_csv(sample_data, sep='t', names=['user', 'item', 'rating', 'timestamp'], header=None)

y = df.timestamp
df = df.drop('timestamp', axis=1)

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

train_data = X_train
test_data = X_test

num_items = df.item.nunique()
num_users = df.user.nunique()

print("USERS: {} ITEMS: {}".format(num_users, num_items))

USERS: 6040 ITEMS: 3706


In [81]:
# Normalize in [0, 1]

r = df['rating'].values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['rating'] = df_normalized

In [82]:
# Convert DataFrame in user-item matrix

matrix = df.pivot(index='user', columns='item', values='rating')
matrix.fillna(0, inplace=True)

In [83]:
# Users and items ordered as they are in matrix

users = matrix.index.tolist()
items = matrix.columns.tolist()

matrix = matrix.values

print("Matrix shape: {}".format(matrix.shape))

# num_users = matrix.shape[0]
# num_items = matrix.shape[1]
# print("USERS: {} ITEMS: {}".format(num_users, num_items))

Matrix shape: (6040, 3706)


In [84]:
# Network Parameters

num_input = num_items   # num of items
num_hidden_1 = 10       # 1st layer num features
num_hidden_2 = 5        # 2nd layer num features (the latent dim)

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

In [85]:
# Building the encoder

def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

# Building the decoder

def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

# Construct model

encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction

y_pred = decoder_op

# Targets are the input data.

y_true = X

In [86]:
# Define loss and optimizer, minimize the squared error

loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

predictions = pd.DataFrame()

# Define evaluation metrics

eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

In [87]:
# Initialize the variables (i.e. assign their default value)

init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

In [88]:
with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    num_batches = int(matrix.shape[0] / batch_size)
    matrix = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

        # if i % display_step == 0 or i == 1:
        #     print('Step %i: Minibatch Loss: %f' % (i, l))

    print("Predictions...")

    matrix = np.concatenate(matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: matrix})

    # print(matrix)
    # print(preds)
    
    predictions = predictions.append(pd.DataFrame(preds))

    predictions = predictions.stack().reset_index(name='rating')
    predictions.columns = ['user', 'item', 'rating']
    predictions['user'] = predictions['user'].map(lambda value: users[value])
    predictions['item'] = predictions['item'].map(lambda value: items[value])

    print(predictions)

Epoch: 1 Loss: 0.35251562173167866
Epoch: 2 Loss: 0.34807778149843216
Epoch: 3 Loss: 0.3307875357568264
Epoch: 4 Loss: 0.2924945739408334
Epoch: 5 Loss: 0.27025464177131653
Epoch: 6 Loss: 0.21905607481797537
Epoch: 7 Loss: 0.10675932637726267
Epoch: 8 Loss: 0.026675402109200757
Epoch: 9 Loss: 0.01964284502901137
Epoch: 10 Loss: 0.019379176354656618
Predictions...
          user  item    rating
0            1     1  0.203235
1            1     2  0.051075
2            1     3  0.037214
3            1     4  0.021072
4            1     5  0.022445
5            1     6  0.091034
6            1     7  0.041232
7            1     8  0.012828
8            1     9  0.011032
9            1    10  0.077799
10           1    11  0.098319
11           1    12  0.012422
12           1    13  0.021075
13           1    14  0.008070
14           1    15  0.023940
15           1    16  0.054702
16           1    17  0.173982
17           1    18  0.013127
18           1    19  0.023543
19           1

In [89]:
    print("Filtering out items in training set")

    keys = ['user', 'item']
    i1 = predictions.set_index(keys).index
    i2 = df.set_index(keys).index

    recs = predictions[~i1.isin(i2)]
    recs = recs.sort_values(['user', 'rating'], ascending=[True, False])
    recs = recs.groupby('user').head(k)
    recs.to_csv('recs.tsv', sep='\t', index=False, header=False)

Filtering out items in training set


In [91]:
    # creare un vettore dove ci sono per ogni utente i suoi 10 movies

    test = test_data

    test = test.sort_values(['user', 'rating'], ascending=[True, False])

    #test = test.groupby('user').head(k) #.reset_index(drop=True)
    #test_list = test.as_matrix(columns=['item']).reshape((-1))
    #recs_list = recs.groupby('user').head(k).as_matrix(columns=['item']).reshape((-1))

    print("Evaluating...")

    p = 0.0
    for user in users[:10]:
        test_list = test[(test.user == user)].head(k).values.flatten()
        recs_list = recs[(recs.user == user)].head(k).values.flatten()

    #session.run(pre_op, feed_dict={eval_x: test_list, eval_y: recs_list})

        #pu = precision_score(test_list, recs_list, average='micro')
        #p += pu

        # print("Precision for user {}: {}".format(user, pu))
        # print("User test: {}".format(test_list))
        # print("User recs: {}".format(recs_list))

    #p /= len(users)

    # p = session.run(pre)
    # print("Precision@{}: {}".format(k, p))

    # print("test len: {} - recs len: {}".format(len(test_list), len(recs_list)))
    #
    # print("test list - type: {}".format(type(test_list)))
    # print(test_list)
    #
    # print("recs list - type: {}".format(type(recs_list)))
    # print(recs_list)

Evaluating...


RuntimeError: Attempted to use a closed Session.