In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

In [9]:
ratings_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/ratings.csv')
books_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/books.csv')
to_read_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/to_read.csv')
book_tags_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/book_tags.csv')
tags_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/tags.csv')

# Dataset

In [10]:
percentage = 1.7
ratings_df = ratings_df.head(int(len(ratings_df)*(percentage/100)))

In [11]:
books_df[books_df['book_id']==9439]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
9438,9439,35539,35539,1654509,17,316013315,9780316000000.0,Bob Spitz,2005.0,The Beatles: The Biography,...,8120,8464,390,205,314,1270,3020,3655,https://images.gr-assets.com/books/1327951066m...,https://images.gr-assets.com/books/1327951066s...


In [12]:
ratings_df

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
...,...,...,...
101595,344,27,3
101596,344,21,3
101597,344,25,3
101598,344,23,3


In [13]:
num_users = ratings_df['user_id'].unique()
print(len(num_users))
num_items = ratings_df['book_id'].unique()
print(len(num_items))

1823
5050


# Spotlight

Spotlight users and items ids should be consecutive. For that purpose in the cell below, the defaultdict function from the collections library is used to provide, the uid_map and iid_map. These variables contain consecutive numbers equal to the number of users and items respectively. These are necessary to create the interaction objects, which are the fundamental objects of the Spotlight platform, and are required in order to build a Spotlight recommender. 

Then I keep the reveresed mapped user and item ids, to take the real predicted item ids when i finish the process with the Spotlight model.

In [7]:
from collections import defaultdict
from itertools import count
uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["user_id"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["book_id"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


print("userId %d got uid %d" % (556, uid_map[556]))
print("movieId %d got iid %d" % (54001, iid_map[54001]))

userId 556 got uid 1157
movieId 54001 got iid 4477


Interactions refer, to the user-item interactions. Each interaction contains a user-item pair interaction, and it can be added a timestamp, and for explicit models, a rating.

After creating the Interactions object I split the data using the random_train_test_split function from the Spotlight library. I firstly split my dataset into train set, test set, validation set. 
Having in mind that a lightGBM that combines the knowledge extracted from a few baseline models, I split the training set into features and label sets, to use them to formulate the data in order to train the lightGBM model later.

In [8]:
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
from spotlight.factorization.implicit import ImplicitFactorizationModel
import time  


#pecify the ids of the users and items for all the pairs of user-item interactions
implicit_interaction = Interactions(user_ids=uids,
                                   item_ids=iids)


#lets initialise the seed, so that its repeatable and reproducible 
implicit_train_tmp, implicit_test = random_train_test_split(implicit_interaction, random_state=np.random.seed(42))
implicit_train, implicit_val = random_train_test_split(implicit_train_tmp, random_state=np.random.seed(42))
implicit_train_features, implicit_train_labels = random_train_test_split(implicit_train, random_state=np.random.seed(42))

In [9]:
print(implicit_train)
print(implicit_test)
print(implicit_val)
print(implicit_train_features)
print(implicit_train_labels)

<Interactions dataset (1157 users x 4477 items x 38248 interactions)>
<Interactions dataset (1157 users x 4477 items x 11953 interactions)>
<Interactions dataset (1157 users x 4477 items x 9563 interactions)>
<Interactions dataset (1157 users x 4477 items x 30598 interactions)>
<Interactions dataset (1157 users x 4477 items x 7650 interactions)>


Apart from a Spotlight model, an SVD model is trained, so the conversion of the Interaction objects to dataframes is thought as necessary, as the same data should be used for all the model, to provide fair comparisons.

In [10]:
def interactionToDataFrame(dataset):
    converted_users = []
    converted_books = []
    rating = []
    for i in range(len(dataset.user_ids)):
        converted_users.append(uid_rev_map[dataset.user_ids[i]])
        converted_books.append(iid_rev_map[dataset.item_ids[i]])
        rating.append(float(ratings_df['rating'][(ratings_df['book_id'] == converted_books[i]) & (
            ratings_df['user_id'] == converted_users[i])].values))
    df = pd.DataFrame({'user_id': converted_users,
                       'book_id': converted_books, 'rating': rating})
    return df

In [11]:
train_df = interactionToDataFrame(implicit_train)
val_df = interactionToDataFrame(implicit_val)
test_df = interactionToDataFrame(implicit_test)
train_features_df = interactionToDataFrame(implicit_train_features)
train_labels_df = interactionToDataFrame(implicit_train_labels)

## Implicit Factorization model

The model form the Spotlight, that is used is the Implicit Factorization model. This is a model used to address implicit feedback from the users. To make it clear, the implicit feedback in a movie platform could be the movie that a user watched but did not gave a rating. Given, that he/ she watched some movies, knowledge can be extracted about his/her preferences, without having ratings.

In [12]:
implicit_model = ImplicitFactorizationModel(n_iter=5)

current = time.time()

implicit_model.fit(implicit_train, verbose=True)

end = time.time()
diff = end - current
print('Training took %d second' % (diff))

Epoch 0: loss 0.8884793090820312
Epoch 1: loss 0.5257788654168447
Epoch 2: loss 0.4345852353175481
Epoch 3: loss 0.3761194217205048
Epoch 4: loss 0.33019956866900124
Training took 38 second


### Top - K

The idea is to provide the top-K(100) recommendations for each user and then evaluate the results using the NDCG evaluation metric. 

For this purpose on the cell below, the topK recommendations for a user are calculated.

The function below, topK_implicit, has been introduced on the lab of the course Recommender Systems.

In [13]:
import scipy.stats as st
k = 100
def topK_implicit(user, model, k=100):
  a= list()
  ranks = st.rankdata(-model.predict(user))   
  for iid in np.argwhere(ranks <= k):
    a.append(iid_rev_map[iid.item(0)])
  return a

rec_forUser = topK_implicit(0,implicit_model)
print(rec_forUser)

[26, 33, 301, 18, 27, 21, 2, 23, 24, 35, 287, 58, 111, 5, 8, 65, 45, 14, 55, 13, 1796, 867, 50, 87, 32, 28, 495, 11, 101, 344, 122, 9, 81, 60, 238, 570, 181, 119, 213, 407, 10, 171, 94, 516, 233, 115, 177, 4, 483, 100, 92, 401, 131, 662, 284, 378, 323, 860, 130, 78, 186, 150, 66, 114, 132, 22, 7, 75, 121, 272, 217, 80, 228, 43, 57, 273, 172, 125, 138, 29, 118, 178, 354, 108, 955, 64, 383, 127, 714, 38, 916, 225, 523, 40, 410, 44, 25, 170, 104, 67]


The topK recommendations for all users are needed, so the function below returns the topK recommendations for all the users, taking advantage of the function above.

In [14]:
def allUsers_topK_implicit(model):
    a = []
    for i in range(len(num_users)):
       a.append(topK_implicit(i,model))
    return a

### NDCG Score

In [15]:
allUsers_topK_imp = allUsers_topK_implicit(implicit_model)

To provide the NDCG value it is necessary to compute the relevance for each user. The relevance containes 0s and 1s, based on which of the topK recommended movies the user has actually watched. For every recommended movie that the user has actually watched, the value of 1 is assigned. 

The function below returns a list that contains a relevance list for each user.

In [16]:
def relevance_spotlight_implicit(allUsers_topK):
    relevance_implicit_full = []
    helper = []
    count = 0
    for i in range(1,len(num_users)+1):
        for j in range(len(allUsers_topK[count])):
            if allUsers_topK[count][j] in test_df['book_id'][test_df['user_id'] == i].values.tolist():
                helper.append(1)
            else:
                helper.append(0)
        relevance_implicit_full.append(helper)
        helper = []
        count += 1
    return relevance_implicit_full

The relevance_implicit_full variable contains the relevance list of each user for the implicit factorization model results.

In [17]:
relevance_implicit_full = relevance_spotlight_implicit(allUsers_topK_imp)

The dcg and ideal_dcg functions are taken form https://github.com/lezzago/LambdaMart/blob/master/lambdamart.py. They are necessary to compute the desired NDCG evaluation metric.

In [18]:
def dcg(scores):
    """
    Returns the DCG value of the list of scores.
    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order

    Returns
    -------
    DCG_val: int
        This is the value of the DCG on the given scores
    """
    return np.sum([
                    (np.power(2, scores[i]) - 1) / np.log2(i + 2)
                    for i in range(len(scores))
                ])

In [19]:
def ideal_dcg(scores):
    """
    Returns the Ideal DCG value of the list of scores.
    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order

    Returns
    -------
    Ideal_DCG_val: int
        This is the value of the Ideal DCG on the given scores
    """
    scores = [score for score in sorted(scores)[::-1]]
    return dcg(scores)

The function below, NDCG_forUser, given a relevance list, returns the NDCG value. In simpler words, it returns the NDCG value for a user.

In [20]:
def NDCG_forUser(relevance):
    return dcg(relevance) / ideal_dcg(relevance)

The function below, allUsers_ndcg, returns a list that contains the NDCG score for each user.

In [21]:
def allUsers_ndcg(relevance):
    allUsers_implicit_ndcg = []
    for i in range(len(num_users)):
        allUsers_implicit_ndcg.append(NDCG_forUser(relevance[i]))
    return allUsers_implicit_ndcg

In [22]:
allUsers_implicit_ndcg = allUsers_ndcg(relevance_implicit_full)

  


Fill the nan values with 0s.

In [23]:
allUsers_implicit_ndcg = pd.Series(allUsers_implicit_ndcg).fillna(0).tolist()

Compute the mean value of the NDCG scores.

In [24]:
np.array(allUsers_implicit_ndcg).mean()

0.1536967026264128

# SVD

On this section (Section 3), an SVD model is implemented, aiming to provide the top-K recommendations for each user and evaluate the results using the NDCG evaluation metric.

Pivot is used to have a matrix that has as rows the userIDs and as columns the movieIDs. The unrated, by a user, movies are assigned the value 0 instead of null. Pivot organizes a DataFrame by giving index and column values. It is helpful in the recommender systems approach, as it provides a dataframe in which, it can be seen directly any user-item rating.

In [25]:
df_book_features = train_df.pivot(
    index='user_id',
    columns='book_id',
    values='rating'
).fillna(0)

In [26]:
df_book_features

book_id,2,3,4,5,7,8,9,10,11,13,...,9966,9972,9974,9978,9981,9985,9988,9990,9991,9995
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1942,0.0,5.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1947,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SVD implementation

In [27]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(df_book_features)

## Predictions

To get the prediction of a rating of an item dj, we can calculate the dot product of the two vectors.
The sigma variable has just the values, so I have to convert to a diagonal matrix in orded to implement the matrix multiplication.

In [28]:
sigma = np.diag(sigma)
svd_predictions_score = np.dot(np.dot(U, sigma), Vt)
svd_predictions_score_df = pd.DataFrame(svd_predictions_score, columns = df_book_features.columns)


In [29]:
svd_predictions_score_df

book_id,2,3,4,5,7,8,9,10,11,13,...,9966,9972,9974,9978,9981,9985,9988,9990,9991,9995
0,-0.017455,-0.001515,0.050331,0.052745,-0.033695,0.037934,0.010390,0.026964,0.073122,-0.001999,...,0.001903,-0.000394,0.000056,-0.000907,0.000815,-1.754186e-05,-0.000661,0.000795,0.002423,-0.000368
1,-0.001614,0.000302,0.004419,-0.001005,0.007396,-0.000887,0.004430,0.001443,0.002486,-0.000718,...,-0.000247,-0.000332,0.000052,0.000014,0.000060,1.849593e-05,0.000086,-0.000098,-0.000225,0.000026
2,3.849454,0.112768,1.658002,1.557440,1.672078,1.979167,0.796537,1.328552,0.403837,2.029649,...,0.041932,0.185939,0.027752,0.063716,-0.009722,-1.030917e-04,0.018998,0.010440,0.019181,0.041334
3,-0.000626,-0.000102,0.002429,0.002598,-0.000845,0.002723,-0.000042,0.001276,0.002041,0.001074,...,0.000103,-0.000015,-0.000011,0.000013,0.000026,-3.098268e-07,-0.000003,0.000019,0.000100,-0.000034
4,-0.083200,0.034611,0.945283,0.463547,2.253105,1.216248,0.239510,0.247593,-0.367294,1.516744,...,0.023008,0.152818,0.008016,0.068222,-0.025327,2.370912e-03,0.053958,-0.006830,-0.008167,0.033255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140,0.251917,-0.022533,1.166826,1.033472,-0.352679,0.831270,0.612273,0.644032,2.238336,-0.131467,...,0.038015,0.048774,0.008908,-0.019945,0.001733,2.585383e-04,-0.017269,0.020149,0.027642,-0.002355
1141,0.032994,0.002253,-0.050669,-0.035706,0.023541,-0.031798,-0.010256,-0.024701,-0.053161,0.008626,...,-0.000460,0.005642,0.000374,0.000739,-0.001203,-5.239600e-05,0.000540,0.000189,-0.000779,0.001643
1142,0.358116,0.036171,1.310905,0.730774,0.686997,0.720426,0.554902,0.725293,0.530283,0.416280,...,-0.007361,-0.068892,0.005568,0.009801,0.024332,1.806628e-03,0.007684,-0.009410,0.004719,-0.004828
1143,1.806776,0.042165,0.666855,0.413391,0.383797,0.501376,0.508996,0.613926,0.383042,0.353137,...,-0.008535,-0.017707,0.008814,0.008885,0.007097,5.137072e-04,-0.007193,-0.004727,-0.015177,-0.004555


In [30]:
svd_predictions_score_df[svd_predictions_score_df.index == 1137]

book_id,2,3,4,5,7,8,9,10,11,13,...,9966,9972,9974,9978,9981,9985,9988,9990,9991,9995
1137,-0.110217,0.030743,0.669496,0.284043,0.215431,-0.082184,0.678284,0.298014,1.371991,-0.406187,...,-0.003757,0.018544,0.013231,-0.022132,0.006734,0.000996,-0.006257,0.009587,0.005257,0.01462


## Top-K recommendations

The function below returns a list that contains the top-100 recommendations for each user

In [31]:
def SVD_topK_first(score_df):
    topK_svd_rec = []
    c = 0
    for i in range(len(svd_predictions_score_df)):
        svd_h_df = pd.DataFrame({'book_id': score_df[score_df.index == i].columns,
                                 'score': score_df[score_df.index == i].values[0]})
        svd_h_df = svd_h_df.sort_values('score', ascending=False)
        svd_h_df = svd_h_df.head(100)
        topK_svd_rec.append(svd_h_df['book_id'].values)
    return topK_svd_rec

## NDCG Score

The function below returns the relevance list of a list of movies.

In [32]:
def SVD_relevance_first(score_df):
    svd_topK = SVD_topK_first(score_df)
    relevance_svd_full = []
    helper = []
    count = 0
    for i in range(1,len(svd_predictions_score_df)+1):
        for j in range(len(svd_topK[count])):
            if svd_topK[count][j] in test_df['book_id'][test_df['user_id'] == i].values.tolist():
                helper.append(1)
            else:
                helper.append(0)
        relevance_svd_full.append(helper)
        helper = []
        count += 1
    return relevance_svd_full

In [33]:
relevance_first = SVD_relevance_first(svd_predictions_score_df)

In [34]:
np.asarray(relevance_first).shape

(1145, 100)

In [35]:
n_f = NDCG_forUser(relevance_first[0])

  


In [36]:
n_f

nan

Below is the NDCG scores of each user

In [37]:
def allUsers_ndcg_svd(relevance):
    allUsers_implicit_ndcg = []
    for i in range(len(svd_predictions_score_df)):
        allUsers_implicit_ndcg.append(NDCG_forUser(relevance[i]))
    return allUsers_implicit_ndcg
all_ndcg_f = allUsers_ndcg_svd(relevance_first)

  


In [38]:
all_ndcg_f = pd.Series(all_ndcg_f).fillna(0).tolist()

In [39]:
np.asarray(all_ndcg_f).mean()

0.13970337091012314

# Feature engineering

##  Feature 1 (Implicit factorization model scores)

Train an implicit factorization model using the train_features dataset

In [40]:
f1_model = ImplicitFactorizationModel(n_iter = 5)

current = time.time()

f1_model.fit(implicit_train_features,verbose = True)

end = time.time()
diff = end - current
print('Training took %d second'%(diff))

Epoch 0: loss 0.9358363171418508
Epoch 1: loss 0.5963205330073833
Epoch 2: loss 0.4488706881801287
Epoch 3: loss 0.3914698469142119
Epoch 4: loss 0.32816403011480966
Training took 17 second


### Target Movies

In [59]:
def getImplicitTop():
    moviePerUser = []
    rev = []
    for user in range(len(num_users)):
        pred = f1_model.predict(user)
        #sort by score in descending order oscer
        predMovieId = np.argsort(-pred)
        #keep top 100
        predMovieId = predMovieId[:100]
        #reverse the mapping to get the original movieIDs
        for movie in predMovieId:
            rev.append(iid_rev_map[movie])
        moviePerUser.append(rev)
        rev = []
    return moviePerUser

In [60]:
target_positive = getImplicitTop()

Add negative samples to the target movies

In [42]:
len(train_df['user_id'].unique())

1145

In [62]:
import random
target_negative = []
for i in range(1,len(num_users)+1):
    #get 50 random movieIDs for each user
    target_negative.append(random.choices(train_features_df['book_id'].values, k = 50))

In [63]:
target_movies = []
for i in range(len(num_users)):
    target_movies.append(target_positive[i] + target_negative[i])

Map the target movies to pick the correct predictions based on the item id

In [64]:
def map_movies(target_movies):
    map_target_movies = []
    map_help = []
    for movies in target_movies:
        for movie in movies:
            map_help.append(iid_map[movie])
        map_target_movies.append(map_help)
        map_help = []
    return map_target_movies

Implicit Factorization model Scores

Train implicit on implicit_features_train! and take these scores.

In [65]:
def give_movies_implicit_scores(map_movies):
    implicit_scores = 0
    feature = []
    for i in range(len(num_users)):
        #for movies in map_target_movies:
        my_list = map_movies[i]
        my_array = np.array(my_list)
        implicit_scores = f1_model.predict(user_ids = i,item_ids = my_array)
        feature.append(implicit_scores)
    return feature

In [66]:
map_target_movies = map_movies(target_movies)
feature1 = give_movies_implicit_scores(map_target_movies)
feature1 = np.asarray(feature1)
feature1.shape

(1157, 150)

In [67]:
feature1[0]

array([ 9.516431  ,  9.101404  ,  8.972198  ,  8.943448  ,  8.937172  ,
        8.874731  ,  8.746431  ,  8.567189  ,  8.561559  ,  8.546211  ,
        8.46651   ,  8.422235  ,  8.383978  ,  8.265129  ,  8.2099085 ,
        8.204773  ,  8.2002125 ,  8.178185  ,  8.177673  ,  8.127718  ,
        8.113931  ,  8.06932   ,  8.061887  ,  8.018593  ,  7.9616294 ,
        7.9164124 ,  7.912512  ,  7.8891816 ,  7.841817  ,  7.8412337 ,
        7.8384867 ,  7.828294  ,  7.8015904 ,  7.736866  ,  7.646708  ,
        7.6425004 ,  7.636322  ,  7.5865817 ,  7.5842657 ,  7.541096  ,
        7.462632  ,  7.4609046 ,  7.4018383 ,  7.3510857 ,  7.341598  ,
        7.331588  ,  7.326401  ,  7.3144817 ,  7.305494  ,  7.2572136 ,
        7.2230725 ,  7.2216926 ,  7.175812  ,  7.1735387 ,  7.1717134 ,
        7.1619887 ,  7.10519   ,  7.0970535 ,  7.0966277 ,  7.0930104 ,
        7.078827  ,  7.0782866 ,  7.0771775 ,  7.068073  ,  7.0252495 ,
        7.0058913 ,  6.999884  ,  6.97893   ,  6.9419746 ,  6.92

## Feature 2 (Rank)

In [68]:
feature2 = []
for i in range(len(num_users)):
    feature2 = (st.rankdata(-f1_model.predict(i)))
    feature2 = feature2[map_target_movies]
    feature2.sort()

In [50]:
np.array(feature2).shape

(1157, 100)

In [69]:
feature2[0]

array([1.000e+00, 2.000e+00, 3.000e+00, 4.000e+00, 7.000e+00, 1.100e+01,
       1.300e+01, 1.400e+01, 1.700e+01, 1.800e+01, 2.000e+01, 2.000e+01,
       2.100e+01, 2.200e+01, 2.300e+01, 3.100e+01, 3.100e+01, 3.200e+01,
       3.500e+01, 3.800e+01, 3.900e+01, 3.900e+01, 4.000e+01, 4.000e+01,
       4.600e+01, 5.400e+01, 5.600e+01, 5.600e+01, 5.800e+01, 6.200e+01,
       7.300e+01, 8.100e+01, 8.200e+01, 8.300e+01, 8.700e+01, 8.700e+01,
       8.900e+01, 9.000e+01, 9.300e+01, 9.900e+01, 9.900e+01, 9.900e+01,
       1.000e+02, 1.040e+02, 1.100e+02, 1.110e+02, 1.190e+02, 1.230e+02,
       1.300e+02, 1.360e+02, 1.410e+02, 1.430e+02, 1.510e+02, 1.530e+02,
       1.560e+02, 1.580e+02, 1.580e+02, 1.620e+02, 1.620e+02, 1.630e+02,
       1.700e+02, 1.710e+02, 1.730e+02, 1.750e+02, 1.760e+02, 1.790e+02,
       1.850e+02, 1.870e+02, 1.910e+02, 1.940e+02, 1.970e+02, 2.070e+02,
       2.080e+02, 2.100e+02, 2.160e+02, 2.180e+02, 2.290e+02, 2.330e+02,
       2.440e+02, 2.450e+02, 2.450e+02, 2.520e+02, 

## Feature 3 (SVD scores)

SVD model Scores. Take advantage of the DataFrame containing the scores to extract the predicted scores for the desired movies.

In [51]:
df_movie_features1 = train_features_df.pivot(
    index='user_id',
    columns='book_id',
    values='rating'
).fillna(0)

In [52]:
U1, sigma1, Vt1 = svds(df_movie_features1)
sigma1 = np.diag(sigma1)
svd_predictions_score1 = np.dot(np.dot(U1, sigma1), Vt1)
svd_predictions_score_df1 = pd.DataFrame(svd_predictions_score1, columns = df_movie_features1.columns)

In [53]:
target_movies[0][2]

301

In [54]:
svd_predictions_score_df1[svd_predictions_score_df1.index == 0]

book_id,2,3,4,5,7,8,9,10,11,13,...,9966,9972,9974,9978,9981,9985,9988,9990,9991,9995
0,-0.003803,-0.000947,0.039469,0.034202,-0.031163,0.024672,0.004871,0.027775,0.053842,0.000344,...,0.001509,0.000897,4.2e-05,-0.000582,0.000632,1.9e-05,-0.000746,0.000456,0.001204,-0.000152


In [55]:
float(svd_predictions_score_df1[svd_predictions_score_df1.index == 0][target_movies[0][0]].values)

0.029890038382440833

In [56]:

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(svd_predictions_score_df1[svd_predictions_score_df1.index == 0])

book_id      2         3         4         5         7         8         9     \
0       -0.003803 -0.000947  0.039469  0.034202 -0.031163  0.024672  0.004871   

book_id      10        11        13        14        15        18        19    \
0        0.027775  0.053842  0.000344 -0.008217  0.007827 -0.018395 -0.024396   

book_id      21        22        23        24        25       26        27    \
0       -0.013431  0.035654 -0.013833 -0.000136  0.006278  0.02989 -0.009358   

book_id     28        29        32        33        35        36        37    \
0       -0.00059  0.002453  0.013451  0.044615  0.017398  0.006528 -0.005179   

book_id      38        39        40        41        42        43        44    \
0        0.032792 -0.010436  0.008961  0.000565  0.006298  0.005667  0.003016   

book_id      45        46        47        48        49        50       51    \
0        0.033224  0.013759  0.001376 -0.004219 -0.000799  0.014246 -0.00014   

book_id      52        53   

In [57]:
feature3 = []
feature3_user = []
count = 0
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        feature3_user.append(float(svd_predictions_score_df1[svd_predictions_score_df1.index == i][target_movies[i][j]].values))
    feature3.append(feature3_user)
    feature3_user = []

KeyError: 5194

In [None]:
type(feature3)

## Feature 4 (Genre)

In [None]:
def create_genre_feature(genre):
    feature = []
    genre_per_user = []
    for i in range(len(num_users)):
        for j in range(len(target_movies[i])):
            a =  movies_df[movies_df['movieId'] == target_movies[i][j]]['genres']
            a = a.tolist()
            a = str(a[0])
            a = a.split('|')
            if genre in a:
                genre_per_user.append(1)
            else:
                genre_per_user.append(0)
        feature.append(genre_per_user)
        genre_per_user = []
    return feature

In [None]:
feature4 = create_genre_feature('Comedy')

### Features 8-15 (genres)

In [None]:
feature8 = create_genre_feature('Adventure')
feature9 = create_genre_feature('Drama')
feature10 = create_genre_feature('Romance')
feature11 = create_genre_feature('Fantasy')
feature12 = create_genre_feature('Thriller')
feature13 = create_genre_feature('Animation')
feature14 = create_genre_feature('Children')
feature15 = create_genre_feature('Horror')

## Feature 5 (#Users watched each target movie)

In [87]:
number_views = []
number_view_per_movie = []
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        a = len(ratings_df[ratings_df['book_id'] == target_movies[i][j]]['user_id'])
        number_view_per_movie.append(a)
    number_views.append(number_view_per_movie)
    number_view_per_movie = []


In [88]:
feature5 = number_views

In [89]:
np.array(feature5).shape

(1157, 150)

## Feature 6 (New user)

In [None]:
len(ratings_df[ratings_df['user_id']==1]) > 10

In [None]:
new_users_binary = []
binary_per_user = []
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        a = len(ratings_df[ratings_df['user_id']==1]) > 10
        if a:
            binary_per_user.append(1)
        else:
            binary_per_user.append(0)
    new_users_binary.append(binary_per_user)
    binary_per_user = []

In [None]:
#rankings of old/new users

In [None]:
feature6 = new_users_binary

In [None]:
np.array(feature6).shape

## Feature 7(Embeddings) x32

In [None]:
emb = f1_model._net.item_embeddings

In [None]:
emb_features = []
helper = []
for i in range(32):
    for j in range(len(num_items)):
        helper.append(emb.weight[j][i].item())
    emb_features.append(helper)
    helper = []

In [None]:
a = []
b = []
feature7 = []
for i in range(32):
    for user in range(len(num_users)):
        for movie in map_target_movies[user]:
            a.append(emb_features[0][movie])
        b.append(a)
        a = []
    feature7.append(b)
    b = []

In [None]:
np.array(feature7).shape

In [None]:
np.array(feature7[0]).shape

## Features stack

In [None]:
'''features = np.stack((feature1,feature2,feature3,feature4,feature5,feature6,
                     feature7[0], feature7[1],feature7[2], feature7[3],
                     feature7[4], feature7[5],feature7[6], feature7[7],
                     feature7[8], feature7[9],feature7[10], feature7[11],
                     feature7[12], feature7[13],feature7[14], feature7[15],
                     feature7[16], feature7[17],feature7[18], feature7[19],
                     feature7[20], feature7[21],feature7[22], feature7[23],
                     feature7[24], feature7[25],feature7[26], feature7[27],
                     feature7[28], feature7[29],feature7[30], feature7[31],
                     feature8,feature9,feature10,feature11,feature12,
                    feature13,feature14,feature15))'''
features = np.stack((feature1,feature2,feature5,feature6,
                     feature7[0], feature7[1],feature7[2], feature7[3],
                     feature7[4], feature7[5],feature7[6], feature7[7],
                     feature7[8], feature7[9],feature7[10], feature7[11],
                     feature7[12], feature7[13],feature7[14], feature7[15],
                     feature7[16], feature7[17],feature7[18], feature7[19],
                     feature7[20], feature7[21],feature7[22], feature7[23],
                     feature7[24], feature7[25],feature7[26], feature7[27],
                     feature7[28], feature7[29],feature7[30], feature7[31]))

In [90]:
features = np.stack((feature1, feature2,feature5))

In [91]:
len(features)

3

## Labels

Below are the movies that each user watched

In [70]:
def watched_movies(df):
    watched = []
    #for i in range(len(num_users)):
    for u in uid_rev_map:
        watched.append(df[df["user_id"]==uid_rev_map.get(u)].groupby("book_id")["book_id"].count().index.tolist())  
    return watched 

In [71]:
#train_labels_df
#test_df
#val_df
train_watched = watched_movies(train_labels_df)
test_watched = watched_movies(test_df)
val_watched = watched_movies(val_df)

Creation of the target list. It containes 0s and 1s for the 100 recommended movies for each user. Based on the watched movies it assigns the values. If the user watched the recommended movie, then the value of the target is equal to 1, otherwise it is 0.

In [77]:
def create_target(watched):
    user_target = []
    target = []
    for i in range(len(num_users)):
        for target_movie in target_movies[i]:
            if target_movie in watched[i]:
                user_target.append(1)
            else:
                user_target.append(0)
        target.append(user_target)
        user_target = []
    return target

In [78]:
label_train = create_target(train_watched)
label_test = create_target(test_watched)
label_val = create_target(val_watched)
#y_train = y_test = y_val

## LightGBM model

In [92]:
k = len(target_movies[0])

X = np.array(features)
#X = X.reshape(len(num_users)*k,len(features))
X = X.reshape(len(features), len(num_users)*k)
X = np.transpose(X)
label_train = np.array(label_train)
label_test = np.array(label_test)
label_val = np.array(label_val)

In [93]:
y_train = label_train.reshape(len(num_users)*k,)
y_test = label_test.reshape(len(num_users)*k,)
y_val = label_val.reshape(len(num_users)*k,)

In [94]:
import lightgbm as lgb
param = {
    "task": "train",
    "num_leaves": 255,
    "min_data_in_leaf": 1,
    "min_sum_hessian_in_leaf": 100,
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [1, 5, 10, 20],
    "learning_rate": .1,
    "num_threads": 2
}
gbm = lgb.LGBMRanker()
gbm.set_params(**param)

LGBMRanker(metric='ndcg', min_data_in_leaf=1, min_sum_hessian_in_leaf=100,
           ndcg_eval_at=[1, 5, 10, 20], num_leaves=255, num_threads=2,
           objective='lambdarank', task='train')

target movies
train_df
feature1
feature2

X_train
X_test
X_val
y_train
y_test
y_val

In [95]:
X_train = X
X_test = X
X_val = X

In [96]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [97]:
query_train = [X_train.shape[0]]
query_val = [X_val.shape[0]]
query_test = [X_test.shape[0]]
group_train = []
group_val = []
print(query_train)
print(query_val)
print(query_test)
'''
for i in range(390):
    group_train.append(100)
group_train.append(40)
group_val.append(60)
for i in range(97):
    group_val.append(100)'''

for i in range(len(num_users)):
    group_train.append(k)
    group_val.append(k)


[173550]
[173550]
[173550]


In [98]:
gbm.fit(X_train, y_train, group=group_train,
        eval_set=[(X_val, y_val)], eval_group=[group_val],
        eval_at=[1, 3, 5, 10], early_stopping_rounds=50, eval_metric='ndcg')
#no early stopping

[1]	valid_0's ndcg@1: 0.297321	valid_0's ndcg@3: 0.297532	valid_0's ndcg@5: 0.300917	valid_0's ndcg@10: 0.319767
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.284356	valid_0's ndcg@3: 0.2887	valid_0's ndcg@5: 0.297721	valid_0's ndcg@10: 0.319024
[3]	valid_0's ndcg@1: 0.288678	valid_0's ndcg@3: 0.292544	valid_0's ndcg@5: 0.294971	valid_0's ndcg@10: 0.31696
[4]	valid_0's ndcg@1: 0.293863	valid_0's ndcg@3: 0.297492	valid_0's ndcg@5: 0.302732	valid_0's ndcg@10: 0.322549
[5]	valid_0's ndcg@1: 0.286949	valid_0's ndcg@3: 0.297337	valid_0's ndcg@5: 0.30322	valid_0's ndcg@10: 0.323135
[6]	valid_0's ndcg@1: 0.28522	valid_0's ndcg@3: 0.294137	valid_0's ndcg@5: 0.301629	valid_0's ndcg@10: 0.321857
[7]	valid_0's ndcg@1: 0.288678	valid_0's ndcg@3: 0.293817	valid_0's ndcg@5: 0.300199	valid_0's ndcg@10: 0.321374
[8]	valid_0's ndcg@1: 0.290406	valid_0's ndcg@3: 0.295197	valid_0's ndcg@5: 0.301638	valid_0's ndcg@10: 0.322753
[9]	valid_0's ndcg@1: 0.292135	valid_0's

LGBMRanker(metric='ndcg', min_data_in_leaf=1, min_sum_hessian_in_leaf=100,
           ndcg_eval_at=[1, 5, 10, 20], num_leaves=255, num_threads=2,
           objective='lambdarank', task='train')

In [99]:
gbm.best_iteration_

19

In [100]:
test_pred = gbm.predict(X_test)
test_pred

array([ 0.20982217,  0.48232015,  0.22800856, ..., -1.16299677,
       -1.20102401, -1.20102401])

## NDCG Score

In [115]:
p = 0
m = 150
lgbm_pred_per_user = []
for i in range(len(num_users)):
    lgbm_pred_per_user.append(test_pred[p:m])
    p += 150
    m += 150

### NDCG IFM on this data

In [None]:
np.array(feature7[0][609]).shape

In [None]:
feature1.shape

In [None]:
lgbm_NDCG = []
IFM_NDCG = []
#SVD_NDCG = []
for i in range(len(num_users)):
    excel_df = pd.DataFrame({'movie_id': target_movies[i], 'model1_score': feature1[i],
                             'rank': feature2[i], '#users_watched': feature5[i],
                             'new_user': feature6[i],
                             'emedding1': feature7[0][i], 'emedding2': feature7[1][i], 'emedding3': feature7[2][i],
                             'emedding4': feature7[3][i], 'emedding5': feature7[4][i], 'emedding6': feature7[5][i],
                             'emedding7': feature7[6][i], 'emedding8': feature7[7][i], 'emedding9': feature7[8][i],
                             'emedding10': feature7[9][i], 'emedding11': feature7[10][i], 'emedding12': feature7[11][i],
                             'emedding13': feature7[12][i], 'emedding14': feature7[13][i], 'emedding15': feature7[14][i],
                             'emedding16': feature7[15][i], 'emedding17': feature7[16][i], 'emedding18': feature7[17][i],
                             'emedding19': feature7[18][i], 'emedding20': feature7[19][i], 'emedding21': feature7[20][i],
                             'emedding22': feature7[21][i], 'emedding23': feature7[22][i], 'emedding24': feature7[23][i],
                             'emedding25': feature7[24][i], 'emedding26': feature7[25][i], 'emedding27': feature7[26][i],
                             'emedding28': feature7[27][i], 'emedding29': feature7[28][i], 'emedding30': feature7[29][i],
                             'emedding31': feature7[30][i], 'emedding32': feature7[31][i],
                             'train_label': label_train[i], 'test_label': label_test[i],
                             'y_val': label_val[i], 'LGBM Score': lgbm_pred_per_user[i]})
    excel_df = excel_df.sort_values('LGBM Score', ascending=False)
    relevance_lgbm_user = excel_df['test_label'].values.tolist()
    lgbm_NDCG.append(NDCG_forUser(relevance_lgbm_user))
    excel_df = excel_df.sort_values('model1_score', ascending=False)
    ifm_relevance = excel_df['test_label'].values.tolist()
    IFM_NDCG.append(NDCG_forUser(ifm_relevance))
    #excel_df = excel_df.sort_values('model2_score', ascending=False)
    #svd_relevance_h = excel_df['test_label'].values.tolist()
    #SVD_NDCG.append(NDCG_forUser(svd_relevance_h))

In [116]:
np.array(lgbm_pred_per_user).shape

(1157, 150)

In [117]:
lgbm_NDCG = []
IFM_NDCG = []
#SVD_NDCG = []
for i in range(len(num_users)):
    excel_df = pd.DataFrame({'movie_id': target_movies[i], 'model1_score': feature1[i],
                             'rank': feature2[i], '#users_watched': feature5[i],
                             'train_label': label_train[i], 'test_label': label_test[i],
                             'y_val': label_val[i], 'LGBM Score': lgbm_pred_per_user[i]})
    excel_df = excel_df.sort_values('LGBM Score', ascending=False)
    relevance_lgbm_user = excel_df['test_label'].values.tolist()
    lgbm_NDCG.append(NDCG_forUser(relevance_lgbm_user))
    excel_df = excel_df.sort_values('model1_score', ascending=False)
    ifm_relevance = excel_df['test_label'].values.tolist()
    IFM_NDCG.append(NDCG_forUser(ifm_relevance))
    #excel_df = excel_df.sort_values('model2_score', ascending=False)
    #svd_relevance_h = excel_df['test_label'].values.tolist()
    #SVD_NDCG.append(NDCG_forUser(svd_relevance_h))

  


In [118]:
lgbm_NDCG = pd.Series(lgbm_NDCG).fillna(0).tolist()
IFM_NDCG = pd.Series(IFM_NDCG).fillna(0).tolist()
#SVD_NDCG = pd.Series(SVD_NDCG).fillna(0).tolist()

## NDCG Comparison

In [119]:
allUsers_implicit_ndcg[0]

0.0

In [120]:
ndcg_df = pd.DataFrame({'Implicit FM': allUsers_implicit_ndcg, 'LightGBM': lgbm_NDCG, 'IFM_this': IFM_NDCG})

In [121]:
ndcg_df

Unnamed: 0,Implicit FM,LightGBM,IFM_this
0,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000
2,0.000000,0.642889,0.434471
3,0.000000,0.000000,0.000000
4,0.000000,0.351835,0.273095
...,...,...,...
1152,0.000000,0.000000,0.000000
1153,0.000000,0.180031,0.189200
1154,0.240540,0.000000,0.000000
1155,0.000000,0.203795,0.178104


In [122]:
ndcg_df['Implicit FM'].mean()

0.15369670262641288

In [123]:
lgbm_mean = ndcg_df['LightGBM'].mean()
lgbm_mean

0.2808607274416536

In [124]:
ifm_mean = ndcg_df['IFM_this'].mean()
ifm_mean

0.2869945347978676

In [None]:
#svd_mean = ndcg_df['SVD_this'].mean()
#svd_mean

## Visualization

## Excel-Like

In [None]:
excel1_df = pd.DataFrame({'movie_id': target_movies[0],'model1_score':feature1[0], 'model2_score':feature3[0],'is_comedy':feature4[0],
                            '#users_watched':feature5[0], 'train_label':label_train[0], 'test_label':label_test[0],
                            'label_val':label_val[0],'LGBM Score':lgbm_pred_per_user[0]})

In [None]:
excel1_df.head(10)

In [None]:
excel1_df = excel1_df.sort_values('LGBM Score',ascending = False)
excel1_df.head(10)

In [None]:
relevance_lgbm_firstUser = excel1_df['test_label'].values.tolist()

In [None]:
L1GBM_ndcg = NDCG_forUser(relevance_lgbm_firstUser)
L1GBM_ndcg