In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

In [2]:
ratings_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/ratings.csv')
books_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/books.csv')
to_read_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/to_read.csv')
book_tags_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/book_tags.csv')
tags_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/tags.csv')

# Dataset

In [3]:
percentage = 1.7
ratings_df = ratings_df.head(int(len(ratings_df)*(percentage/100)))

In [4]:
books_df[books_df['book_id']==9439]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
9438,9439,35539,35539,1654509,17,316013315,9780316000000.0,Bob Spitz,2005.0,The Beatles: The Biography,...,8120,8464,390,205,314,1270,3020,3655,https://images.gr-assets.com/books/1327951066m...,https://images.gr-assets.com/books/1327951066s...


In [5]:
ratings_df

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
...,...,...,...
101595,344,27,3
101596,344,21,3
101597,344,25,3
101598,344,23,3


In [6]:
num_users = ratings_df['user_id'].unique()
print(len(num_users))
num_items = ratings_df['book_id'].unique()
print(len(num_items))

1823
5050


# Spotlight

Spotlight users and items ids should be consecutive. For that purpose in the cell below, the defaultdict function from the collections library is used to provide, the uid_map and iid_map. These variables contain consecutive numbers equal to the number of users and items respectively. These are necessary to create the interaction objects, which are the fundamental objects of the Spotlight platform, and are required in order to build a Spotlight recommender. 

Then I keep the reveresed mapped user and item ids, to take the real predicted item ids when i finish the process with the Spotlight model.

In [7]:
from collections import defaultdict
from itertools import count
uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["user_id"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["book_id"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


print("userId %d got uid %d" % (556, uid_map[556]))
print("movieId %d got iid %d" % (54001, iid_map[54001]))

userId 556 got uid 1823
movieId 54001 got iid 5050


Interactions refer, to the user-item interactions. Each interaction contains a user-item pair interaction, and it can be added a timestamp, and for explicit models, a rating.

After creating the Interactions object I split the data using the random_train_test_split function from the Spotlight library. I firstly split my dataset into train set, test set, validation set. 
Having in mind that a lightGBM that combines the knowledge extracted from a few baseline models, I split the training set into features and label sets, to use them to formulate the data in order to train the lightGBM model later.

In [8]:
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
from spotlight.factorization.implicit import ImplicitFactorizationModel
import time  


#pecify the ids of the users and items for all the pairs of user-item interactions
implicit_interaction = Interactions(user_ids=uids,
                                   item_ids=iids)


#lets initialise the seed, so that its repeatable and reproducible 
implicit_train_tmp, implicit_test = random_train_test_split(implicit_interaction, random_state=np.random.seed(42))
implicit_train, implicit_val = random_train_test_split(implicit_train_tmp, random_state=np.random.seed(42))
implicit_train_features, implicit_train_labels = random_train_test_split(implicit_train, random_state=np.random.seed(42))

In [9]:
print(implicit_train)
print(implicit_test)
print(implicit_val)
print(implicit_train_features)
print(implicit_train_labels)

<Interactions dataset (1823 users x 5050 items x 65024 interactions)>
<Interactions dataset (1823 users x 5050 items x 20320 interactions)>
<Interactions dataset (1823 users x 5050 items x 16256 interactions)>
<Interactions dataset (1823 users x 5050 items x 52019 interactions)>
<Interactions dataset (1823 users x 5050 items x 13005 interactions)>


Apart from a Spotlight model, an SVD model is trained, so the conversion of the Interaction objects to dataframes is thought as necessary, as the same data should be used for all the model, to provide fair comparisons.

In [10]:
def interactionToDataFrame(dataset):
    converted_users = []
    converted_books = []
    rating = []
    for i in range(len(dataset.user_ids)):
        converted_users.append(uid_rev_map[dataset.user_ids[i]])
        converted_books.append(iid_rev_map[dataset.item_ids[i]])
        rating.append(float(ratings_df['rating'][(ratings_df['book_id'] == converted_books[i]) & (
            ratings_df['user_id'] == converted_users[i])].values))
    df = pd.DataFrame({'user_id': converted_users,
                       'book_id': converted_books, 'rating': rating})
    return df

In [11]:
train_df = interactionToDataFrame(implicit_train)
val_df = interactionToDataFrame(implicit_val)
test_df = interactionToDataFrame(implicit_test)
train_features_df = interactionToDataFrame(implicit_train_features)
train_labels_df = interactionToDataFrame(implicit_train_labels)

## Implicit Factorization model

The model form the Spotlight, that is used is the Implicit Factorization model. This is a model used to address implicit feedback from the users. To make it clear, the implicit feedback in a movie platform could be the movie that a user watched but did not gave a rating. Given, that he/ she watched some movies, knowledge can be extracted about his/her preferences, without having ratings.

In [12]:
implicit_model = ImplicitFactorizationModel(n_iter=5)

current = time.time()

implicit_model.fit(implicit_train, verbose=True)

end = time.time()
diff = end - current
print('Training took %d second' % (diff))

Epoch 0: loss 0.7945170504605676
Epoch 1: loss 0.46438092685590576
Epoch 2: loss 0.3951647238233897
Epoch 3: loss 0.3419982120981367
Epoch 4: loss 0.29941024103267927
Training took 23 second


### Top - K

The idea is to provide the top-K(100) recommendations for each user and then evaluate the results using the NDCG evaluation metric. 

For this purpose on the cell below, the topK recommendations for a user are calculated.

The function below, topK_implicit, has been introduced on the lab of the course Recommender Systems.

In [13]:
import scipy.stats as st
k = 100
def topK_implicit(user, model, k=100):
  a= list()
  ranks = st.rankdata(-model.predict(user))   
  for iid in np.argwhere(ranks <= k):
    a.append(iid_rev_map[iid.item(0)])
  return a

rec_forUser = topK_implicit(0,implicit_model)
print(rec_forUser)

[26, 33, 301, 18, 27, 21, 2, 23, 24, 35, 287, 58, 111, 5, 8, 65, 45, 325, 14, 55, 36, 102, 13, 268, 50, 87, 32, 28, 42, 495, 11, 658, 101, 344, 372, 60, 238, 119, 587, 407, 10, 94, 516, 115, 4, 483, 709, 100, 306, 92, 148, 131, 662, 284, 378, 323, 860, 948, 136, 150, 66, 236, 114, 22, 75, 162, 121, 272, 217, 43, 57, 125, 155, 160, 530, 138, 154, 178, 354, 468, 117, 383, 19, 211, 1135, 38, 916, 485, 225, 2508, 620, 523, 40, 116, 410, 457, 62, 25, 46, 67]


The topK recommendations for all users are needed, so the function below returns the topK recommendations for all the users, taking advantage of the function above.

In [14]:
def allUsers_topK_implicit(model):
    a = []
    for i in range(len(num_users)):
       a.append(topK_implicit(i,model))
    return a

### NDCG Score

In [15]:
allUsers_topK_imp = allUsers_topK_implicit(implicit_model)

To provide the NDCG value it is necessary to compute the relevance for each user. The relevance containes 0s and 1s, based on which of the topK recommended movies the user has actually watched. For every recommended movie that the user has actually watched, the value of 1 is assigned. 

The function below returns a list that contains a relevance list for each user.

In [16]:
def relevance_spotlight_implicit(allUsers_topK):
    relevance_implicit_full = []
    helper = []
    count = 0
    for i in range(1,len(num_users)+1):
        for j in range(len(allUsers_topK[count])):
            if allUsers_topK[count][j] in test_df['book_id'][test_df['user_id'] == i].values.tolist():
                helper.append(1)
            else:
                helper.append(0)
        relevance_implicit_full.append(helper)
        helper = []
        count += 1
    return relevance_implicit_full

The relevance_implicit_full variable contains the relevance list of each user for the implicit factorization model results.

In [17]:
relevance_implicit_full = relevance_spotlight_implicit(allUsers_topK_imp)

The dcg and ideal_dcg functions are taken form https://github.com/lezzago/LambdaMart/blob/master/lambdamart.py. They are necessary to compute the desired NDCG evaluation metric.

In [18]:
def dcg(scores):
    """
    Returns the DCG value of the list of scores.
    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order

    Returns
    -------
    DCG_val: int
        This is the value of the DCG on the given scores
    """
    return np.sum([
                    (np.power(2, scores[i]) - 1) / np.log2(i + 2)
                    for i in range(len(scores))
                ])

In [19]:
def ideal_dcg(scores):
    """
    Returns the Ideal DCG value of the list of scores.
    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order

    Returns
    -------
    Ideal_DCG_val: int
        This is the value of the Ideal DCG on the given scores
    """
    scores = [score for score in sorted(scores)[::-1]]
    return dcg(scores)

The function below, NDCG_forUser, given a relevance list, returns the NDCG value. In simpler words, it returns the NDCG value for a user.

In [20]:
def NDCG_forUser(relevance):
    return dcg(relevance) / ideal_dcg(relevance)

The function below, allUsers_ndcg, returns a list that contains the NDCG score for each user.

In [21]:
def allUsers_ndcg(relevance):
    allUsers_implicit_ndcg = []
    for i in range(len(num_users)):
        allUsers_implicit_ndcg.append(NDCG_forUser(relevance[i]))
    return allUsers_implicit_ndcg

In [22]:
allUsers_implicit_ndcg = allUsers_ndcg(relevance_implicit_full)

  


Fill the nan values with 0s.

In [23]:
allUsers_implicit_ndcg = pd.Series(allUsers_implicit_ndcg).fillna(0).tolist()

Compute the mean value of the NDCG scores.

In [24]:
np.array(allUsers_implicit_ndcg).mean()

0.16613371587743056

# SVD

On this section (Section 3), an SVD model is implemented, aiming to provide the top-K recommendations for each user and evaluate the results using the NDCG evaluation metric.

Pivot is used to have a matrix that has as rows the userIDs and as columns the movieIDs. The unrated, by a user, movies are assigned the value 0 instead of null. Pivot organizes a DataFrame by giving index and column values. It is helpful in the recommender systems approach, as it provides a dataframe in which, it can be seen directly any user-item rating.

In [25]:
df_book_features = train_df.pivot(
    index='user_id',
    columns='book_id',
    values='rating'
).fillna(0)

In [26]:
df_book_features

book_id,2,3,4,5,7,8,9,10,11,13,...,9978,9981,9985,9988,9990,9991,9995,9997,9998,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,0.0,4.0,0.0,4.0,0.0,5.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1.0,5.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SVD implementation

In [27]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(df_book_features)

## Predictions

To get the prediction of a rating of an item dj, we can calculate the dot product of the two vectors.
The sigma variable has just the values, so I have to convert to a diagonal matrix in orded to implement the matrix multiplication.

In [28]:
sigma = np.diag(sigma)
svd_predictions_score = np.dot(np.dot(U, sigma), Vt)
svd_predictions_score_df = pd.DataFrame(svd_predictions_score, columns = df_book_features.columns)


In [29]:
svd_predictions_score_df

book_id,2,3,4,5,7,8,9,10,11,13,...,9978,9981,9985,9988,9990,9991,9995,9997,9998,10000
0,-0.010878,0.001699,0.037315,0.054154,-0.032385,0.054848,-0.002643,0.009269,0.100840,0.005271,...,-0.001585,-0.000658,-0.000318,-0.000725,0.002243,0.000623,0.000637,-0.000238,-0.000024,-0.000294
1,0.167729,0.021259,0.545071,0.272336,0.034942,0.238378,0.324955,0.377873,0.483631,-0.026733,...,-0.002831,0.003401,0.000510,0.001778,0.000996,0.000791,-0.000491,-0.001575,-0.000390,-0.001601
2,-0.019510,0.019100,0.064092,0.205953,0.164346,0.192726,-0.024577,0.111922,0.085094,0.174809,...,0.001524,0.001337,-0.000093,-0.001377,0.003923,0.006413,0.004705,0.001990,0.000771,-0.001000
3,3.394401,0.188196,2.748498,2.588143,1.540054,2.646761,1.110343,2.283996,1.590742,1.917589,...,0.051450,0.013929,0.003920,0.019847,0.005607,0.051331,0.025648,0.003715,0.001794,0.024131
4,0.673281,0.043852,-0.180988,0.224068,2.199936,0.823216,0.095388,-0.219599,-0.276868,1.728877,...,0.021184,-0.010471,0.001762,0.002209,0.025505,0.002890,0.011805,0.010243,0.003392,0.014009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,1.070161,0.070405,0.431227,0.276808,0.204574,0.145199,0.510457,0.543501,0.415747,-0.077417,...,0.001516,0.006850,0.001818,-0.001997,-0.007713,-0.000311,-0.000278,-0.002220,-0.001023,0.002505
1794,2.938096,0.118338,1.803317,0.793223,1.522085,0.969640,1.554006,1.435542,0.807817,0.828302,...,0.029257,0.011112,0.006825,0.017814,-0.015756,-0.003172,-0.004750,-0.003475,-0.001652,0.020018
1795,0.436334,0.006816,1.308374,1.143837,0.207157,1.474781,0.252979,0.404460,1.603133,0.866364,...,-0.003908,-0.013338,-0.003226,0.005036,0.036836,0.012052,0.008938,-0.002934,0.000049,0.007415
1796,-0.002591,0.001616,0.015516,0.004805,-0.002842,0.003602,0.019717,0.007145,0.045343,-0.014538,...,-0.001130,-0.000070,-0.000044,-0.000427,0.000612,-0.000513,-0.000127,-0.000195,-0.000058,-0.000322


In [30]:
svd_predictions_score_df[svd_predictions_score_df.index == 1137]

book_id,2,3,4,5,7,8,9,10,11,13,...,9978,9981,9985,9988,9990,9991,9995,9997,9998,10000
1137,0.039832,0.091497,0.977666,0.466906,-0.25592,0.26179,0.941213,0.684707,2.024352,-0.711186,...,-0.043803,0.004107,-0.001139,-0.016291,0.018851,-0.013582,-0.003162,-0.008037,-0.002353,-0.015503


## Top-K recommendations

The function below returns a list that contains the top-100 recommendations for each user

In [31]:
def SVD_topK_first(score_df):
    topK_svd_rec = []
    c = 0
    for i in range(len(svd_predictions_score_df)):
        svd_h_df = pd.DataFrame({'book_id': score_df[score_df.index == i].columns,
                                 'score': score_df[score_df.index == i].values[0]})
        svd_h_df = svd_h_df.sort_values('score', ascending=False)
        svd_h_df = svd_h_df.head(100)
        topK_svd_rec.append(svd_h_df['book_id'].values)
    return topK_svd_rec

## NDCG Score

The function below returns the relevance list of a list of movies.

In [32]:
def SVD_relevance_first(score_df):
    svd_topK = SVD_topK_first(score_df)
    relevance_svd_full = []
    helper = []
    count = 0
    for i in range(1,len(svd_predictions_score_df)+1):
        for j in range(len(svd_topK[count])):
            if svd_topK[count][j] in test_df['book_id'][test_df['user_id'] == i].values.tolist():
                helper.append(1)
            else:
                helper.append(0)
        relevance_svd_full.append(helper)
        helper = []
        count += 1
    return relevance_svd_full

In [33]:
relevance_first = SVD_relevance_first(svd_predictions_score_df)

In [34]:
np.asarray(relevance_first).shape

(1798, 100)

In [35]:
n_f = NDCG_forUser(relevance_first[0])

  


In [36]:
n_f

nan

Below is the NDCG scores of each user

In [37]:
def allUsers_ndcg_svd(relevance):
    allUsers_implicit_ndcg = []
    for i in range(len(svd_predictions_score_df)):
        allUsers_implicit_ndcg.append(NDCG_forUser(relevance[i]))
    return allUsers_implicit_ndcg
all_ndcg_f = allUsers_ndcg_svd(relevance_first)

  


In [38]:
all_ndcg_f = pd.Series(all_ndcg_f).fillna(0).tolist()

In [39]:
np.asarray(all_ndcg_f).mean()

0.15886791279304913

# Feature engineering

##  Feature 1 (Implicit factorization model scores)

Train an implicit factorization model using the train_features dataset

In [135]:
f1_model = ImplicitFactorizationModel(n_iter = 5)

current = time.time()

f1_model.fit(implicit_train_features,verbose = True)

end = time.time()
diff = end - current
print('Training took %d second'%(diff))

Epoch 0: loss 0.8525460751033297
Epoch 1: loss 0.49117058311022965
Epoch 2: loss 0.4112948196775773
Epoch 3: loss 0.35288794455575007
Epoch 4: loss 0.3056439611257291
Training took 19 second


### Target Movies

In [136]:
def getImplicitTop():
    moviePerUser = []
    rev = []
    for user in range(len(num_users)):
        pred = f1_model.predict(user)
        #sort by score in descending order oscer
        predMovieId = np.argsort(-pred)
        #keep top 100
        predMovieId = predMovieId[:100]
        #reverse the mapping to get the original movieIDs
        for movie in predMovieId:
            rev.append(iid_rev_map[movie])
        moviePerUser.append(rev)
        rev = []
    return moviePerUser

In [137]:
target_positive = getImplicitTop()

Add negative samples to the target movies

In [138]:
len(train_df['user_id'].unique())

1798

In [139]:
import random
target_negative = []
for i in range(1,len(num_users)+1):
    #get 50 random movieIDs for each user
    target_negative.append(random.choices(train_features_df['book_id'].values, k = 50))

In [140]:
target_movies = []
for i in range(len(num_users)):
    target_movies.append(target_positive[i] + target_negative[i])

Map the target movies to pick the correct predictions based on the item id

In [104]:
def map_movies(target_movies):
    map_target_movies = []
    map_help = []
    for movies in target_movies:
        for movie in movies:
            map_help.append(iid_map[movie])
        map_target_movies.append(map_help)
        map_help = []
    return map_target_movies

Implicit Factorization model Scores

Train implicit on implicit_features_train! and take these scores.

In [105]:
def give_movies_implicit_scores(map_movies):
    implicit_scores = 0
    feature = []
    for i in range(len(num_users)):
        #for movies in map_target_movies:
        my_list = map_movies[i]
        my_array = np.array(my_list)
        implicit_scores = f1_model.predict(user_ids = i,item_ids = my_array)
        feature.append(implicit_scores)
    return feature

In [106]:
map_target_movies = map_movies(target_movies)
feature1 = give_movies_implicit_scores(map_target_movies)
feature1 = np.asarray(feature1)
feature1.shape

(1823, 150)

In [107]:
feature1[0]

array([11.43107   , 11.092909  , 10.87504   , 10.799794  , 10.6881485 ,
       10.542676  , 10.534383  , 10.448541  , 10.425026  , 10.403548  ,
       10.355983  , 10.238815  , 10.198807  , 10.183271  , 10.171095  ,
       10.163433  , 10.151938  , 10.094472  , 10.034395  ,  9.87258   ,
        9.860393  ,  9.839075  ,  9.819038  ,  9.789733  ,  9.763545  ,
        9.754364  ,  9.7512865 ,  9.7469225 ,  9.654484  ,  9.588643  ,
        9.574999  ,  9.569372  ,  9.527291  ,  9.482679  ,  9.388159  ,
        9.376306  ,  9.3569765 ,  9.313295  ,  9.272049  ,  9.232968  ,
        9.225159  ,  9.224716  ,  9.185566  ,  9.129577  ,  9.123506  ,
        9.102493  ,  9.036898  ,  8.979385  ,  8.921572  ,  8.900492  ,
        8.884119  ,  8.762404  ,  8.740627  ,  8.721332  ,  8.718059  ,
        8.699275  ,  8.695723  ,  8.687365  ,  8.663226  ,  8.655886  ,
        8.6446495 ,  8.634677  ,  8.624192  ,  8.616806  ,  8.616389  ,
        8.587243  ,  8.582605  ,  8.580853  ,  8.578739  ,  8.52

## Feature 2 (Rank)

In [108]:
feature2 = []
for i in range(len(num_users)):
    feature2 = (st.rankdata(-f1_model.predict(i)))
    feature2 = feature2[map_target_movies]
    feature2.sort()

In [109]:
np.array(feature2).shape

(1823, 150)

In [110]:
feature2[0]

array([1.000e+00, 2.000e+00, 3.000e+00, 4.000e+00, 4.000e+00, 5.000e+00,
       6.000e+00, 7.000e+00, 8.000e+00, 9.000e+00, 9.000e+00, 1.000e+01,
       1.100e+01, 1.200e+01, 1.200e+01, 1.200e+01, 1.300e+01, 1.500e+01,
       1.700e+01, 1.800e+01, 2.000e+01, 2.000e+01, 2.300e+01, 2.400e+01,
       2.500e+01, 2.600e+01, 2.800e+01, 2.900e+01, 3.300e+01, 3.500e+01,
       3.800e+01, 3.900e+01, 4.000e+01, 4.100e+01, 4.200e+01, 4.500e+01,
       4.700e+01, 4.700e+01, 4.800e+01, 4.900e+01, 5.200e+01, 5.400e+01,
       5.500e+01, 5.900e+01, 6.100e+01, 6.400e+01, 7.200e+01, 7.400e+01,
       8.000e+01, 8.100e+01, 8.300e+01, 8.500e+01, 8.600e+01, 8.700e+01,
       8.900e+01, 9.000e+01, 9.100e+01, 9.200e+01, 9.200e+01, 9.400e+01,
       9.600e+01, 9.700e+01, 9.800e+01, 1.000e+02, 1.000e+02, 1.000e+02,
       1.020e+02, 1.040e+02, 1.060e+02, 1.060e+02, 1.130e+02, 1.170e+02,
       1.210e+02, 1.240e+02, 1.340e+02, 1.360e+02, 1.360e+02, 1.400e+02,
       1.510e+02, 1.570e+02, 1.580e+02, 1.620e+02, 

## Feature 3 (SVD scores)

SVD model Scores. Take advantage of the DataFrame containing the scores to extract the predicted scores for the desired movies.

In [141]:
df_movie_features1 = train_features_df.pivot(
    index='user_id',
    columns='book_id',
    values='rating'
).fillna(0)

In [142]:
U1, sigma1, Vt1 = svds(df_movie_features1)
sigma1 = np.diag(sigma1)
svd_predictions_score1 = np.dot(np.dot(U1, sigma1), Vt1)
svd_predictions_score_df1 = pd.DataFrame(svd_predictions_score1, columns = df_movie_features1.columns)

In [143]:
svd_predictions_score_df1[svd_predictions_score_df1.index == 1]

book_id,2,3,4,5,7,8,9,10,11,13,...,9978,9981,9985,9988,9990,9991,9995,9997,9998,10000
1,-0.029723,-0.001287,0.015668,0.072093,0.008951,0.078051,-0.022855,0.040357,-0.019496,0.054079,...,0.002311,0.00104,-5.9e-05,0.000457,0.000507,0.002461,0.000747,0.00091,0.00037,-4.3e-05


In [144]:
float(svd_predictions_score_df1[svd_predictions_score_df1.index == 0][target_movies[0][0]].values)

0.021623719055023614

In [145]:
feature3 = []
feature3_user = []
count = 0
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        feature3_user.append(float(svd_predictions_score_df1[svd_predictions_score_df1.index == i][target_movies[i][j]].values))
    feature3.append(feature3_user)
    feature3_user = []

KeyError: 1215

In [None]:
type(feature3)

## Feature 5 (#Users watched each target movie)

In [146]:
number_views = []
number_view_per_movie = []
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        a = len(ratings_df[ratings_df['book_id'] == target_movies[i][j]]['user_id'])
        number_view_per_movie.append(a)
    number_views.append(number_view_per_movie)
    number_view_per_movie = []


In [147]:
feature5 = number_views

In [148]:
np.array(feature5).shape

(1823, 150)

## Feature 6 (New user)

In [149]:
len(ratings_df[ratings_df['user_id']==1]) > 10

False

In [150]:
new_users_binary = []
binary_per_user = []
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        a = len(ratings_df[ratings_df['user_id']==1]) > 10
        if a:
            binary_per_user.append(1)
        else:
            binary_per_user.append(0)
    new_users_binary.append(binary_per_user)
    binary_per_user = []

In [151]:
#rankings of old/new users

In [152]:
feature6 = new_users_binary

In [153]:
np.array(feature6).shape

(1823, 150)

## Feature 7(Embeddings) x32

In [154]:
emb = f1_model._net.item_embeddings

In [155]:
emb_features = []
helper = []
for i in range(32):
    for j in range(len(num_items)):
        helper.append(emb.weight[j][i].item())
    emb_features.append(helper)
    helper = []

In [156]:
a = []
b = []
feature7 = []
for i in range(32):
    for user in range(len(num_users)):
        for movie in map_target_movies[user]:
            a.append(emb_features[0][movie])
        b.append(a)
        a = []
    feature7.append(b)
    b = []

In [157]:
np.array(feature7).shape

(32, 1823, 150)

In [158]:
np.array(feature7[0]).shape

(1823, 150)

## Features stack

In [None]:
'''features = np.stack((feature1,feature2,feature3,feature4,feature5,feature6,
                     feature7[0], feature7[1],feature7[2], feature7[3],
                     feature7[4], feature7[5],feature7[6], feature7[7],
                     feature7[8], feature7[9],feature7[10], feature7[11],
                     feature7[12], feature7[13],feature7[14], feature7[15],
                     feature7[16], feature7[17],feature7[18], feature7[19],
                     feature7[20], feature7[21],feature7[22], feature7[23],
                     feature7[24], feature7[25],feature7[26], feature7[27],
                     feature7[28], feature7[29],feature7[30], feature7[31],
                     feature8,feature9,feature10,feature11,feature12,
                    feature13,feature14,feature15))'''
features = np.stack((feature1,feature2,feature5,feature6,
                     feature7[0], feature7[1],feature7[2], feature7[3],
                     feature7[4], feature7[5],feature7[6], feature7[7],
                     feature7[8], feature7[9],feature7[10], feature7[11],
                     feature7[12], feature7[13],feature7[14], feature7[15],
                     feature7[16], feature7[17],feature7[18], feature7[19],
                     feature7[20], feature7[21],feature7[22], feature7[23],
                     feature7[24], feature7[25],feature7[26], feature7[27],
                     feature7[28], feature7[29],feature7[30], feature7[31]))

In [None]:
features = np.stack((feature1, feature2,feature5))

In [None]:
len(features)

## Labels

Below are the movies that each user watched

In [None]:
def watched_movies(df):
    watched = []
    #for i in range(len(num_users)):
    for u in uid_rev_map:
        watched.append(df[df["user_id"]==uid_rev_map.get(u)].groupby("book_id")["book_id"].count().index.tolist())  
    return watched 

In [None]:
#train_labels_df
#test_df
#val_df
train_watched = watched_movies(train_labels_df)
test_watched = watched_movies(test_df)
val_watched = watched_movies(val_df)

Creation of the target list. It containes 0s and 1s for the 100 recommended movies for each user. Based on the watched movies it assigns the values. If the user watched the recommended movie, then the value of the target is equal to 1, otherwise it is 0.

In [None]:
def create_target(watched):
    user_target = []
    target = []
    for i in range(len(num_users)):
        for target_movie in target_movies[i]:
            if target_movie in watched[i]:
                user_target.append(1)
            else:
                user_target.append(0)
        target.append(user_target)
        user_target = []
    return target

In [None]:
label_train = create_target(train_watched)
label_test = create_target(test_watched)
label_val = create_target(val_watched)
#y_train = y_test = y_val

# LightGBM model

In [None]:
k = len(target_movies[0])

X = np.array(features)
#X = X.reshape(len(num_users)*k,len(features))
X = X.reshape(len(features), len(num_users)*k)
X = np.transpose(X)
label_train = np.array(label_train)
label_test = np.array(label_test)
label_val = np.array(label_val)

In [None]:
y_train = label_train.reshape(len(num_users)*k,)
y_test = label_test.reshape(len(num_users)*k,)
y_val = label_val.reshape(len(num_users)*k,)

In [None]:
import lightgbm as lgb
param = {
    "task": "train",
    "num_leaves": 255,
    "min_data_in_leaf": 1,
    "min_sum_hessian_in_leaf": 100,
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [1, 5, 10, 20],
    "learning_rate": .1,
    "num_threads": 2
}
gbm = lgb.LGBMRanker()
gbm.set_params(**param)

target movies
train_df
feature1
feature2

X_train
X_test
X_val
y_train
y_test
y_val

In [None]:
X_train = X
X_test = X
X_val = X

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [None]:
query_train = [X_train.shape[0]]
query_val = [X_val.shape[0]]
query_test = [X_test.shape[0]]
group_train = []
group_val = []
print(query_train)
print(query_val)
print(query_test)
'''
for i in range(390):
    group_train.append(100)
group_train.append(40)
group_val.append(60)
for i in range(97):
    group_val.append(100)'''

for i in range(len(num_users)):
    group_train.append(k)
    group_val.append(k)


In [None]:
gbm.fit(X_train, y_train, group=group_train,
        eval_set=[(X_val, y_val)], eval_group=[group_val],
        eval_at=[1, 3, 5, 10], early_stopping_rounds=50, eval_metric='ndcg')
#no early stopping

In [None]:
gbm.best_iteration_

In [None]:
test_pred = gbm.predict(X_test)
test_pred

## NDCG Score

In [None]:
p = 0
m = 150
lgbm_pred_per_user = []
for i in range(len(num_users)):
    lgbm_pred_per_user.append(test_pred[p:m])
    p += 150
    m += 150

### NDCG IFM on this data

In [None]:
np.array(feature7[0][609]).shape

In [None]:
feature1.shape

In [None]:
lgbm_NDCG = []
IFM_NDCG = []
#SVD_NDCG = []
for i in range(len(num_users)):
    excel_df = pd.DataFrame({'movie_id': target_movies[i], 'model1_score': feature1[i],
                             'rank': feature2[i], '#users_watched': feature5[i],
                             'new_user': feature6[i],
                             'emedding1': feature7[0][i], 'emedding2': feature7[1][i], 'emedding3': feature7[2][i],
                             'emedding4': feature7[3][i], 'emedding5': feature7[4][i], 'emedding6': feature7[5][i],
                             'emedding7': feature7[6][i], 'emedding8': feature7[7][i], 'emedding9': feature7[8][i],
                             'emedding10': feature7[9][i], 'emedding11': feature7[10][i], 'emedding12': feature7[11][i],
                             'emedding13': feature7[12][i], 'emedding14': feature7[13][i], 'emedding15': feature7[14][i],
                             'emedding16': feature7[15][i], 'emedding17': feature7[16][i], 'emedding18': feature7[17][i],
                             'emedding19': feature7[18][i], 'emedding20': feature7[19][i], 'emedding21': feature7[20][i],
                             'emedding22': feature7[21][i], 'emedding23': feature7[22][i], 'emedding24': feature7[23][i],
                             'emedding25': feature7[24][i], 'emedding26': feature7[25][i], 'emedding27': feature7[26][i],
                             'emedding28': feature7[27][i], 'emedding29': feature7[28][i], 'emedding30': feature7[29][i],
                             'emedding31': feature7[30][i], 'emedding32': feature7[31][i],
                             'train_label': label_train[i], 'test_label': label_test[i],
                             'y_val': label_val[i], 'LGBM Score': lgbm_pred_per_user[i]})
    excel_df = excel_df.sort_values('LGBM Score', ascending=False)
    relevance_lgbm_user = excel_df['test_label'].values.tolist()
    lgbm_NDCG.append(NDCG_forUser(relevance_lgbm_user))
    excel_df = excel_df.sort_values('model1_score', ascending=False)
    ifm_relevance = excel_df['test_label'].values.tolist()
    IFM_NDCG.append(NDCG_forUser(ifm_relevance))
    #excel_df = excel_df.sort_values('model2_score', ascending=False)
    #svd_relevance_h = excel_df['test_label'].values.tolist()
    #SVD_NDCG.append(NDCG_forUser(svd_relevance_h))

In [None]:
np.array(lgbm_pred_per_user).shape

In [None]:
lgbm_NDCG = []
IFM_NDCG = []
#SVD_NDCG = []
for i in range(len(num_users)):
    excel_df = pd.DataFrame({'movie_id': target_movies[i], 'model1_score': feature1[i],
                             'rank': feature2[i], '#users_watched': feature5[i],
                             'train_label': label_train[i], 'test_label': label_test[i],
                             'y_val': label_val[i], 'LGBM Score': lgbm_pred_per_user[i]})
    excel_df = excel_df.sort_values('LGBM Score', ascending=False)
    relevance_lgbm_user = excel_df['test_label'].values.tolist()
    lgbm_NDCG.append(NDCG_forUser(relevance_lgbm_user))
    excel_df = excel_df.sort_values('model1_score', ascending=False)
    ifm_relevance = excel_df['test_label'].values.tolist()
    IFM_NDCG.append(NDCG_forUser(ifm_relevance))
    #excel_df = excel_df.sort_values('model2_score', ascending=False)
    #svd_relevance_h = excel_df['test_label'].values.tolist()
    #SVD_NDCG.append(NDCG_forUser(svd_relevance_h))

In [None]:
lgbm_NDCG = pd.Series(lgbm_NDCG).fillna(0).tolist()
IFM_NDCG = pd.Series(IFM_NDCG).fillna(0).tolist()
#SVD_NDCG = pd.Series(SVD_NDCG).fillna(0).tolist()

## NDCG Comparison

In [None]:
allUsers_implicit_ndcg[0]

In [None]:
ndcg_df = pd.DataFrame({'Implicit FM': allUsers_implicit_ndcg, 'LightGBM': lgbm_NDCG, 'IFM_this': IFM_NDCG})

In [None]:
ndcg_df

In [None]:
ndcg_df['Implicit FM'].mean()

In [None]:
lgbm_mean = ndcg_df['LightGBM'].mean()
lgbm_mean

In [None]:
ifm_mean = ndcg_df['IFM_this'].mean()
ifm_mean

In [None]:
#svd_mean = ndcg_df['SVD_this'].mean()
#svd_mean

## Visualization

## Excel-Like

In [None]:
excel1_df = pd.DataFrame({'movie_id': target_movies[0],'model1_score':feature1[0], 'model2_score':feature3[0],'is_comedy':feature4[0],
                            '#users_watched':feature5[0], 'train_label':label_train[0], 'test_label':label_test[0],
                            'label_val':label_val[0],'LGBM Score':lgbm_pred_per_user[0]})

In [None]:
excel1_df.head(10)

In [None]:
excel1_df = excel1_df.sort_values('LGBM Score',ascending = False)
excel1_df.head(10)

In [None]:
relevance_lgbm_firstUser = excel1_df['test_label'].values.tolist()

In [None]:
L1GBM_ndcg = NDCG_forUser(relevance_lgbm_firstUser)
L1GBM_ndcg