In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

In [2]:
ratings_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/ratings.csv')
books_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/books.csv')
to_read_df = pd.read_csv('C:/Users/papad/Desktop/goodbooks-10k-master/goodbooks-10k-master/to_read.csv')

# Dataset

In [3]:
percentage = 1
ratings_df = ratings_df.head(int(len(ratings_df)*(percentage/100)))

In [205]:
books_df[books_df['book_id']==9439]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
9438,9439,35539,35539,1654509,17,316013315,9780316000000.0,Bob Spitz,2005.0,The Beatles: The Biography,...,8120,8464,390,205,314,1270,3020,3655,https://images.gr-assets.com/books/1327951066m...,https://images.gr-assets.com/books/1327951066s...


In [5]:
to_read_df.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


In [6]:
num_users = ratings_df['user_id'].unique()
print(len(num_users))
num_items = ratings_df['book_id'].unique()
print(len(num_items))

1157
4477


# Spotlight

In [7]:
from collections import defaultdict
from itertools import count
uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["user_id"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["book_id"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


print("userId %d got uid %d" % (556, uid_map[556]))
print("movieId %d got iid %d" % (54001, iid_map[54001]))

userId 556 got uid 1157
movieId 54001 got iid 4477


In [8]:
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
from spotlight.factorization.implicit import ImplicitFactorizationModel
import time  


#pecify the ids of the users and items for all the pairs of user-item interactions
implicit_interaction = Interactions(user_ids=uids,
                                   item_ids=iids)


#lets initialise the seed, so that its repeatable and reproducible 
implicit_train_tmp, implicit_test = random_train_test_split(implicit_interaction, random_state=np.random.seed(42))
implicit_train, implicit_val = random_train_test_split(implicit_train_tmp, random_state=np.random.seed(42))
implicit_train_features, implicit_train_labels = random_train_test_split(implicit_train, random_state=np.random.seed(42))

In [9]:
print(implicit_train)
print(implicit_test)
print(implicit_val)
print(implicit_train_features)
print(implicit_train_labels)

<Interactions dataset (1157 users x 4477 items x 38248 interactions)>
<Interactions dataset (1157 users x 4477 items x 11953 interactions)>
<Interactions dataset (1157 users x 4477 items x 9563 interactions)>
<Interactions dataset (1157 users x 4477 items x 30598 interactions)>
<Interactions dataset (1157 users x 4477 items x 7650 interactions)>


In [10]:
def interactionToDataFrame(dataset):
    converted_users = []
    converted_books = []
    rating = []
    for i in range(len(dataset.user_ids)):
        converted_users.append(uid_rev_map[dataset.user_ids[i]])
        converted_books.append(iid_rev_map[dataset.item_ids[i]])
        rating.append(float(ratings_df['rating'][(ratings_df['book_id'] == converted_books[i]) & (
            ratings_df['user_id'] == converted_users[i])].values))
    df = pd.DataFrame({'user_id': converted_users,
                       'book_id': converted_books, 'rating': rating})
    return df

In [11]:
val_df = interactionToDataFrame(implicit_val)

In [12]:
train_df = interactionToDataFrame(implicit_train)
val_df = interactionToDataFrame(implicit_val)
test_df = interactionToDataFrame(implicit_test)
train_features_df = interactionToDataFrame(implicit_train_features)
train_labels_df = interactionToDataFrame(implicit_train_labels)

## Implicit Factorization model

In [13]:
implicit_model = ImplicitFactorizationModel(n_iter=5)

current = time.time()

implicit_model.fit(implicit_train, verbose=True)

end = time.time()
diff = end - current
print('Training took %d second' % (diff))

Epoch 0: loss 0.8914661049842835
Epoch 1: loss 0.5310210671027501
Epoch 2: loss 0.4398420977592468
Epoch 3: loss 0.384601792494456
Epoch 4: loss 0.33657042801380155
Training took 14 second


### Top - K

In [14]:
import scipy.stats as st
k = 100
def topK_implicit(user, model, k=100):
  a= list()
  ranks = st.rankdata(-model.predict(user))   
  for iid in np.argwhere(ranks <= k):
    a.append(iid_rev_map[iid.item(0)])
  return a

rec_forUser = topK_implicit(0,implicit_model)
print(rec_forUser)

[26, 33, 301, 18, 27, 21, 2, 23, 24, 35, 58, 111, 5, 8, 65, 45, 113, 476, 14, 55, 36, 102, 13, 50, 87, 32, 28, 42, 184, 495, 11, 426, 101, 344, 372, 122, 9, 81, 60, 238, 119, 213, 10, 94, 233, 115, 177, 4, 483, 326, 100, 92, 71, 148, 48, 131, 662, 405, 378, 323, 130, 78, 741, 150, 66, 114, 22, 7, 85, 75, 162, 121, 63, 217, 80, 228, 43, 57, 125, 160, 138, 154, 29, 353, 839, 248, 635, 718, 354, 64, 383, 127, 278, 38, 335, 225, 595, 25, 46, 67]


In [15]:
def allUsers_topK_implicit(model):
    a = []
    for i in range(len(num_users)):
       a.append(topK_implicit(i,model))
    return a

### NDCG Score

In [16]:
allUsers_topK_imp = allUsers_topK_implicit(implicit_model)

Compute the relevance for each user. The relevance containes 0s and 1s based on which movies each has watched. For every recommended movie that the user has actually watched, the value of 1 is assigned. There is a relevance list for each user.

In [17]:
def relevance_spotlight_implicit(allUsers_topK):
    relevance_implicit_full = []
    helper = []
    count = 0
    for i in range(1,len(num_users)+1):
        for j in range(len(allUsers_topK[count])):
            if allUsers_topK[count][j] in test_df['book_id'][test_df['user_id'] == i].values.tolist():
                helper.append(1)
            else:
                helper.append(0)
        relevance_implicit_full.append(helper)
        helper = []
        count += 1
    return relevance_implicit_full

In [18]:
relevance_implicit_full = relevance_spotlight_implicit(allUsers_topK_imp)

The dcg and ideal_dcg functions are taken form https://github.com/lezzago/LambdaMart/blob/master/lambdamart.py. They are necessary to compute the desired NDCG evaluation metric.

In [19]:
def dcg(scores):
    """
    Returns the DCG value of the list of scores.
    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order

    Returns
    -------
    DCG_val: int
        This is the value of the DCG on the given scores
    """
    return np.sum([
                    (np.power(2, scores[i]) - 1) / np.log2(i + 2)
                    for i in range(len(scores))
                ])

In [20]:
def ideal_dcg(scores):
    """
    Returns the Ideal DCG value of the list of scores.
    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order

    Returns
    -------
    Ideal_DCG_val: int
        This is the value of the Ideal DCG on the given scores
    """
    scores = [score for score in sorted(scores)[::-1]]
    return dcg(scores)

In [21]:
def NDCG_forUser(relevance):
    return dcg(relevance) / ideal_dcg(relevance)

In [22]:
ndcg = NDCG_forUser(relevance_implicit_full)
print(ndcg)

0.7857975526449194


Creation of a list (allUsers_implicit_ndcg) that contains the NDCG score for each user.

In [23]:
def allUsers_ndcg(relevance):
    allUsers_implicit_ndcg = []
    for i in range(len(num_users)):
        allUsers_implicit_ndcg.append(NDCG_forUser(relevance[i]))
    return allUsers_implicit_ndcg

In [24]:
allUsers_implicit_ndcg = allUsers_ndcg(relevance_implicit_full)

  


Fill the nan values with 0s.

In [25]:
allUsers_implicit_ndcg = pd.Series(allUsers_implicit_ndcg).fillna(0).tolist()

In [26]:
np.array(allUsers_implicit_ndcg).mean()

0.15528082612401123

# SVD

Pivot is used to have a matrix that has as rows the userIDs and as columns the movieIDs. The unrated, by a user, movies are assigned the value 0 instead of null. Pivot organizes a DataFrame by giving index and column values. It is helpful in the recommender systems approach.

In [27]:
df_book_features = train_df.pivot(
    index='user_id',
    columns='book_id',
    values='rating'
).fillna(0)

In [28]:
df_book_features

book_id,2,3,4,5,7,8,9,10,11,13,...,9974,9978,9981,9985,9988,9990,9991,9995,9998,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1942,0.0,5.0,0.0,0.0,3.0,0.0,3.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1947,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SVD implementation

In [29]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(df_book_features, k = 10)

## Predictions

To get the prediction of a rating of an item dj, we can calculate the dot product of the two vectors.
The sigma variable has just the values, so I have to convert to a diagonal matrix in orded to implement the matrix multiplication.

In [30]:
sigma = np.diag(sigma)
svd_predictions_score = np.dot(np.dot(U, sigma), Vt)
svd_predictions_score_df = pd.DataFrame(svd_predictions_score, columns = df_book_features.columns)


In [31]:
svd_predictions_score_df

book_id,2,3,4,5,7,8,9,10,11,13,...,9974,9978,9981,9985,9988,9990,9991,9995,9998,10000
0,-0.049298,0.003692,-0.002757,0.039626,-0.033749,-0.052833,-0.024511,0.132729,0.072617,-0.067501,...,-0.000988,0.000645,-0.002088,-0.000244,-0.000365,0.001394,0.002273,-0.000170,-0.000207,-0.002350
1,0.073866,0.005820,-0.003101,0.154533,-0.037335,-0.098766,0.116468,0.194036,0.436235,0.029497,...,-0.000782,0.001042,0.003692,0.000375,-0.000288,0.000474,-0.003896,-0.003194,-0.001226,-0.006164
2,4.094888,0.047320,4.652056,3.074406,1.618992,3.185479,1.338068,1.789311,2.012343,2.561870,...,0.066197,-0.040661,0.004311,-0.000514,-0.002891,-0.001911,0.032142,0.048433,-0.002958,0.040937
3,-0.000462,-0.000057,-0.000121,0.000834,-0.000370,0.001290,0.000341,0.000253,0.001287,-0.000213,...,-0.000021,0.000029,0.000018,0.000001,0.000015,0.000008,-0.000008,-0.000050,0.000004,-0.000017
4,-0.041045,0.045452,1.553995,0.436229,1.586831,-0.211696,0.043967,-0.010388,1.275714,0.739684,...,0.030648,-0.058905,-0.037550,0.002853,-0.016601,0.007262,0.021558,0.062604,0.001035,-0.016143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,0.186668,-0.003004,-0.161435,0.679254,-0.120213,-0.120348,0.358933,-0.383086,2.782027,0.110153,...,0.024413,0.000158,0.013547,0.002375,-0.001155,0.014456,0.001070,-0.001904,0.000033,-0.023758
1140,0.014225,0.003949,0.046181,0.002087,0.030339,-0.029959,-0.008724,0.082055,-0.022925,-0.004635,...,-0.000456,-0.000796,-0.001413,-0.000059,-0.000578,-0.000155,0.000808,0.001797,-0.000146,-0.000350
1141,0.144568,0.101535,1.135020,0.816500,0.490991,-0.383510,0.210712,2.969982,0.066582,0.203215,...,-0.034337,0.008915,-0.010159,-0.000772,-0.004760,-0.008157,0.004159,0.019308,-0.005514,-0.018770
1142,1.087707,0.006272,0.337514,0.343551,0.217767,0.549196,0.313417,0.555213,0.275634,0.241986,...,0.003651,0.003413,0.004876,0.000022,0.000467,-0.001032,-0.006346,-0.005842,-0.001072,0.005331


In [32]:
svd_predictions_score_df[svd_predictions_score_df.index == 1137]

book_id,2,3,4,5,7,8,9,10,11,13,...,9974,9978,9981,9985,9988,9990,9991,9995,9998,10000
1137,1.162786,-0.012179,0.36298,0.449454,0.056825,0.439091,0.556299,0.037031,2.473144,-0.337238,...,0.012052,-0.006725,0.013464,0.002995,-0.001901,0.006563,-0.012309,-0.016845,0.00014,-0.02208


## Top-K recommendations

### First

The function below returns a list that contains the top-100 recommendations for each user

In [33]:
def SVD_topK_first(score_df):
    topK_svd_rec = []
    c = 0
    for i in range(len(svd_predictions_score_df)):
        svd_h_df = pd.DataFrame({'book_id': score_df[score_df.index == i].columns,
                                 'score': score_df[score_df.index == i].values[0]})
        svd_h_df = svd_h_df.sort_values('score', ascending=False)
        svd_h_df = svd_h_df.head(100)
        topK_svd_rec.append(svd_h_df['book_id'].values)
    return topK_svd_rec

## NDCG Score

The function below returns the relevance list of a list of movies.

In [34]:
def SVD_relevance_first(score_df):
    svd_topK = SVD_topK_first(score_df)
    relevance_svd_full = []
    helper = []
    count = 0
    for i in range(1,len(svd_predictions_score_df)+1):
        for j in range(len(svd_topK[count])):
            if svd_topK[count][j] in test_df['book_id'][test_df['user_id'] == i].values.tolist():
                helper.append(1)
            else:
                helper.append(0)
        relevance_svd_full.append(helper)
        helper = []
        count += 1
    return relevance_svd_full

In [35]:
relevance_first = SVD_relevance_first(svd_predictions_score_df)

In [36]:
np.asarray(relevance_first).shape

(1144, 100)

In [37]:
n_f = NDCG_forUser(relevance_first[0])

  


In [38]:
n_f

nan

Below is the NDCG scores of each user

In [39]:
def allUsers_ndcg_svd(relevance):
    allUsers_implicit_ndcg = []
    for i in range(len(svd_predictions_score_df)):
        allUsers_implicit_ndcg.append(NDCG_forUser(relevance[i]))
    return allUsers_implicit_ndcg
all_ndcg_f = allUsers_ndcg_svd(relevance_first)

  


In [40]:
all_ndcg_f = pd.Series(all_ndcg_f).fillna(0).tolist()

In [41]:
np.asarray(all_ndcg_f).mean()

0.14469821380734263

# LightGBM Baseline

##  Feature 1 (Implicit factorization model scores)

Train an implicit factorization model using the train_features dataset

In [42]:
f1_model = ImplicitFactorizationModel(n_iter = 5)

current = time.time()

f1_model.fit(implicit_train_features,verbose = True)

end = time.time()
diff = end - current
print('Training took %d second'%(diff))

Epoch 0: loss 0.9283265088995297
Epoch 1: loss 0.5826922381917635
Epoch 2: loss 0.44931991895039874
Epoch 3: loss 0.3980232814947764
Epoch 4: loss 0.34253555883963904
Training took 12 second


### Target Movies

In [43]:
allUsers_topK_f1 = allUsers_topK_implicit(f1_model)
target_positive = allUsers_topK_f1

Add negative samples to the target movies

In [44]:
len(train_df['user_id'].unique())

1144

In [73]:
'''import random
target_negative = []
for i in range(1,len(train_features_df)+1):
    target_negative.append(random.choices(train_features_df[train_features_df['user_id']==i]['book_id'].values, k = 5))'''

"import random\ntarget_negative = []\nfor i in range(1,len(train_features_df)+1):\n    target_negative.append(random.choices(train_features_df[train_features_df['user_id']==i]['book_id'].values, k = 5))"

In [74]:
'''target_movies = []
for i in range(len(num_users)):
    target_movies.append(target_positive[i] + target_negative[i])'''
target_movies = target_positive

Map the target movies to pick the correct predictions based on the item id

In [75]:
def map_movies(target_movies):
    map_target_movies = []
    map_help = []
    for movies in target_movies:
        for movie in movies:
            map_help.append(iid_map[movie])
        map_target_movies.append(map_help)
        map_help = []
    return map_target_movies

Implicit Factorization model Scores

Train implicit on implicit_features_train! and take these scores.

In [76]:
def give_movies_implicit_scores(map_movies):
    implicit_scores = 0
    feature = []
    for i in range(len(num_users)):
        #for movies in map_target_movies:
        my_list = map_movies[i]
        my_array = np.array(my_list)
        implicit_scores = f1_model.predict(user_ids = i,item_ids = my_array)
        feature.append(implicit_scores)
    return feature

In [77]:
map_target_movies = map_movies(target_movies)
feature1 = give_movies_implicit_scores(map_target_movies)
feature1 = np.asarray(feature1)
feature1.shape

(1157, 100)

In [78]:
#feature1 = np.asarray(feature1).reshape(len(num_users),100)

## Feature 2 (Rank)

In [79]:
feature2 = []
for i in range(len(num_users)):
    my_list = map_target_movies[i]
    my_array = np.array(my_list)
    feature2.append(st.rankdata(f1_model.predict(i,my_array)))

In [98]:
np.array(feature2).shape

(1157, 100)

## Feature 3 (SVD scores)

SVD model Scores. Take advantage of the DataFrame containing the scores to extract the predicted scores for the desired movies.

In [154]:
df_movie_features1 = train_features_df.pivot(
    index='user_id',
    columns='book_id',
    values='rating'
).fillna(0)

In [155]:
U1, sigma1, Vt1 = svds(df_movie_features1)
sigma1 = np.diag(sigma1)
svd_predictions_score1 = np.dot(np.dot(U1, sigma1), Vt1)
svd_predictions_score_df1 = pd.DataFrame(svd_predictions_score1, columns = df_movie_features1.columns)

In [156]:
target_movies[0][2]

301

In [157]:
svd_predictions_score_df1[svd_predictions_score_df1.index == 0]

book_id,2,3,4,5,7,8,9,10,11,13,...,9974,9978,9981,9985,9988,9990,9991,9995,9998,10000
0,-0.011926,-0.000368,-0.028007,0.056066,-0.029936,-0.009058,-0.000974,0.017639,0.074756,0.008876,...,-5e-06,0.00044,-0.000365,2e-06,9.2e-05,0.00067,0.000875,-0.000487,-0.000219,-0.001444


In [158]:
float(svd_predictions_score_df1[svd_predictions_score_df1.index == 0][target_movies[0][0]].values)

-0.0014342283303730155

In [204]:

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(svd_predictions_score_df1[svd_predictions_score_df1.index == 0])

book_id     2         3         4         5         7         8         9      \
0       -0.011926 -0.000368 -0.028007  0.056066 -0.029936 -0.009058 -0.000974   

book_id     10        11        13       14        15       18        19     \
0        0.017639  0.074756  0.008876 -0.01659 -0.001072 -0.00515 -0.026239   

book_id     21        22        23        24        25        26        27     \
0       -0.001276  0.034502 -0.016088 -0.025889 -0.030926 -0.001434 -0.020799   

book_id     28        29        32        33        35        36        37     \
0        0.013007 -0.008369  0.009697  0.051777  0.024407  0.005516 -0.007113   

book_id     38        39        40        41        42        43        44     \
0        0.049168 -0.008523  0.009868 -0.000052  0.009247  0.019097  0.002967   

book_id     45        46       47        48        49        50       52     \
0        0.066023  0.015663  0.00124 -0.007862 -0.001782 -0.008793  0.00048   

book_id     53        54      

In [183]:
feature3 = []
feature3_user = []
count = 0
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        feature3_user.append(float(svd_predictions_score_df1[svd_predictions_score_df1.index == i][target_movies[i][j]].values))
    feature3.append(feature3_user)
    feature3_user = []

a
0
0
1
0
1
2
0
2
3
0
3
4
0
4
5
0
5
6
0
6
7
0
7
8
0
8
9
0
9
10
0
10
11
0
11
12
0
12
13
0
13
14
0
14
15
0
15
16
0
16
17
0
17
18
0
18
19
0
19
20
0
20
21
0
21
22
0
22
23
0
23
24
0
24
25
0
25
26
0
26
27
0
27
28
0
28
29
0
29
30
0
30
31
0
31
32
0
32
33
0
33
34
0
34
35
0
35
36
0
36
37
0
37
38
0
38
39
0
39
40
0
40
41
0
41
42
0
42
43
0
43
44
0
44
45
0
45
46
0
46
47
0
47
48
0
48
49
0
49
50
0
50
51
0
51
52
0
52
53
0
53
54
0
54
55
0
55
56
0
56
57
0
57
58
0
58
59
0
59
60
0
60
61
0
61
62
0
62
63
0
63
64
0
64
65
0
65
66
0
66
67
0
67
68
0
68
69
0
69
70
0
70
71
0
71
72
0
72
73
0
73
74
0
74
75
0
75
76
0
76
77
0
77
78
0
78
79
0
79
80
0
80
81
0
81
82
0
82
83
0
83
84
0
84
85
0
85
86
0
86
87
0
87
88
0
88
89
0
89
90
0
90
91
0
91
92
0
92
93
0
93
94
0
94
95
0
95
96
0
96
97
0
97
98
0
98
99
0
99
100
a
1
0
101
1
1
102
1
2
103
1
3
104
1
4
105
1
5
106
1
6
107
1
7
108
1
8
109
1
9
110
1
10
111
1
11
112
1
12
113
1
13
114
1
14
115
1
15
116
1
16
117
1
17
118
1
18
119
1
19
120
1
20
121
1
21
122
1
22
123
1
23
124
1
24
125

KeyError: 9439

In [None]:
type(feature3)

## Feature 4 (Genre)

In [None]:
def create_genre_feature(genre):
    feature = []
    genre_per_user = []
    for i in range(len(num_users)):
        for j in range(len(target_movies[i])):
            a =  movies_df[movies_df['movieId'] == target_movies[i][j]]['genres']
            a = a.tolist()
            a = str(a[0])
            a = a.split('|')
            if genre in a:
                genre_per_user.append(1)
            else:
                genre_per_user.append(0)
        feature.append(genre_per_user)
        genre_per_user = []
    return feature

In [None]:
feature4 = create_genre_feature('Comedy')

### Features 8-15 (genres)

In [None]:
feature8 = create_genre_feature('Adventure')
feature9 = create_genre_feature('Drama')
feature10 = create_genre_feature('Romance')
feature11 = create_genre_feature('Fantasy')
feature12 = create_genre_feature('Thriller')
feature13 = create_genre_feature('Animation')
feature14 = create_genre_feature('Children')
feature15 = create_genre_feature('Horror')

## Feature 5 (#Users watched each target movie)

In [89]:
number_views = []
number_view_per_movie = []
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        a = len(ratings_df[ratings_df['book_id'] == target_movies[i][j]]['user_id'])
        number_view_per_movie.append(a)
    number_views.append(number_view_per_movie)
    number_view_per_movie = []


In [90]:
feature5 = number_views

In [99]:
np.array(feature5).shape

(1157, 100)

## Feature 6 (New user)

In [92]:
len(ratings_df[ratings_df['user_id']==1]) > 10

False

In [94]:
new_users_binary = []
binary_per_user = []
for i in range(len(num_users)):
    for j in range(len(target_movies[i])):
        a = len(ratings_df[ratings_df['user_id']==1]) > 10
        if a:
            binary_per_user.append(1)
        else:
            binary_per_user.append(0)
    new_users_binary.append(binary_per_user)
    binary_per_user = []

In [95]:
#rankings of old/new users

In [96]:
feature6 = new_users_binary

In [97]:
np.array(feature6).shape

(1157, 100)

## Feature 7(Embeddings) x32

In [101]:
emb = f1_model._net.item_embeddings

In [102]:
emb_features = []
helper = []
for i in range(32):
    for j in range(len(num_items)):
        helper.append(emb.weight[j][i].item())
    emb_features.append(helper)
    helper = []

In [103]:
a = []
b = []
feature7 = []
for i in range(32):
    for user in range(len(num_users)):
        for movie in map_target_movies[user]:
            a.append(emb_features[0][movie])
        b.append(a)
        a = []
    feature7.append(b)
    b = []

In [104]:
np.array(feature7).shape

(32, 1157, 100)

In [105]:
np.array(feature7[0]).shape

(1157, 100)

## Features stack

In [106]:
'''features = np.stack((feature1,feature2,feature3,feature4,feature5,feature6,
                     feature7[0], feature7[1],feature7[2], feature7[3],
                     feature7[4], feature7[5],feature7[6], feature7[7],
                     feature7[8], feature7[9],feature7[10], feature7[11],
                     feature7[12], feature7[13],feature7[14], feature7[15],
                     feature7[16], feature7[17],feature7[18], feature7[19],
                     feature7[20], feature7[21],feature7[22], feature7[23],
                     feature7[24], feature7[25],feature7[26], feature7[27],
                     feature7[28], feature7[29],feature7[30], feature7[31],
                     feature8,feature9,feature10,feature11,feature12,
                    feature13,feature14,feature15))'''
features = np.stack((feature1,feature2,feature5,feature6,
                     feature7[0], feature7[1],feature7[2], feature7[3],
                     feature7[4], feature7[5],feature7[6], feature7[7],
                     feature7[8], feature7[9],feature7[10], feature7[11],
                     feature7[12], feature7[13],feature7[14], feature7[15],
                     feature7[16], feature7[17],feature7[18], feature7[19],
                     feature7[20], feature7[21],feature7[22], feature7[23],
                     feature7[24], feature7[25],feature7[26], feature7[27],
                     feature7[28], feature7[29],feature7[30], feature7[31]))

In [107]:
len(features)

36

## Labels

Below are the movies that each user watched

In [110]:
def watched_movies(df):
    watched = []
    for i in range(len(num_users)):
        watched.append(df[df["user_id"]==uid_rev_map.get(i)].groupby("book_id")["book_id"].count().index)  
    return watched

In [111]:
#train_labels_df
#test_df
#val_df
train_watched = watched_movies(train_labels_df)
test_watched = watched_movies(test_df)
val_watched = watched_movies(val_df)

Creation of the target list. It containes 0s and 1s for the 100 recommended movies for each user. Based on the watched movies it assigns the values. If the user watched the recommended movie, then the value of the target is equal to 1, otherwise it is 0.

In [112]:
def create_target(watched):
    user_target = []
    target = []
    for i in range(len(num_users)):
        for target_movie in target_movies[i]:
            if target_movie in watched[i]:
                user_target.append(1)
            else:
                user_target.append(0)
        target.append(user_target)
        user_target = []
    return target

In [113]:
label_train = create_target(train_watched)
label_test = create_target(test_watched)
label_val = create_target(val_watched)
#y_train = y_test = y_val

## LightGBM model

In [114]:
X = np.array(features)
label_train = np.array(label_train)
label_test = np.array(label_test)
label_val = np.array(label_val)

In [115]:
k = len(target_movies[0])
X = X.reshape(len(num_users)*k, len(features))
y_train = label_train.reshape(len(num_users)*k,)
y_test = label_test.reshape(len(num_users)*k,)
y_val = label_val.reshape(len(num_users)*k,)

In [116]:
import lightgbm as lgb
gbm = lgb.LGBMRanker(min_child_samples=1, learning_rate=.1, n_jobs=2, importance_type='gain',
                     num_leaves=31, min_child_weight=100)

target movies
train_df
feature1
feature2

In [117]:
'''X_train_feature1 = []
X_train_feature2 = []
for j in range(610):
    for i in range(len(target_movies[j])):
        if target_movies[j][i] in val_df['movie_id'].values.tolist():
            X_train_feature1.append(feature1[i])
            X_train_feature2.append(feature2[i])
print(X_train_feature1)
print(X_train_feature2)
#X_train = np.stack(X_train_feature1,X_train_feature2)'''

"X_train_feature1 = []\nX_train_feature2 = []\nfor j in range(610):\n    for i in range(len(target_movies[j])):\n        if target_movies[j][i] in val_df['movie_id'].values.tolist():\n            X_train_feature1.append(feature1[i])\n            X_train_feature2.append(feature2[i])\nprint(X_train_feature1)\nprint(X_train_feature2)\n#X_train = np.stack(X_train_feature1,X_train_feature2)"

X_train
X_test
X_val
y_train
y_test
y_val

In [118]:
X_train = X
X_test = X
X_val = X

In [119]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [120]:
query_train = [X_train.shape[0]]
query_val = [X_val.shape[0]]
query_test = [X_test.shape[0]]
group_train = []
group_val = []
print(query_train)
print(query_val)
print(query_test)
'''
for i in range(390):
    group_train.append(100)
group_train.append(40)
group_val.append(60)
for i in range(97):
    group_val.append(100)'''

for i in range(len(num_users)):
    group_train.append(k)
    group_val.append(k)


[115700]
[115700]
[115700]


In [121]:
gbm.fit(X_train, y_train, group=group_train,
        eval_set=[(X_val, y_val)], eval_group=[group_val],
        eval_at=[1, 3, 5, 10], early_stopping_rounds=50, eval_metric='ndcg')
#no early stopping

[1]	valid_0's ndcg@1: 0.294728	valid_0's ndcg@3: 0.293734	valid_0's ndcg@5: 0.299182	valid_0's ndcg@10: 0.311325
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.288678	valid_0's ndcg@3: 0.285828	valid_0's ndcg@5: 0.290644	valid_0's ndcg@10: 0.304892
[3]	valid_0's ndcg@1: 0.27917	valid_0's ndcg@3: 0.284013	valid_0's ndcg@5: 0.289947	valid_0's ndcg@10: 0.302371
[4]	valid_0's ndcg@1: 0.27917	valid_0's ndcg@3: 0.283938	valid_0's ndcg@5: 0.289306	valid_0's ndcg@10: 0.305229
[5]	valid_0's ndcg@1: 0.282627	valid_0's ndcg@3: 0.284937	valid_0's ndcg@5: 0.288559	valid_0's ndcg@10: 0.301771
[6]	valid_0's ndcg@1: 0.282627	valid_0's ndcg@3: 0.281483	valid_0's ndcg@5: 0.286198	valid_0's ndcg@10: 0.30359
[7]	valid_0's ndcg@1: 0.273984	valid_0's ndcg@3: 0.280028	valid_0's ndcg@5: 0.288563	valid_0's ndcg@10: 0.30452
[8]	valid_0's ndcg@1: 0.27312	valid_0's ndcg@3: 0.279397	valid_0's ndcg@5: 0.287251	valid_0's ndcg@10: 0.304212
[9]	valid_0's ndcg@1: 0.275713	valid_0's

LGBMRanker(importance_type='gain', min_child_samples=1, min_child_weight=100,
           n_jobs=2)

In [122]:
test_pred = gbm.predict(X_test)
test_pred

array([ 0.00605222,  0.00605222,  0.00605222, ..., -0.02799463,
        0.00731164, -0.02799463])

## NDCG Score

In [139]:
p = 0
m = 100
lgbm_pred_per_user = []
for i in range(len(num_users)):
    lgbm_pred_per_user.append(test_pred[p:m])
    p += 100
    m += 100

### NDCG IFM on this data

In [140]:
np.array(feature7[0][609]).shape

(100,)

In [141]:
feature1.shape

(1157, 100)

In [145]:
lgbm_NDCG = []
IFM_NDCG = []
#SVD_NDCG = []
for i in range(len(num_users)):
    excel_df = pd.DataFrame({'movie_id': target_movies[i], 'model1_score': feature1[i],
                             'rank': feature2[i], '#users_watched': feature5[i],
                             'new_user': feature6[i],
                             'emedding1': feature7[0][i], 'emedding2': feature7[1][i], 'emedding3': feature7[2][i],
                             'emedding4': feature7[3][i], 'emedding5': feature7[4][i], 'emedding6': feature7[5][i],
                             'emedding7': feature7[6][i], 'emedding8': feature7[7][i], 'emedding9': feature7[8][i],
                             'emedding10': feature7[9][i], 'emedding11': feature7[10][i], 'emedding12': feature7[11][i],
                             'emedding13': feature7[12][i], 'emedding14': feature7[13][i], 'emedding15': feature7[14][i],
                             'emedding16': feature7[15][i], 'emedding17': feature7[16][i], 'emedding18': feature7[17][i],
                             'emedding19': feature7[18][i], 'emedding20': feature7[19][i], 'emedding21': feature7[20][i],
                             'emedding22': feature7[21][i], 'emedding23': feature7[22][i], 'emedding24': feature7[23][i],
                             'emedding25': feature7[24][i], 'emedding26': feature7[25][i], 'emedding27': feature7[26][i],
                             'emedding28': feature7[27][i], 'emedding29': feature7[28][i], 'emedding30': feature7[29][i],
                             'emedding31': feature7[30][i], 'emedding32': feature7[31][i],
                             'train_label': label_train[i], 'test_label': label_test[i],
                             'y_val': label_val[i], 'LGBM Score': lgbm_pred_per_user[i]})
    excel_df = excel_df.sort_values('LGBM Score', ascending=False)
    relevance_lgbm_user = excel_df['test_label'].values.tolist()
    lgbm_NDCG.append(NDCG_forUser(relevance_lgbm_user))
    excel_df = excel_df.sort_values('model1_score', ascending=False)
    ifm_relevance = excel_df['test_label'].values.tolist()
    IFM_NDCG.append(NDCG_forUser(ifm_relevance))
    #excel_df = excel_df.sort_values('model2_score', ascending=False)
    #svd_relevance_h = excel_df['test_label'].values.tolist()
    #SVD_NDCG.append(NDCG_forUser(svd_relevance_h))

  


In [146]:
lgbm_NDCG = pd.Series(lgbm_NDCG).fillna(0).tolist()
IFM_NDCG = pd.Series(IFM_NDCG).fillna(0).tolist()
#SVD_NDCG = pd.Series(SVD_NDCG).fillna(0).tolist()

## Excel-Like

In [None]:
excel1_df = pd.DataFrame({'movie_id': target_movies[0],'model1_score':feature1[0], 'model2_score':feature3[0],'is_comedy':feature4[0],
                            '#users_watched':feature5[0], 'train_label':label_train[0], 'test_label':label_test[0],
                            'label_val':label_val[0],'LGBM Score':lgbm_pred_per_user[0]})

In [None]:
excel1_df.head(10)

In [None]:
excel1_df = excel1_df.sort_values('LGBM Score',ascending = False)
excel1_df.head(10)

In [None]:
relevance_lgbm_firstUser = excel1_df['test_label'].values.tolist()

In [None]:
L1GBM_ndcg = NDCG_forUser(relevance_lgbm_firstUser)
L1GBM_ndcg

## NDCG Comparison

In [147]:
allUsers_implicit_ndcg[0]

0.0

In [148]:
ndcg_df = pd.DataFrame({'Implicit FM': allUsers_implicit_ndcg, 'LightGBM': lgbm_NDCG, 'IFM_this': IFM_NDCG})

In [149]:
ndcg_df

Unnamed: 0,Implicit FM,LightGBM,IFM_this
0,0.000000,0.000000,0.000000
1,0.386853,0.255958,1.000000
2,0.000000,0.320261,0.401471
3,0.000000,0.000000,0.000000
4,0.000000,0.386853,0.278943
...,...,...,...
1152,0.159099,0.000000,0.000000
1153,0.000000,0.185449,0.175425
1154,0.257222,0.000000,0.000000
1155,0.000000,0.183169,0.315465


In [150]:
ndcg_df['Implicit FM'].mean()

0.15528082612401134

In [151]:
lgbm_mean = ndcg_df['LightGBM'].mean()
lgbm_mean

0.2477566272389939

In [152]:
ifm_mean = ndcg_df['IFM_this'].mean()
ifm_mean

0.27932911769327834

In [153]:
#svd_mean = ndcg_df['SVD_this'].mean()
#svd_mean

## Visualization