In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict

from surprise import SVD, KNNBasic, Reader, Dataset, accuracy, dump
from surprise.model_selection import train_test_split

In [2]:
mdf = pd.read_csv('data/df_movieId_unique_title.csv')

In [3]:
user_imput_list = [('Notebook, The ', '1'),
                   ('Road to Perdition ', '2'),
                   ('Karate Kid, The ', '3'),
                   ('Pulp Fiction ', '4'),
                   ('Zoolander ', '4'),
                   ('Majestic, The ', '5'),
                   ('Notorious ', '2'),
                   ('Monty Python and the Holy Grail ', '2'),
                   ('Apocalypse Now ', '1')]

id_list = []
for movie in user_imput_list:
    idx = mdf[mdf.title_only == movie[0]].index.tolist()[0]
    id_list.append(mdf.iloc[idx]['movieId'])

id_list_string = [str(x) for x in id_list]

rating_list = []
for rating in user_imput_list:
    rat = rating[1]
    rating_list.append(rat)

rating_list_float = [float(x) for x in rating_list]

user_input_df = pd.DataFrame(list(zip(id_list_string, rating_list_float)), columns =['movieId', 'rating'])
user_input_df['userId'] = '999999'
user_input_df = user_input_df[['userId','movieId', 'rating']]
user_input_df

Unnamed: 0,userId,movieId,rating
0,999999,8533,1.0
1,999999,5464,2.0
2,999999,2420,3.0
3,999999,296,4.0
4,999999,4816,4.0
5,999999,4994,5.0
6,999999,930,2.0
7,999999,1136,2.0
8,999999,1208,1.0


In [4]:
ratings = pd.read_csv('data/ratings.csv')
ratings = ratings [['userId', 'movieId', 'rating']]
concat_df = pd.concat([ratings, user_input_df], axis = 0, ignore_index = True)

In [5]:
reader = Reader()
surprise_data = Dataset.load_from_df(concat_df, reader)
trainset = surprise_data.build_full_trainset()

algo = SVD(n_factors = 23, random_state = 666)
algo.fit(trainset)
testset = trainset.build_anti_testset()

In [6]:
predictions = algo.test(testset)

In [7]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [8]:
top_n = get_top_n(predictions, n=10)
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

1 [318, 48516, 720, 904, 908, 2324, 4226, 750, 7153, 922]
2 [750, 1204, 858, 1104, 904, 5618, 56782, 908, 923, 930]
3 [356, 1198, 58559, 54503, 318, 898, 4306, 899, 2571, 589]
4 [858, 750, 1089, 112552, 913, 50, 7153, 2160, 1262, 1148]
5 [1204, 3275, 912, 1225, 1197, 260, 2019, 1136, 750, 56782]
6 [1217, 1198, 1223, 56782, 1250, 1266, 1204, 750, 5690, 898]
7 [912, 1204, 1089, 3037, 318, 1104, 1201, 608, 1225, 923]
8 [904, 1136, 750, 858, 1204, 1104, 1276, 1197, 1203, 912]
9 [1204, 318, 1104, 1213, 1178, 3275, 1233, 356, 750, 951]
10 [1210, 260, 4011, 1223, 318, 750, 1204, 527, 1104, 57669]
11 [750, 1204, 4973, 1262, 904, 541, 7153, 260, 1225, 912]
12 [50, 110, 260, 296, 356, 527, 593, 608, 1089, 1136]
13 [318, 750, 1104, 38061, 741, 1204, 2324, 720, 260, 904]
14 [741, 898, 1276, 2959, 1204, 3435, 3275, 2571, 908, 1213]
15 [750, 1221, 1193, 1204, 1089, 1208, 78499, 1213, 1223, 1252]
16 [2324, 3275, 1204, 720, 741, 38061, 922, 898, 1104, 908]
17 [1204, 1208, 720, 1199, 4973, 1262, 7361, 