In [105]:
import pandas as pd
import numpy as np
from ast import literal_eval
from scipy import linalg
from scipy.sparse import csr_matrix
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict

In [3]:
ratings = pd.read_csv("../data/ratings_small.csv")

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
movies_metadata = pd.read_csv("../data/movies_metadata.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [7]:
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [8]:
literal_eval(movies_metadata['genres'].loc[0])

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [9]:
def extract_genres(x):
    x = literal_eval(x)
    if isinstance(x, list):
        genre_names = [i['name'] for i in x]
        if len(genre_names) > 3:
            genre_names= genre_names[:3]
        return genre_names
    return []

In [10]:
movies_metadata['genres']= movies_metadata['genres'].apply(extract_genres)

In [11]:
movies_metadata['genres']

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45461                 [Drama, Family]
45462                         [Drama]
45463       [Action, Drama, Thriller]
45464                              []
45465                              []
Name: genres, Length: 45466, dtype: object

In [12]:
movies = movies_metadata.drop(columns=['adult', 'belongs_to_collection', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'video',
       'vote_average', 'vote_count'])

In [13]:
movies.head(2)

Unnamed: 0,genres,id,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji


In [14]:
movies = movies.rename(columns={'id':'movieId'})

In [15]:
movies.head(3)

Unnamed: 0,genres,movieId,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji
2,"[Romance, Comedy]",15602,Grumpier Old Men


In [16]:
movies.shape

(45466, 3)

In [17]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [18]:
ratings.shape

(100004, 4)

In [18]:
ratings_matrix = np.ndarray(
    shape=(np.max(ratings.movieId.values), np.max(ratings.userId.values)),
    dtype=np.uint8)
ratings_matrix[ratings.movieId.values-1, ratings.userId.values-1] = ratings.rating.values

In [19]:
ratings_matrix

array([[0, 0, 0, ..., 0, 4, 5],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [20]:
ratings_matrix.shape

(163949, 671)

In [21]:
normalised_mat = ratings_matrix - np.asarray([(np.mean(ratings_matrix, 1))]).T
normalised_mat

array([[-1.38897168, -1.38897168, -1.38897168, ..., -1.38897168,
         2.61102832,  3.61102832],
       [-0.5290611 , -0.5290611 , -0.5290611 , ..., -0.5290611 ,
        -0.5290611 , -0.5290611 ],
       [-0.27123696, -0.27123696, -0.27123696, ..., -0.27123696,
        -0.27123696, -0.27123696],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00745156, -0.00745156, -0.00745156, ..., -0.00745156,
        -0.00745156, -0.00745156]])

In [22]:
normalised_mat.shape

(163949, 671)

In [23]:
A = normalised_mat.T / np.sqrt(ratings_matrix.shape[0] - 1)

In [24]:
U, S, V = linalg.svd(A, full_matrices=False)

__Now I have to calculate cosine similarity between the values and then create a function to extract the movies from the movie dataframe__

__Below I will try another approach__

In [19]:
movies.head(2)

Unnamed: 0,genres,movieId,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji


In [39]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [20]:
movies_ = movies

In [21]:
movies_ = movies[["movieId", "title", "genres"]]
movies_.head(2)

Unnamed: 0,movieId,title,genres
0,862,Toy Story,"[Animation, Comedy, Family]"
1,8844,Jumanji,"[Adventure, Fantasy, Family]"


In [22]:
movies_['genres'] = ["".join(string) for string in movies_['genres']]
movies_.head(2)

Unnamed: 0,movieId,title,genres
0,862,Toy Story,AnimationComedyFamily
1,8844,Jumanji,AdventureFantasyFamily


In [40]:
ratings = ratings.drop(columns=["timestamp"])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [54]:
ratings.sort_values('movieId').head()

Unnamed: 0,userId,movieId,rating
9713,68,1,4.0
35933,261,1,1.5
52631,383,1,5.0
35983,262,1,2.5
12038,77,1,4.0


In [55]:
movies_.sort_values('movieId').head()

Unnamed: 0,movieId,title,genres
2429,100,"Lock, Stock and Two Smoking Barrels",ComedyCrime
13609,10000,La estrategia del caracol,ComedyDrama
4435,10001,Young Einstein,ComedyScience Fiction
17451,100010,Flight Command,DramaWar
36946,100017,Hounded,Drama


In [68]:
movies_.drop(movies_.index[19730],inplace=True)
movies_.drop(movies_.index[29502],inplace=True)
movies_.drop(movies_.index[35585],inplace=True)

In [69]:
movies_.head(3)

Unnamed: 0,movieId,title,genres
0,862,Toy Story,AnimationComedyFamily
1,8844,Jumanji,AdventureFantasyFamily
2,15602,Grumpier Old Men,RomanceComedy


In [71]:
movies_.movieId = movies_.movieId.astype(np.int64)

In [74]:
ratings_df = pd.merge(ratings,movies_, on='movieId')
ratings_df

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1371,2.5,Rocky III,Drama
1,4,1371,4.0,Rocky III,Drama
2,7,1371,3.0,Rocky III,Drama
3,19,1371,4.0,Rocky III,Drama
4,21,1371,3.0,Rocky III,Drama
...,...,...,...,...,...
44989,652,129009,4.0,Love Is a Ball,ComedyRomance
44990,653,2103,3.0,Solaris,DramaScience FictionMystery
44991,659,167,4.0,K-PAX,DramaScience Fiction
44992,659,563,3.0,Starship Troopers,AdventureActionThriller


In [94]:
ratings_df.drop_duplicates()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1371,2.5,Rocky III,Drama
1,4,1371,4.0,Rocky III,Drama
2,7,1371,3.0,Rocky III,Drama
3,19,1371,4.0,Rocky III,Drama
4,21,1371,3.0,Rocky III,Drama
...,...,...,...,...,...
44989,652,129009,4.0,Love Is a Ball,ComedyRomance
44990,653,2103,3.0,Solaris,DramaScience FictionMystery
44991,659,167,4.0,K-PAX,DramaScience Fiction
44992,659,563,3.0,Starship Troopers,AdventureActionThriller


In [77]:
matrix= ratings_df.pivot_table(index='userId', columns='title', values='rating').fillna(0)
matrix.head()

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


__The code below correctly implements SVD prediction__

In [81]:
movies_

Unnamed: 0,movieId,title,genres
0,862,Toy Story,AnimationComedyFamily
1,8844,Jumanji,AdventureFantasyFamily
2,15602,Grumpier Old Men,RomanceComedy
3,31357,Waiting to Exhale,ComedyDramaRomance
4,11862,Father of the Bride Part II,Comedy
...,...,...,...
45461,439050,Subdue,DramaFamily
45462,111109,Century of Birthing,Drama
45463,67758,Betrayal,ActionDramaThriller
45464,227506,Satan Triumphant,


In [82]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


In [83]:
reader = Reader(rating_scale=(1, 5))

In [84]:
data = Dataset.load_from_df(ratings, reader)

In [85]:
trainset, testset = train_test_split(data, test_size = .25)

In [87]:
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb0039a3c10>

In [88]:
model.qi.shape

(8222, 100)

In [100]:
## iid = 862 "Toy Story"
model.predict(20, 862)

Prediction(uid=20, iid=862, r_ui=None, est=3.0486104428127994, details={'was_impossible': False})

In [106]:
predictions = model.test(testset)

In [103]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n
        

In [108]:
top_n = get_top_n(predictions)
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

464 [260, 222, 610, 349, 588, 262, 163, 165, 380, 592]
509 [608, 969, 1221, 1196, 296, 6016, 1172, 3362, 923, 1036]
105 [50, 1233, 1213, 4226, 1193, 2959, 593, 2858, 2762, 1199]
41 [110, 260, 1196, 233, 356, 1965, 2529, 1374, 6863, 2746]
547 [1258, 1617, 908, 307, 2973, 926, 1964, 55820, 1233, 48780]
102 [527, 2318, 1219, 858, 899, 1267, 898, 2064, 3088, 994]
40 [1197, 48780, 260, 7361, 1136, 7153, 49272, 122886]
562 [50, 7361, 5418, 5952, 778, 2858, 2797, 7147, 356, 2599]
452 [2918, 296, 318, 4973, 923, 527, 5225, 903, 908, 235]
406 [750, 912, 1221, 1237, 1212, 908, 1258, 4993, 1077, 111]
347 [1198, 106489, 4085, 786, 1721]
669 [968, 260, 223, 2959, 2772, 1135, 2683, 2722]
468 [1196, 318, 908, 913, 1221, 1212, 1267, 1193, 2132, 3462]
636 [608, 17, 766, 25, 1073, 639, 648, 736, 724, 762]
271 [593, 4993, 7153, 1230, 3578, 3114, 1270, 5060, 5782, 110]
61 [50, 4993, 50872, 1035, 1220, 5349, 46578, 2300, 48394, 5444]
346 [2571, 2019, 260, 7361, 4896, 480, 4995, 1136, 1387, 2997]
534 [50, 2