In [50]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [51]:
# Load only the three columns the recommender needs from the ratings file.
ratings = pd.read_csv(
    "../data/ratings_small.csv",
    usecols=['userId', 'movieId', 'rating'],
)

In [52]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


In [121]:
ratings.rating.value_counts()

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64

In [53]:
ratings['movieId'].nunique()

9066

In [122]:
# Surprise Reader describing the rating scale of the dataset.
# The observed ratings run from 0.5 to 5.0 in half-star steps (see the
# value counts above), so the lower bound is 0.5, not 0. The scale bounds the
# range Surprise clips its estimates to, so a wrong lower bound allows
# predictions below the smallest rating a user can actually give.
reader = Reader(rating_scale=(0.5, 5))

In [123]:
# Wrap the ratings frame in a Surprise Dataset; the column order passed here
# is (userId, movieId, rating).
data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader,
)

In [147]:
# svd = SVD()
# cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [62]:
# Movie metadata: only the id and the display title are used in this notebook.
movies = pd.read_csv(
    "../data/movies_metadata.csv",
    usecols=['id', 'title'],
)

In [63]:
# Discard rows whose `id` is not a number instead of dropping hard-coded index
# labels: positional labels silently point at *different* rows once `movies`
# has been re-indexed (e.g. after the merge with `links`), so re-running the
# cell would delete valid movies.
# NOTE(review): presumably the three rows previously dropped by label
# (19730, 29503, 35587) are exactly the ones with a non-numeric id — confirm.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
)
# Keep a single row per title, then store the id as a compact integer.
movies = movies.drop_duplicates(subset="title")
movies = movies.astype({'id': 'int32'})

In [64]:
# Mapping between the ratings' movieId and the metadata's TMDB id.
links = pd.read_csv(
    "../data/links.csv",
    usecols=['movieId', 'tmdbId'],
)

In [65]:
links

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0
...,...,...
45838,176269,439050.0
45839,176271,111109.0
45840,176273,67758.0
45841,176275,227506.0


In [66]:
# Attach the ratings' movieId to every metadata row by matching the metadata
# id against the TMDB id in `links` (default inner join: unmatched rows go).
movies = movies.merge(links, left_on='id', right_on='tmdbId')

In [186]:
movies

Unnamed: 0,id,title,movieId,tmdbId
0,862,Toy Story,1,862.0
1,8844,Jumanji,2,8844.0
2,15602,Grumpier Old Men,3,15602.0
3,31357,Waiting to Exhale,4,31357.0
4,11862,Father of the Bride Part II,5,11862.0
...,...,...,...,...
42301,222848,Caged Heat 3000,176263,222848.0
42302,439050,Subdue,176269,439050.0
42303,111109,Century of Birthing,176271,111109.0
42304,227506,Satan Triumphant,176275,227506.0


## Collaborative Filtering using Matrix Decomposition

In [126]:
# Fit on every available rating; no hold-out split is kept in this cell.
trainset = data.build_full_trainset()
# Fix the seed so the model's random initialisation, and therefore every
# recommendation shown below, is reproducible on Restart & Run All.
algo = SVD(random_state=42)
algo.fit(trainset)

# Score every (user, item) pair that has no rating in the trainset.
# NOTE(review): `predictions` is not read by any later cell visible here and
# scoring the full anti-testset is expensive — confirm it is still needed.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

## Top-n movie recommendations for a given user ID
 

In [140]:
def make_predictions_for_user(userId, n_recommendations, model, movies):
    """Return the `n_recommendations` movies with the highest predicted rating.

    Parameters
    ----------
    userId : raw user id passed straight to `model.predict`.
    n_recommendations : int, maximum number of rows to return.
    model : object exposing `predict(user_id, item_id)` whose result has an
        `.est` attribute (the predicted rating).
    movies : DataFrame with at least the columns `movieId` and `title`.

    Returns
    -------
    DataFrame with columns `movieId`, `prediction`, `title`, sorted by
    `prediction` in descending order and indexed 0..k-1.

    NOTE(review): every row of `movies` is scored, including movies the user
    has already rated — confirm whether those should be excluded.
    """
    # One candidate per movieId: a duplicated id would be scored twice and could
    # occupy several of the top-n slots (the old merge even multiplied them).
    candidates = movies.drop_duplicates(subset='movieId')[['movieId', 'title']]

    # Score each candidate once; a plain comprehension over the id column avoids
    # the per-row Series construction that `iterrows` incurs.
    scored = candidates.assign(
        prediction=[
            model.predict(userId, movie_id).est
            for movie_id in candidates['movieId']
        ]
    )

    top = scored.sort_values(by='prediction', ascending=False).head(n_recommendations)
    return top[['movieId', 'prediction', 'title']].reset_index(drop=True)

In [139]:
preds = make_predictions_for_user(1, 10, algo, movies)
preds

Unnamed: 0,movieId,prediction,id,title,tmdbId
0,318,3.816499,278,The Shawshank Redemption,278.0
1,1945,3.728911,654,On the Waterfront,654.0
2,858,3.698391,238,The Godfather,238.0
3,899,3.69087,872,Singin' in the Rain,872.0
4,911,3.682084,4808,Charade,4808.0
5,3035,3.669337,37853,Mister Roberts,37853.0
6,898,3.635809,981,The Philadelphia Story,981.0
7,58559,3.61295,155,The Dark Knight,155.0
8,50,3.604931,629,The Usual Suspects,629.0
9,527,3.599509,424,Schindler's List,424.0


## Top-n most similar movies for a given movie ID

In [141]:
from sklearn.metrics import pairwise

In [185]:
def make_predictions_for_item(itemId, n_recommendations, model, movies):
    """Return the movies whose latent factors are most similar to `itemId`.

    Similarity is the cosine similarity between rows of `model.qi` (the item
    factor matrix of a fitted Surprise SVD model).

    Parameters
    ----------
    itemId : raw movie id as used in the ratings the model was trained on.
    n_recommendations : int, number of nearest items taken before the titles
        are joined in.
    model : fitted model exposing `qi` and `trainset`.
    movies : DataFrame with at least the columns `movieId` and `title`.

    Returns
    -------
    DataFrame with columns `movieId`, `prediction` (the cosine similarity) and
    `title`, most similar first. The query movie itself is part of the ranking
    (similarity 1.0), and items without a row in `movies` are dropped by the
    inner join, so fewer than `n_recommendations` rows may come back.

    Raises
    ------
    Whatever `trainset.to_inner_iid` raises when `itemId` was not part of the
    training data (a ValueError according to the Surprise documentation).
    """
    # Use the trainset the model was actually fitted on rather than whatever
    # global `trainset` happens to exist in the kernel, so inner ids always
    # line up with the rows of `model.qi`.
    fitted_trainset = model.trainset
    inner_item_id = fitted_trainset.to_inner_iid(itemId)

    item_factors = model.qi
    # Only the query item's row of the similarity matrix is needed; computing
    # the full item-by-item matrix on every call is quadratic work for nothing.
    similarities = pairwise.cosine_similarity(
        item_factors[inner_item_id:inner_item_id + 1], item_factors
    )[0]

    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    nearest = [
        (fitted_trainset.to_raw_iid(inner_id), score)
        for inner_id, score in ranked[:n_recommendations]
    ]

    nearest_df = pd.DataFrame(nearest, columns=['movieId', 'prediction'])
    with_titles = pd.merge(nearest_df, movies, on='movieId')
    return with_titles[['movieId', 'prediction', 'title']]


In [184]:
make_predictions_for_item(31, 20, algo, movies)

Unnamed: 0,movieId,prediction,title
0,31,1.0,Dangerous Minds
1,5641,0.359926,The Moderns
2,2351,0.326254,Nights of Cabiria
3,25865,0.319357,The Letter
4,4223,0.318812,Enemy at the Gates
5,4109,0.315856,Flowers in the Attic
6,140739,0.31567,Eighteen
7,6997,0.314326,Hoffa
8,7835,0.312618,Song of the Thin Man
9,54995,0.30646,Planet Terror


In [172]:
movies[movies['title'] == 'Toy Story']

Unnamed: 0,id,title,movieId,tmdbId
0,862,Toy Story,1,862.0


In [176]:
# NOTE(review): 8844 is Jumanji's `id`/`tmdbId` in the `movies` table shown
# earlier, where its `movieId` is 2. `ratings.movieId` presumably never equals
# 8844, which would explain the empty result below — confirm the intended id.
ratings[ratings['movieId'] == 8844]

Unnamed: 0,userId,movieId,rating
