### MLE8

Download dataset https://www.kaggle.com/rounakbanik/the-movies-dataset/home
(You have to sign in with Kaggle username)

Do:
1) Collaborative filtering recommendations that are based on ratings.csv. Use
    a) User based and
    b) Item based approach.

2) Content based recommendations that based on movies_metadata.csv

3) Hybrid recommendations (user based, item based and content based together) based on ratings.csv and movies_metadata.csv.

Try to train models based on the datasets (the ratings data is on raw format and at first you have to make users x movies sparse matrix) and test your trained models so, that you can fill empty cells on the sparse matrix (You can also make your own ratings for multiple movies and check how the model fills your unrated movies).


In [1]:
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
ratings = pd.read_csv("Downloads/the-movies-dataset/ratings_small.csv")

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
movies_metadata = pd.read_csv("Downloads/the-movies-dataset/movies_metadata.csv")
movies_metadata.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## 1. Collaborative filtering
### a) Item based

In [5]:
ratings_matrix = ratings.pivot_table(index=['movieId'],columns=['userId'],values='rating').reset_index(drop=True)
ratings_matrix.fillna(0,inplace = True)
ratings_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
cosine_sim = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric="cosine" )
pearson_sim = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric="correlation")

In [7]:
cosine_similarity_matrix = pd.DataFrame(cosine_sim)
cosine_similarity_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,1.0,0.394511,0.306516,0.133614,0.245102,0.377086,0.278629,0.063031,0.117499,0.310689,...,0.055829,0.031902,0.079755,0.079755,0.079755,0.079755,0.079755,0.0,0.0,0.055829
1,0.394511,1.0,0.217492,0.164651,0.278476,0.222003,0.207299,0.223524,0.113669,0.418124,...,0.0,0.055038,0.068797,0.082557,0.082557,0.137594,0.068797,0.0,0.0,0.0
2,0.306516,0.217492,1.0,0.177012,0.370732,0.247499,0.435648,0.127574,0.306717,0.191255,...,0.0,0.0,0.0,0.116226,0.116226,0.0,0.0,0.0,0.0,0.0
3,0.133614,0.164651,0.177012,1.0,0.179556,0.072518,0.184626,0.501513,0.25463,0.111447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.245102,0.278476,0.370732,0.179556,1.0,0.272645,0.388476,0.194113,0.367941,0.246846,...,0.0,0.176845,0.0,0.117897,0.117897,0.0,0.0,0.0,0.0,0.0


In [8]:
pearson_similarity_matrix = pd.DataFrame(pearson_sim)
pearson_similarity_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,1.0,0.223742,0.183266,0.071055,0.105076,0.201503,0.156075,0.019379,0.023699,0.089163,...,0.040978,0.011348,0.070607,0.070607,0.070607,0.070607,0.070607,-0.028157,-0.028157,0.040978
1,0.223742,1.0,0.12379,0.125014,0.193144,0.085889,0.117211,0.209299,0.05381,0.306685,...,-0.0162,0.043525,0.058457,0.073388,0.073388,0.133113,0.058457,-0.0162,-0.0162,-0.0162
2,0.183266,0.12379,1.0,0.147771,0.317911,0.158071,0.390331,0.109818,0.274638,0.086065,...,-0.011221,-0.011221,-0.011221,0.109898,0.109898,-0.011221,-0.011221,-0.011221,-0.011221,-0.011221
3,0.071055,0.125014,0.147771,1.0,0.150562,0.024466,0.156876,0.496859,0.238193,0.063511,...,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073,-0.005073
4,0.105076,0.193144,0.317911,0.150562,1.0,0.186936,0.339605,0.179371,0.339402,0.150292,...,-0.011165,0.173054,-0.011165,0.111648,0.111648,-0.011165,-0.011165,-0.011165,-0.011165,-0.011165


In [23]:
def get_movie_name(movieid):
    if movieid < len(movies_metadata):
        return movies_metadata.iloc[movieid].title
    else:
        return ""

def get_movie_name_id(movieid):
    return movies_metadata[movies_metadata['id']==str(movieid)].title

In [10]:
def recommend_movie_based_on(movie_name, df):
    movie_index=movies_metadata[movies_metadata['original_title']==movie_name].index.tolist()[0]
    recommended = df.iloc[movie_index].sort_values()[::-1]
    counter = 1
    for recommendation in recommended.index[1:6]:
        print(get_movie_name(recommendation) + " rank: " + str(recommended.iloc[counter]))
        counter = counter + 1

In [11]:
recommend_movie_based_on("Toy Story", cosine_similarity_matrix)
## :notsureif:

This Is My Father rank: 0.594709812032
French Kiss rank: 0.576187845977
To Live rank: 0.564533861453
Und keiner weint mir nach rank: 0.56294560026
The Ghost and the Darkness rank: 0.548023021399


In [12]:
recommend_movie_based_on("Toy Story", pearson_similarity_matrix)
## :notsureif:

This Is My Father rank: 0.474140729265
Friday the 13th Part 2 rank: 0.393799044241
The Ghost and the Darkness rank: 0.372371303118
Fantastic Voyage rank: 0.366277254361
Und keiner weint mir nach rank: 0.356876317648


In [13]:
recommend_movie_based_on("Die Hard", cosine_similarity_matrix)

Magic Hunter rank: 0.504234270368
Election rank: 0.469506687685
The Nutty Professor rank: 0.453238864223
Synthetic Pleasures rank: 0.434956483055
Someone Else's America rank: 0.429274602446


In [14]:
recommend_movie_based_on("Die Hard", pearson_similarity_matrix)

Magic Hunter rank: 0.496879503248
Election rank: 0.461952391612
The Nutty Professor rank: 0.441956161158
Synthetic Pleasures rank: 0.41403900206
Someone Else's America rank: 0.41215003099


### KNN

In [15]:
def nearest_rows(df, index, count):
    csr = csr_matrix(df.values)
    knn_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
    knn_model.fit(csr)
    matrix = df.loc[index].values.reshape(1, -1) 
    return knn_model.kneighbors(matrix, n_neighbors=count)

In [25]:
def recommend_movies_based_on_item(df, movie_name):
    movie_index=movies_metadata[movies_metadata['original_title']==movie_name].index.tolist()[0]
    distances, indices = nearest_rows(df, movie_index, 6)
    print(indices, distances)
    for i in range(1,6):
        print(get_movie_name(indices[0][i]) + " rating: " + str(distances[0][i]))
    

In [26]:
recommend_movies_based_on_item(ratings_matrix, "Toy Story")

[[   0 2506  232  321  644 1019]] [[  8.88178420e-16   4.05290188e-01   4.23812154e-01   4.35466139e-01
    4.37054400e-01   4.51976979e-01]]
This Is My Father rating: 0.405290187968
French Kiss rating: 0.423812154023
To Live rating: 0.435466138547
Und keiner weint mir nach rating: 0.43705439974
The Ghost and the Darkness rating: 0.451976978601


### b) user based

In [27]:
user_ratings_matrix = ratings.pivot_table(index=['userId'],columns=['movieId'],values='rating').reset_index(drop=True)
user_ratings_matrix.fillna(0,inplace = True)
user_ratings_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
user_cosine_sim = pd.DataFrame(1 - pairwise_distances(user_ratings_matrix.as_matrix(), metric="cosine"))
user_pearson_sim = pd.DataFrame(1 - pairwise_distances(user_ratings_matrix.as_matrix(), metric="correlation"))

In [29]:
def top_for_user(df, user_id, count):
    return df[df['userId'] == user_id].sort_values('rating', ascending = False)[0:count]

In [30]:
def recommend_movies_based_on_user(user_id, df):
    similar_users = df.iloc[user_id].sort_values()[::-1]
    top_movies = []
    for user in similar_users.index[1:6]:
        top2 = top_for_user(ratings, user, 2).movieId.values
        for movieId in top2:
            name = get_movie_name(movieId)
            if name not in top_movies and name != '':
                top_movies.append(get_movie_name(movieId))
    return top_movies[0:5]

In [31]:
recommend_movies_based_on_user(1, user_cosine_sim)

['Pinocchio',
 'Braindead',
 'The Legend of Rita',
 'Super Mario Bros.',
 'Quiz Show']

In [32]:
recommend_movies_based_on_user(1, user_pearson_sim)

['Pinocchio',
 'Braindead',
 'The Legend of Rita',
 'Super Mario Bros.',
 'Quiz Show']