In [1]:
import datetime
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three MovieLens tables used by this notebook: tag genome scores,
# movie metadata and user ratings (read in that order).
genome_scores_data, movies_data, ratings_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './ml-20m/genome-scores.csv',
        './ml-20m/movies.csv',
        './ml-20m/ratings.csv',
    )
)

In [3]:
# Preview the first five rows of the tag genome scores.
genome_scores_data.head(n=5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [4]:
# Preview the first five rows of the movie metadata.
movies_data.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rows of the user ratings.
ratings_data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
# Prepare the inputs for the movie similarity matrix: three feature frames
# are built, and a cosine similarity is computed on each of them later.

# First frame (mov_tag_df): start from a wide table with one row per movie
# and one column per tag, holding the tag relevance.
scores_pivot = (
    genome_scores_data
    .pivot_table(values="relevance", index=["movieId"], columns=["tagId"])
    .reset_index()
)
scores_pivot.head()

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
1,2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
2,3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
3,4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
4,5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [7]:
# Attach the tag relevances to every movie (left join keeps all movies),
# put 0 where a movie has no value, then discard the unnecessary text columns.
mov_tag_df = (
    movies_data
    .merge(scores_pivot, on='movieId', how='left')
    .fillna(0)
    .drop(columns=['title', 'genres'])
)
mov_tag_df.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
1,2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
2,3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
3,4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
4,5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [8]:
#create mov_genres_df

def set_genres(genres, col):
    """Tell whether a genre is listed in a pipe-separated genre string.

    Parameters
    ----------
    genres : str
        The single genre name to look for, e.g. ``"Comedy"``.
    col : str
        The pipe-separated genres of one movie, e.g. ``"Comedy|Drama"``.

    Returns
    -------
    int
        1 when ``genres`` is exactly one of the listed entries, otherwise 0.
    """
    listed = col.split('|')
    # Whole-entry membership, so "Sci" does not match "Sci-Fi".
    return int(genres in listed)

In [9]:
# Work on an independent copy so that adding the genre columns (and dropping
# columns from this frame later) never mutates `movies_data` itself; this
# keeps the cell safe to re-run.
mov_genres_df = movies_data.copy()

# Genres that get an indicator column, in the column order of the result.
GENRE_NAMES = [
    "Action", "Adventure", "Animation", "Children", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    "(no genres listed)",
]

#create genres columns
# One 0/1 column per genre: 1 when the movie's `genres` string lists it.
for genre_name in GENRE_NAMES:
    mov_genres_df[genre_name] = mov_genres_df['genres'].apply(
        lambda listed, wanted=genre_name: set_genres(wanted, listed)
    )

In [10]:
# Drop the unnecessary text columns in place; only movieId and the 0/1 genre
# indicator columns remain.
mov_genres_df.drop(columns=['title', 'genres'], inplace=True)
mov_genres_df.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Read movies.csv again so that `movies_data` is a fresh frame containing the
# `title` and `genres` columns, independent of any column changes made by
# earlier cells.
movies_data = pd.read_csv("./ml-20m/movies.csv")
#create mov_rating_df: start by extracting the release year from each title
def set_year(title, default=1800):
    """Extract the release year written at the end of a movie title.

    The year is taken from the four digits that directly precede a closing
    parenthesis at the end of the title (surrounding whitespace is ignored),
    e.g. ``"Toy Story (1995)"`` -> 1995 and ``"Show (1975-1979)"`` -> 1979.

    Parameters
    ----------
    title : str
        Movie title. Any non-string value (such as a missing value read from
        a CSV) is treated as having no year.
    default : int, optional
        Value returned when no year can be found. Defaults to 1800.

    Returns
    -------
    int
        The parsed year, or ``default`` when the title carries no year.
    """
    if not isinstance(title, str):
        return default
    # Only plain ASCII digits are accepted so that int() can never fail, and
    # the closing parenthesis is required so that a title which merely ends
    # in digits is not mistaken for a year.
    found = re.search(r'([0-9]{4})\)\s*$', title)
    return int(found.group(1)) if found else default
# Derive the release year of each movie from its title.
movies_data['year'] = movies_data['title'].map(set_year)
# `movies` keeps movieId, title and year; the genres column is not needed here.
movies = movies_data.drop(columns='genres')
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995


In [12]:
def set_year_group(year):
    """Map a release year to the id of the era it belongs to.

    Groups: 0 = before 1900, 1 = 1900-1975, 2 = 1976-1995, 3 = 1996-2003,
    4 = 2004-2009, 5 = 2010 and later. A value that falls in none of these
    ranges is put in group 0.
    """
    if year < 1900:
        return 0
    # (first year, last year, group id) for the bounded eras.
    eras = (
        (1900, 1975, 1),
        (1976, 1995, 2),
        (1996, 2003, 3),
        (2004, 2009, 4),
    )
    for first_year, last_year, group_id in eras:
        if first_year <= year <= last_year:
            return group_id
    return 5 if year >= 2010 else 0
# Replace the raw year by its era id.
movies['year_group'] = movies['year'].map(set_year_group)
# title and year are no longer needed once the group is known.
movies = movies.drop(columns=['title', 'year'])

In [13]:
# Per-movie rating statistics: how many ratings a movie received and their
# mean. The aggregations are named by string alias ('size', 'mean') rather
# than by passing numpy functions, which recent pandas versions deprecate.
agg_movies_rat = (
    ratings_data
    .groupby('movieId')['rating']
    .agg(['size', 'mean'])
    .reset_index()
)
agg_movies_rat.columns = ['movieId', 'rating_counts', 'rating_mean']
agg_movies_rat.head()

Unnamed: 0,movieId,rating_counts,rating_mean
0,1,49695.0,3.92124
1,2,22243.0,3.211977
2,3,12735.0,3.15104
3,4,2756.0,2.861393
4,5,12161.0,3.064592


In [None]:
#calculate cosine similarities
# Relative weight of each signal in the blended similarity (they sum to 1).
TAG_WEIGHT = 0.5
GENRE_WEIGHT = 0.25
RATING_WEIGHT = 0.25

# Build the rating feature frame here from the per-movie year group and the
# rating aggregates, so this cell does not depend on a frame that is not
# created anywhere above. Movies without any rating get 0 in both rating
# columns.
# NOTE(review): `rating_counts` is on a far larger scale than `year_group`
# and `rating_mean`, so it will dominate this cosine term -- consider
# bucketing or scaling it before relying on the rating similarity.
mov_rating_df = movies.merge(agg_movies_rat, on='movieId', how='left').fillna(0)

# The three matrices are added element-wise, so every feature frame must list
# the same movies in the same row order.
movie_ids = mov_tag_df['movieId'].values
assert np.array_equal(movie_ids, mov_genres_df['movieId'].values)
assert np.array_equal(movie_ids, mov_rating_df['movieId'].values)

print("----0--------")
#cosine similarity for mov_tag_df
# `movieId` is an identifier, not a feature; it is dropped from every feature
# matrix so that its magnitude cannot distort the similarities.
cos = cosine_similarity(mov_tag_df.drop(columns='movieId').values)
cos *= TAG_WEIGHT
print("----1--------")
#cosine similarity for mov_genres_df
# Each further term is scaled and added in place and then released, which
# keeps at most two n x n matrices alive at any time.
cos_part = cosine_similarity(mov_genres_df.drop(columns='movieId').values)
cos_part *= GENRE_WEIGHT
cos += cos_part
del cos_part
print("----2--------")
#cosine similarity for mov_rating_df
cos_part = cosine_similarity(mov_rating_df.drop(columns='movieId').values)
cos_part *= RATING_WEIGHT
cos += cos_part
del cos_part
print("----3--------")
#mix: `cos` already holds the weighted sum of the three similarities
print("----4--------")
# Label rows and columns with the real movie ids (not row positions) and name
# the row index, so that reset_index() produces a `movieId` column.
movie_index = pd.Index(movie_ids, name='movieId')
movies_sim = pd.DataFrame(cos, columns=movie_ids, index=movie_index)
print("----5--------")
movies_sim.head()

def get_similar(movieId, top_n=5):
    """Return the movies most similar to ``movieId`` according to ``movies_sim``.

    Parameters
    ----------
    movieId
        Row label of the movie in the global ``movies_sim`` matrix.
    top_n : int, optional
        Number of neighbours to return. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``sim_moveId`` and ``relevance`` (the column
        spelling is kept as is for existing consumers), ordered from most to
        least similar. The frame is empty when ``movieId`` is not a row of
        ``movies_sim``.
    """
    result_columns = ['movieId', 'sim_moveId', 'relevance']
    if movieId not in movies_sim.index:
        return pd.DataFrame(columns=result_columns)

    scores = movies_sim.loc[movieId]
    # Remove the movie itself by label instead of assuming that it sorts
    # first: with tied scores, or a self-similarity below the maximum, the
    # first position can be held by another movie.
    scores = scores[scores.index != movieId]
    best = scores.nlargest(top_n)

    return pd.DataFrame(
        {
            'movieId': [movieId] * len(best),
            'sim_moveId': best.index,
            'relevance': best.values,
        },
        columns=result_columns,
    )
#collect the nearest neighbours of every movie into one long table
print("----6--------")
# Gather the per-movie frames first and concatenate once: growing a DataFrame
# inside the loop copies it on every iteration, and DataFrame.append no
# longer exists in current pandas.
neighbour_frames = [get_similar(movie_id) for movie_id in movies_sim.index.tolist()]
if neighbour_frames:
    movies_similarity = pd.concat(neighbour_frames, ignore_index=True)
else:
    # No movies at all: keep the expected (empty) table layout.
    movies_similarity = pd.DataFrame(columns=['movieId','sim_moveId','relevance'])
movies_similarity.head()