In [32]:
import pandas as pd

In [33]:
movies_df = pd.read_csv('movies.dat', sep = '::',engine="python", names = ['MovieID','Title','Genre'],encoding='latin-1')
ratings_df = pd.read_csv('ratings.dat', sep = '::',engine="python", names = ['UserID','MovieID','Rating','Timestamp'],encoding='latin-1')
movies_df.head()

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [35]:
movies_df['Year'] = movies_df.Title.str.extract('(\(\d\d\d\d\))',expand=False)
movies_df['Year'] = movies_df.Year.str.extract('(\d\d\d\d)',expand=False)
movies_df['Title'] = movies_df.Title.str.replace('(\(\d\d\d\d\))', '')
movies_df['Title'] = movies_df['Title'].apply(lambda x: x.strip())

# Split the values in the Genre column
movies_df['Genre'] = movies_df.Genre.str.split('|')

movies_df.head()

Unnamed: 0,MovieID,Title,Genre,Year
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",1995
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",1995
4,5,Father of the Bride Part II (1995),[Comedy],1995


In [36]:
ratings_df.drop('Timestamp',axis=1,inplace=True)
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [37]:
ratings_df.shape

(1000209, 3)

In [38]:
movies_df.shape

(3883, 4)

In [39]:
ratings_df.MovieID.unique().shape

(3706,)

In [40]:
from collections import Counter
users_per_movie = Counter(ratings_df['MovieID'])
users_per_movie[2858]

3428

In [41]:
movies_df = movies_df.merge( ratings_df.groupby('MovieID')['Rating'].mean(), on="MovieID",how="left")
movies_df['Watchers'] = movies_df['MovieID'].apply(lambda x: users_per_movie[x])
movies_df.head(10)

Unnamed: 0,MovieID,Title,Genre,Year,Rating,Watchers
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",1995,4.146846,2077
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",1995,3.201141,701
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995,3.016736,478
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",1995,2.729412,170
4,5,Father of the Bride Part II (1995),[Comedy],1995,3.006757,296
5,6,Heat (1995),"[Action, Crime, Thriller]",1995,3.878723,940
6,7,Sabrina (1995),"[Comedy, Romance]",1995,3.41048,458
7,8,Tom and Huck (1995),"[Adventure, Children's]",1995,3.014706,68
8,9,Sudden Death (1995),[Action],1995,2.656863,102
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]",1995,3.540541,888


In [48]:
m= movies_df['Watchers'].quantile(0.9)
m

706.6000000000004

In [49]:
C= movies_df['Rating'].mean()
C

3.238892177910891

In [50]:
def weighted_rating(x, m = m, C = C):
    v = x['Watchers']
    R = x['Rating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [53]:
movies_df['Score'] = movies_df.apply(weighted_rating,axis=1)

In [58]:
print("Trending movies according to users score:")
movies_df.sort_values( by ="Score",ascending=False)[:10][['Title','Score']]

Trending movies according to users score:


Unnamed: 0,Title,Score
315,"Shawshank Redemption, The (1994)",4.237661
257,Star Wars: Episode IV - A New Hope (1977),4.221549
847,"Godfather, The (1972)",4.214774
523,Schindler's List (1993),4.211985
1180,Raiders of the Lost Ark (1981),4.205925
49,"Usual Suspects, The (1995)",4.154322
2693,"Sixth Sense, The (1999)",4.145692
2789,American Beauty (1999),4.133072
589,"Silence of the Lambs, The (1991)",4.112404
1959,Saving Private Ryan (1998),4.106323


In [60]:
movie_pivot = ratings_df.pivot_table(columns="UserID",index="MovieID",values="Rating")
movie_pivot.fillna(0,inplace=True)
movie_pivot

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
from scipy.sparse import csr_matrix
movie_sparse=csr_matrix(movie_pivot)

In [62]:
from sklearn.neighbors import NearestNeighbors
model=NearestNeighbors( n_neighbors=7,algorithm='brute',metric='cosine')

In [63]:
model.fit(movie_sparse)

In [123]:
movies_df[movies_df['MovieID'] == 1][['Score','Title']].values[0]

array([3.916367729886419, 'Toy Story (1995)'], dtype=object)

In [146]:
def recommend(user_id,n=5):
    options = set()
    sorted_options=dict()
    if user_id not in ratings_df['UserID']:
        return 0
    movieIDs = ratings_df[ratings_df['UserID']==user_id]['MovieID'].values
    for movie_id in movieIDs:
        distances,suggestions=model.kneighbors(movie_pivot.iloc[movie_id,:].values.reshape(1,-1))
        
        options.update(list(suggestions[0]))
    
    recommends = [movies_df[movies_df['MovieID'] == option][['Score','Title']].values for option in options]
    return pd.DataFrame([recommend[0]  for recommend in recommends if recommend.shape[0]>0]).sort_values(by=0,ascending=False)[:n]
        

In [158]:
recommend(4)

Unnamed: 0,0,1
69,4.221549,Star Wars: Episode IV - A New Hope (1977)
44,4.205925,Raiders of the Lost Ark (1981)
136,4.106323,Saving Private Ryan (1998)
43,4.09149,Star Wars: Episode V - The Empire Strikes Back...
58,3.921986,"Terminator, The (1984)"
