In [1]:
import numpy as np 
import pandas as pd 
  
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

pd.options.display.max_columns=None

import warnings; warnings.simplefilter('ignore')

In [2]:
df_meta = pd. read_csv('cleaned_movie_dataset.csv')

In [3]:
df_meta.isnull().sum()
df_meta.shape

(45463, 16)

In [4]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [5]:
keywords['id'] = keywords['id'].astype('int64')
credits['id'] = credits['id'].astype('int64')
df_meta = df_meta.merge(credits, on='id')
df_meta = df_meta.merge(keywords, on='id')

In [6]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [7]:
smd = df_meta[df_meta['id'].isin(links_small)]
smd.shape

(9219, 19)

In [8]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [9]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [10]:
smd['director'] = smd['crew'].apply(get_director)

In [11]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:4] if len(x) >=4 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [12]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s[:10]
s = s[s > 1]

In [13]:
stemmer = SnowballStemmer('english')
stemmer.stem('sportingly')

'sport'

In [14]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [15]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [16]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['genres'] = smd['genres'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [17]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [18]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [19]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [20]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [21]:
vote_averages = df_meta[df_meta['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
vote_counts = df_meta[df_meta['vote_count'].notnull()]['vote_count'].astype('int')
m = vote_counts.quantile(0.96)

In [22]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [23]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'release_date']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average']
    C = vote_averages.mean()
    m = vote_counts.quantile(0.50)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())& (movies['vote_average']>=6.0)]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average']
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    #qualified = qualified.sort_values('wr', ascending=False).head(10)
    qualified = qualified.head(10)
    return qualified

In [24]:
get_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,release_date,wr
8031,The Dark Knight Rises,9263,7.6,2012-07-16,7.464025
6218,Batman Begins,7511,7.5,2005-06-10,7.341538
8419,Man of Steel,6462,6.5,2013-06-12,6.398421
8872,Captain America: Civil War,7462,7.1,2016-04-27,6.968772
3049,X-Men,4172,6.8,2000-07-13,6.613487
4635,X2,3572,6.8,2003-04-24,6.586443
7583,Kick-Ass,4747,7.1,2010-03-22,6.901713
7600,Iron Man 2,6969,6.6,2010-04-28,6.497744
8869,Ant-Man,6029,7.0,2015-07-14,6.84884
8871,Deadpool,11444,7.4,2016-02-09,7.298143


In [25]:
df_ratings_s = pd.read_csv('ratings_small.csv')
reader = Reader()

In [26]:
data_svd = Dataset.load_from_df(df_ratings_s[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data_svd, measures=['RMSE', 'MAE'],cv=5)

{'test_rmse': array([0.889684  , 0.89491355, 0.89517899, 0.89260228, 0.90645778]),
 'test_mae': array([0.68601021, 0.69165664, 0.69196055, 0.68535028, 0.69483995]),
 'fit_time': (0.9471724033355713,
  1.0054097175598145,
  1.0185613632202148,
  1.0104196071624756,
  0.9558944702148438),
 'test_time': (0.2684812545776367,
  0.12298011779785156,
  0.12994790077209473,
  0.1409778594970703,
  0.12109971046447754)}

In [27]:
trainset = data_svd.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b6eaf4f160>

In [28]:
user_rating=pd.merge(df_ratings_s,df_meta,left_on='movieId',right_on='id',how='inner')
user_ratings_final=user_rating[['userId', 'movieId', 'rating','original_title']]
user_ratings=user_ratings_final.sort_values(by='userId')
user_ratings.head()

Unnamed: 0,userId,movieId,rating,original_title
0,1,1371,2.5,Rocky III
93,1,2105,4.0,American Pie
140,1,2193,2.0,My Tutor
47,1,1405,1.0,Greed
182,1,2294,2.0,Jay and Silent Bob Strike Back


In [29]:
#user_ratings[user_ratings['original_title'] == 'My Tutor']

In [30]:
movie1=df_meta['original_title']=='The Conjuring'
df_meta[movie1][['original_title','id']]

Unnamed: 0,original_title,id
21475,The Conjuring,138843


In [31]:
svd.predict(654, 2193,4)

Prediction(uid=654, iid=2193, r_ui=4, est=3.6923680853371823, details={'was_impossible': False})

In [32]:
svd.predict(654, 2193).est

3.6923680853371823

In [33]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [34]:
id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [35]:
indices_map = id_map.set_index('id')

In [36]:
def recommend_my_movie(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'release_date','id']]
    movies['est_rating'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies['est_rating'] = movies['est_rating'].apply(lambda x: x*2)
    movies['vote_count'] = movies['vote_count'].astype('int')
    movies=movies.drop(columns=['id'])
    movies = movies.sort_values('est_rating', ascending=False)
    return movies.head(10)

In [37]:
recommend_my_movie(27, "The Dark Knight")

Unnamed: 0,title,vote_count,vote_average,release_date,est_rating
7969,The Avengers,12000,7.4,2012-04-25,8.050004
6218,Batman Begins,7511,7.5,2005-06-10,7.921492
6623,The Prestige,4510,8.0,2006-10-19,7.913056
7009,Iron Man,8951,7.4,2008-04-30,7.885157
8031,The Dark Knight Rises,9263,7.6,2012-07-16,7.781399
2599,Batman: Mask of the Phantasm,218,7.4,1993-12-25,7.737676
7889,X-Men: First Class,5252,7.1,2011-05-24,7.686093
7583,Kick-Ass,4747,7.1,2010-03-22,7.596877
8869,Ant-Man,6029,7.0,2015-07-14,7.533808
8868,Avengers: Age of Ultron,6908,7.3,2015-04-22,7.467637
