In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the full movie metadata table.
md = pd.read_csv('movies_metadata.csv')

# Keep only the non-null TMDB ids from the small links file, as an integer Series.
links_small = pd.read_csv('links_small.csv')
links_small = links_small.loc[links_small['tmdbId'].notnull(), 'tmdbId'].astype('int')

# Drop three rows by index label before casting `id` to int.
# NOTE(review): presumably these rows hold non-numeric ids that would break the
# cast below — confirm against the CSV.
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

# Restrict the metadata to movies whose id appears in the small links set.
# The explicit .copy() makes `smd` an independent frame, so cells that add or
# overwrite columns on it are not assigning into a slice of `md`.
smd = md[md['id'].isin(links_small)].copy()
smd.shape

(9099, 24)

In [3]:
def _genre_names(parsed):
    """Return the 'name' of every entry when `parsed` is a list, else an empty list."""
    if not isinstance(parsed, list):
        return []
    return [entry['name'] for entry in parsed]

# Missing cells become the literal string '[]', every cell is evaluated as a
# Python literal, and each resulting list is reduced to its genre names.
genres_parsed = smd['genres'].fillna('[]').apply(literal_eval)
smd['genres'] = genres_parsed.apply(_genre_names)


In [4]:
# Integer versions of the non-null vote columns.
# Note: astype('int') truncates each average to its integer part before the
# mean is taken.
vote_counts = smd['vote_count'].dropna().astype('int')
vote_averages = smd['vote_average'].dropna().astype('int')

# C: mean of the (truncated) vote averages across `smd`.
C = vote_averages.mean()
C

5.916804044400483

In [5]:
# m: the 95th-percentile vote count, used as the minimum-votes cutoff.
m = vote_counts.quantile(q=0.95)
m

2079.1000000000004

In [6]:
# Derive a year string from `release_date`. Unparseable or missing dates are
# coerced to NaT by to_datetime and stored as NaN.
# The missing-value test must be pd.notnull: a comparison such as
# `x != np.nan` is always True (NaN never compares equal to anything), which
# would let NaT through and store the string 'NaT' as the year.
release_dates = pd.to_datetime(smd['release_date'], errors='coerce')
smd['year'] = release_dates.apply(lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan)

In [7]:
# Movies with at least `m` votes and a known average, restricted to the chart
# columns. A single .loc selection plus .copy() yields an independent frame, so
# the integer casts below do not assign into a chained slice of `smd`.
chart_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
has_enough_votes = (
    (smd['vote_count'] >= m)
    & smd['vote_count'].notnull()
    & smd['vote_average'].notnull()
)
qualified = smd.loc[has_enough_votes, chart_columns].copy()

# Note: astype('int') truncates fractional vote averages.
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(455, 6)

In [8]:
def weighted_rating(x):
    """Blend a movie's own average with the mean rating `C`, weighted by vote count.

    `x` is a row-like mapping providing 'vote_count' (v) and 'vote_average' (R).
    The cutoff `m` and mean `C` are read from module-level variables.

    Returns v / (v + m) * R + m / (v + m) * C.
    """
    votes, rating = x['vote_count'], x['vote_average']
    total = votes + m
    own_share = votes / total * rating
    prior_share = m / total * C
    return own_share + prior_share
# Score every qualifying movie, order by weighted rating (best first) and keep
# at most the first 250 rows; then show the top 20.
qualified = (
    qualified
    .assign(wr=qualified.apply(weighted_rating, axis=1))
    .sort_values(by='wr', ascending=False)
    .iloc[:250]
)
qualified.head(20)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.731884
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.698136
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.673516
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.631612
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.60522
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.597066
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.585021
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.579706
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.576459
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.554411


In [9]:
# Build one row per (movie, genre).
# Series.explode turns each list in `genres` into one entry per element while
# repeating the movie's index label; entries produced by empty lists are NaN
# and are dropped, so `s` holds only real genre names.
s = smd['genres'].explode().dropna()
s.name = 'genre'

# Left-join on the movie index: a movie with several genres is repeated once
# per genre, and a movie with no genres keeps a single row with NaN `genre`.
gen_md = smd.drop('genres', axis=1).join(s)

In [10]:
def build_chart(genre, percentile=0.85, top_n=250):
    """Rank the movies of one genre by weighted rating.

    Reads the module-level `gen_md` frame (one row per movie/genre pair).

    Parameters
    ----------
    genre : value compared for equality against the `genre` column of `gen_md`.
    percentile : quantile of the genre's vote counts used as the minimum-votes
        cutoff `m`.
    top_n : maximum number of rows returned (default 250).

    Returns
    -------
    DataFrame with columns title, year, vote_count, vote_average, popularity
    and wr, sorted by `wr` descending. If no row matches `genre`, an empty
    frame with those columns is returned instead of raising.
    """
    df = gen_md[gen_md['genre'] == genre]

    # Genre-level statistics. Note: astype('int') truncates fractional averages.
    vote_counts = df.loc[df['vote_count'].notnull(), 'vote_count'].astype('int')
    vote_averages = df.loc[df['vote_average'].notnull(), 'vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # Independent copy so the casts and the new column do not write into a
    # slice of `gen_md`.
    keep = (df['vote_count'] >= m) & df['vote_count'].notnull() & df['vote_average'].notnull()
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Column-wise arithmetic: same formula as a per-row computation, but it
    # also works when `qualified` is empty (a row-wise apply on an empty frame
    # returns a DataFrame, which cannot be assigned to a single column).
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(top_n)

In [18]:
# Rows at positions 1-11 of the Romance chart (the top-ranked row is skipped).
build_chart('Romance').iloc[1:12]

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
876,Vertigo,1958,1162,8,18.20822,7.399188
883,Some Like It Hot,1959,835,8,11.845107,7.247372
1132,Cinema Paradiso,1988,834,8,14.177005,7.24679
19901,Paperman,2012,734,8,7.198633,7.183659
37863,Sing Street,2016,669,8,10.672862,7.136622
882,The Apartment,1960,498,8,11.994281,6.982367
1639,Titanic,1997,7770,7,26.88907,6.937432
19731,Silver Linings Playbook,2012,4840,7,14.488111,6.902837
23437,Maleficent,2014,4607,7,19.467404,6.898368
22168,Her,2013,4215,7,13.829515,6.889845


In [19]:
# Rows at positions 2-14 of the Comedy chart (the two top-ranked rows are skipped).
build_chart('Comedy').iloc[2:15]

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
18465,The Intouchables,2011,5410,8,16.086919,7.774469
22841,The Grand Budapest Hotel,2014,4644,8,14.442048,7.741483
2211,Life Is Beautiful,1997,3643,8,39.39497,7.680398
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964,1472,8,9.80398,7.344438
3342,Modern Times,1936,881,8,8.159556,7.081644
883,Some Like It Hot,1959,835,8,11.845107,7.052067
1236,The Great Dictator,1940,756,8,9.241748,6.996567
26564,Deadpool,2016,11444,7,187.860492,6.93681
13724,Up,2009,7048,7,19.330884,6.900458
22131,The Wolf of Wall Street,2013,6768,7,16.382422,6.896672


In [21]:
# First 15 rows of the Horror chart.
build_chart('Horror').iloc[:15]

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1213,The Shining,1980,3890,8,19.611589,7.64169
1176,Psycho,1960,2405,8,36.826309,7.467736
1171,Alien,1979,4564,7,23.37742,6.813276
14236,Zombieland,2009,3655,7,11.063029,6.773884
1158,Aliens,1986,3282,7,21.761179,6.752454
21276,The Conjuring,2013,3169,7,14.90169,6.745137
1338,Jaws,1975,2628,7,19.726114,6.703123
8147,Shaun of the Dead,2004,2479,7,14.902948,6.689003
8230,Saw,2004,2255,7,23.508433,6.665054
1888,The Exorcist,1973,2046,7,12.137595,6.639125


In [25]:
# First 15 rows of the Science Fiction chart.
build_chart('Science Fiction').iloc[:15]

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.108149,7.692947
22879,Interstellar,2014,11187,8,32.213481,7.626296
256,Star Wars,1977,6778,8,42.149697,7.441071
1225,Back to the Future,1985,6239,8,25.778509,7.40502
1154,The Empire Strikes Back,1980,5998,8,19.470959,7.387351
1163,A Clockwork Orange,1971,3432,8,17.112594,7.104073
14551,Avatar,2009,12114,7,185.070892,6.799427
17818,The Avengers,2012,12000,7,89.887648,6.797808
23753,Guardians of the Galaxy,2014,10014,7,53.291601,6.764708
26553,Mad Max: Fury Road,2015,9629,7,29.36178,6.756996


In [28]:
# Rows at positions 2-14 of the Thriller chart (the two top-ranked rows are skipped).
build_chart('Thriller').iloc[2:15]

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
292,Pulp Fiction,1994,8670,8,140.950236,7.761463
46,Se7en,1995,5915,8,18.45743,7.667351
24860,The Imitation Game,2014,5895,8,31.59594,7.666395
586,The Silence of the Lambs,1991,4549,8,4.307222,7.586451
11354,The Prestige,2006,4510,8,16.94556,7.583559
289,Leon: The Professional,1994,4293,8,20.477329,7.566702
4099,Memento,2000,4168,8,15.450789,7.556358
1213,The Shining,1980,3890,8,19.611589,7.531481
1057,Reservoir Dogs,1992,3821,8,12.22034,7.524869
49,The Usual Suspects,1995,3334,8,16.302466,7.472302


In [16]:
from IPython.display import HTML
import base64  
import pandas as pd  

def create_download_link( smd, title = "Download CSV file", filename = "data piopularity.csv"):
    """Return an IPython HTML anchor that downloads `smd` as a CSV file.

    The frame is serialised with `to_csv(index=False)`, base64-encoded and
    embedded in a `data:text/csv` URI, so no file is written.

    smd      -- object exposing `to_csv` (a DataFrame in this notebook)
    title    -- link text
    filename -- value of the anchor's `download` attribute
    """
    encoded = base64.b64encode(smd.to_csv(index=False).encode()).decode()
    anchor = (
        '<a download="{filename}" '
        'href="data:text/csv;base64,{payload}" '
        'target="_blank">{title}</a>'
    ).format(filename=filename, payload=encoded, title=title)
    return HTML(anchor)

# Show an HTML link that downloads `smd` as a CSV file (default title and filename).
create_download_link(smd)