In [2]:
# Importing the required libraries

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

#### Simple Recommender system 
##### Based on movie ratings and other basic criteria.

In [40]:
meta = pd.read_csv('Data/movies_metadata.csv')
meta['genres'] = meta['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
vote_counts = meta[meta['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = meta[meta['vote_average'].notnull()]['vote_average'].astype('int')
avg_rating = round(vote_averages.mean(),3)
min_votes = vote_counts.quantile(0.95)
print("Average rating (out of 10) required by the movie to qualify is:",avg_rating)
print("Minimum number of votes required by the movie to qualify is:",min_votes)

Average rating (out of 10) required by the movie to qualify is: 5.245
Minimum number of votes required by the movie to qualify is: 434.0


In [41]:
meta['year'] = pd.to_datetime(meta['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [18]:
# Creating a new df called qualified_movies with the following criteria:
# vote_count >= min_votes and is not null
# vote_average >= avg_rating and is not null

qualified_movies = meta[(meta['vote_count'] >= min_votes) & (meta['vote_count'].notnull()) & (meta['vote_average'].notnull())& (meta['vote_average']>= avg_rating)][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
qualified_movies.shape

(2181, 6)

##### Creating a new attribute called 'weighted_rating' to show top movies. 
$\\$
Weighted_Rating = $(\frac{votes}{votes + min\_votes} . rating) + (\frac{min\_votes}{votes + min\_votes} . avg\_rating)$
$\\$
$\\$
where,
* *votes* is the number of votes for the movie
* *min_votes* is the minimum votes required to be listed in the chart
* *rating* is the average rating of the movie
* *avg_rating* is the mean vote across the whole report

In [19]:
def weighted_rating(df):
    votes = df['vote_count']
    rating = df['vote_average']
    return (votes/(votes+min_votes)*rating) + (min_votes/(min_votes+votes)*avg_rating)


In [20]:
qualified_movies['Weighted_Rating'] = qualified_movies.apply(weighted_rating,axis = 1)
qualified_movies = qualified_movies.sort_values('Weighted_Rating', ascending= False)

In [24]:
qualified_movies.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,Weighted_Rating
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917591
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905875
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897111
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881757
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871792
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.868665
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864005
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861932
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860661
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851929


In [31]:
genres = meta.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
genres.name = 'genre'
gen_meta = meta.drop('genres', axis=1).join(s)

In [36]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year'],
      dtype='object')

In [45]:
def top_movies(data,genre,percentile = 0.80):
    genres = meta.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
    genres.name = 'genre'
    gen_meta = meta.drop('genres', axis=1).join(genres)

    df = gen_meta[gen_meta['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    avg_rating = round(vote_averages.mean(),3)
    min_votes = vote_counts.quantile(percentile)

    qualified_movies = df[(df['vote_count'] >= min_votes) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())& (df['vote_average']>= avg_rating)][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genre']]
    qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
    qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
    
    qualified_movies['Weighted_Rating'] = qualified_movies.apply(weighted_rating,axis = 1)
    qualified_movies = qualified_movies.sort_values('Weighted_Rating', ascending= False)

    return qualified_movies
    

In [46]:
top_movies(meta,'Romance')

Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,Weighted_Rating
351,Forrest Gump,1994,8147,8,48.307194,Romance,7.860661
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,Romance,7.511717
876,Vertigo,1958,1162,8,18.20822,Romance,7.250833
40251,Your Name.,2016,1030,8,34.461252,Romance,7.183286
883,Some Like It Hot,1959,835,8,11.845107,Romance,7.057786
...,...,...,...,...,...,...,...
21698,Don Jon,2013,1708,5,10.948605,Romance,5.049641
15408,The Twilight Saga: Eclipse,2010,2382,5,34.047399,Romance,5.037759
14425,The Twilight Saga: New Moon,2009,2518,5,30.121292,Romance,5.036020
18150,The Twilight Saga: Breaking Dawn - Part 1,2011,2622,5,25.9725,Romance,5.034794
