# Hands On Recommendation Systems with Python
#### Chapter 3 : Basic Recommender

In [1]:
import pandas as pd
import numpy as np
import os

## Data

In [2]:
datapath = os.getcwd()
datafile = '/Data/movies/movies_metadata.csv'

df = pd.read_csv(datapath + datafile, low_memory=False)
df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [3]:
print('shape  :', df.shape)
print('columns:', df.columns.tolist())

shape  : (45466, 24)
columns: ['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']


## Example 1 : Simple Recommendation

### Pre-requisite 1

In [4]:
df['vote_count'].quantile([0.25,0.50,0.75,0.80,1.0])

0.25        3.0
0.50       10.0
0.75       34.0
0.80       50.0
1.00    14075.0
Name: vote_count, dtype: float64

In [5]:
# Calculate the number of votes garnered by the 80th percentile movie
m = df['vote_count'].quantile(0.75)
print( m )

34.0


### Pre-requisite 2

In [6]:
# Select movies longer than 45 minutes and shorter than 300 minutes
q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]

# Select movies that have garnered more than m votes
q_movies = q_movies[q_movies['vote_count'] > m]

q_movies.shape

(11033, 24)

In [7]:
# Calculate sample mean of vote_average
C = df['vote_average'].mean()
print( C ) 

5.618207215133889


### Weighted-Rating for Each Movie

In [8]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    
    # Compute the weighted score
    wr = (v/(v+m) * R) + (m/(v+m) * C)
    
    return wr

In [9]:
# Compute the score using the weighted_rating function defined above
q_movies['weighted_score'] = q_movies.apply(weighted_rating, axis=1)

In [10]:
#Sort movies in descending order of their scores
q_movies = q_movies.sort_values('weighted_score', ascending=False)

q_movies[['title', 'vote_count', 'vote_average', 'weighted_score', 'runtime']].head(25)

Unnamed: 0,title,vote_count,vote_average,weighted_score,runtime
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.929668,190.0
314,The Shawshank Redemption,8358.0,8.5,8.488324,142.0
834,The Godfather,6024.0,8.5,8.483826,175.0
40251,Your Name.,1030.0,8.5,8.407913,106.0
12481,The Dark Knight,12269.0,8.3,8.292589,152.0
2843,Fight Club,9678.0,8.3,8.290612,139.0
292,Pulp Fiction,8670.0,8.3,8.289524,154.0
522,Schindler's List,4436.0,8.3,8.279602,195.0
23673,Whiplash,4376.0,8.3,8.279324,105.0
5481,Spirited Away,3968.0,8.3,8.277216,125.0


## Example 2 : Knowledge-based Recommender System

In [11]:
df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]

df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [12]:
# Convert release_date into pandas datetime format
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Extract year from the datetime
df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [13]:
#Helper function to convert NaT to 0 and all other years to integers.
def convert_int(x):
    try:
        return int(x)
    except:
        return 0

In [14]:
# Apply convert_int to the year feature
df['year'] = df['year'].apply(convert_int)

# Drop release_date column
df = df.drop('release_date', axis=1)

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [15]:
#Print genres of the first movie
df.iloc[1]['genres']

"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [16]:
from ast import literal_eval

a = "[1,2,3]"
print(type(a))


b = literal_eval(a)
print(type(b))

<class 'str'>
<class 'list'>


In [17]:
#Convert all NaN into stringified empty lists
df['genres'] = df['genres'].fillna('[]')

#Apply literal_eval to convert stringified empty lists to the list object
df['genres'] = df['genres'].apply(literal_eval)

#Convert list of dictionaries to a list of strings
df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])

In [18]:
df.head(3)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,1995
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,1995


In [19]:
#Create a new feature by exploding genres
s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)

#Name the new feature as 'genre'
s.name = 'genre'

#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.
gen_df = df.drop('genres', axis=1).join(s)

#Print the head of the new gen_df
gen_df.head()

  s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,animation
0,Toy Story,81.0,7.7,5415.0,1995,comedy
0,Toy Story,81.0,7.7,5415.0,1995,family
1,Jumanji,104.0,6.9,2413.0,1995,adventure
1,Jumanji,104.0,6.9,2413.0,1995,fantasy


In [20]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    
    # Compute the weighted score
    wr = (v/(v+m) * R) + (m/(v+m) * C)
    
    return wr

In [21]:
def build_chart(gen_df, weighted_rating, percentile=0.8):
    #Ask for preferred genres
    print("Input preferred genre")
    genre = input()
    
    #Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())
    
    #Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())
    
    #Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    
    #Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    
    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
    movies = gen_df.copy()
    
    #Filter based on the condition
    movies = movies[(movies['genre'] == genre) & 
                    (movies['runtime'] >= low_time) & 
                    (movies['runtime'] <= high_time) & 
                    (movies['year'] >= low_year) & 
                    (movies['year'] <= high_year)]
    
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    
    #Calculate score using the IMDB formula
    q_movies['weighted_score'] = q_movies.apply(weighted_rating, axis=1)

    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('weighted_score', ascending=False)
    
    return q_movies

In [22]:
#Compute the values of C and m for the filtered movies
C = gen_df['vote_average'].mean()
m = gen_df['vote_count'].quantile(0.75)

#Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
q_movies = gen_df.copy().loc[gen_df['vote_count'] >= m]

#Calculate score using the IMDB formula
q_movies['weighted_score'] = q_movies.apply(weighted_rating, axis=1)

In [24]:
# Generate the chart for top animation movies and display top 5.
# build_chart(gen_df, weighted_rating).head()