In [2]:
import numpy as np
import pandas as pd
from ast import literal_eval
from difflib import get_close_matches
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
df = pd.read_csv('../data/movies_clean.csv')

# These columns are stored in the CSV as Python-literal strings; parse them
# back into Python objects.
for literal_column in ('genres', 'production_companies'):
    df[literal_column] = df[literal_column].apply(literal_eval)

# Replace missing keywords with an empty string.
df['keywords'] = df['keywords'].fillna('')

# Lookup helpers: the title column itself, and a title -> row index Series.
titles = df['title']
indices = pd.Series(df.index, index=titles)

In [4]:
# TF-IDF features built from the keywords column.
# min_df=1 (the library default) keeps every term that occurs in at least one
# document. The previous integer value 0 selected the same vocabulary but is
# outside the documented range and is rejected by recent scikit-learn versions.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
tfidf_matrix = tf.fit_transform(df['keywords'])
print(tfidf_matrix.shape)

# Raw term-count features built from the metadata column.
cnt = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
count_matrix = cnt.fit_transform(df['metadata'])
print(count_matrix.shape)

(424378, 297494)
(424378, 111153)


In [12]:
def get_title_index(title):
    """Resolve an exact movie title to its row index.

    Looks `title` up in the module-level `indices` Series (title -> index).

    Parameters
    ----------
    title : str
        Exact title to look up.

    Returns
    -------
    int or None
        The index when exactly one row has this title. ``None`` when the
        title is unknown (up to ten suggestions are printed) or ambiguous
        (the candidate rows are printed so the caller can pick an index).
    """
    matches = indices.values[indices.index == title]

    if len(matches) == 1:
        return matches[0]

    if len(matches) > 1:
        print('More than one title match, please specify the index:')
        print(df.iloc[matches][['title', 'overview', 'release_date']])
        return None

    # No exact match: suggest fuzzy matches plus case-insensitive substring
    # matches. Missing titles are dropped first because neither difflib nor a
    # boolean mask can handle them.
    known_titles = titles.dropna()
    fuzzy = get_close_matches(title, known_titles, n=5)
    # regex=False: the user's text is matched literally, so titles containing
    # characters such as "(" or "+" do not raise a regex error.
    contains = known_titles.str.contains(title, case=False, regex=False, na=False)
    substring = known_titles[contains].tolist()[:5]
    # dict.fromkeys de-duplicates while keeping a stable, readable order.
    suggests = list(dict.fromkeys(fuzzy + substring))
    print('Title not found, suggestion:')
    print(*suggests, sep="\n")
    return None

def get_recommendations(title, idx=None, k=10, keyword_weight=0.55,
                        metadata_weight=0.45, min_vote_average=4,
                        min_vote_count=700):
    """Recommend the `k` movies most similar to a given movie.

    Similarity is a weighted sum of two cosine similarities: one computed on
    the module-level `tfidf_matrix` and one on `count_matrix`. Only movies
    whose `vote_average` and `vote_count` exceed the given thresholds are
    returned, and the queried movie itself is never part of the result.

    Parameters
    ----------
    title : str
        Title of the query movie; used only when `idx` is None.
    idx : int, optional
        Positional row of the query movie. Pass it when the title is
        ambiguous.
    k : int, default 10
        Maximum number of recommendations to return.
    keyword_weight, metadata_weight : float
        Weights of the TF-IDF and count-based similarities.
    min_vote_average, min_vote_count : number
        Exclusive lower bounds a movie must exceed to be recommended.

    Returns
    -------
    pandas.DataFrame or None
        Columns title, release_date, vote_average, vote_count and similarity
        (a percentage string), ordered from most to least similar. ``None``
        when the title cannot be resolved to a single row.
    """
    if idx is None:
        idx = get_title_index(title)
        if idx is None:
            return None

    keyword_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]
    metadata_sim = cosine_similarity(count_matrix[idx], count_matrix)[0]
    similarity = keyword_weight * keyword_sim + metadata_weight * metadata_sim

    # Stable descending sort: ties keep their original row order.
    order = np.argsort(-similarity, kind='stable')
    # Drop the query movie explicitly. Skipping "the first row" is wrong
    # whenever the query fails the vote filter or is not ranked first.
    order = order[order != idx]

    votes_ok = (
        (df['vote_average'] > min_vote_average)
        & (df['vote_count'] > min_vote_count)
    ).to_numpy()
    top = order[votes_ok[order]][:max(k, 0)]

    # Build the similarity column on the result only, so the shared `df` is
    # not modified by calling this function.
    recommended_movies = df.iloc[top][
        ['title', 'release_date', 'vote_average', 'vote_count']
    ].copy()
    recommended_movies['similarity'] = (
        pd.Series(similarity[top] * 100).round(2).astype(str).add('%').to_numpy()
    )
    return recommended_movies

# Example query by title, using the default number of recommendations.
get_recommendations("Finding Nemo")
# If the title is ambiguous, pass the row index explicitly instead, e.g.:
# get_recommendations('Coco', idx=170232)

Unnamed: 0,title,release_date,vote_average,vote_count,similarity
74430,Finding Dory,2016-06-16,7.046,10915,55.68%
170453,Riley's First Date?,2015-11-03,7.157,708,46.31%
7272,Presto,2008-06-27,7.788,850,45.71%
7918,Geri's Game,1997-11-24,7.72,972,45.0%
138009,Lava,2014-10-10,7.3,939,45.0%
7919,For the Birds,2000-11-02,7.488,1017,41.0%
40228,Monsters University,2013-06-19,7.029,9402,40.9%
64937,The Good Dinosaur,2015-11-14,6.738,4822,40.89%
13916,Partly Cloudy,2009-05-28,7.828,893,40.79%
700,Toy Story 2,1999-10-30,7.583,12077,40.59%
