In [63]:
import numpy as np
import pandas as pd
from ast import literal_eval
from difflib import get_close_matches
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [64]:
# Load the cleaned movie table.
df = pd.read_csv('data/movies_clean.csv')

# Normalise column types in one pass:
#   * genres / production_companies are parsed from their string form with literal_eval
#   * missing keywords become empty strings
df = df.assign(
    genres=df['genres'].apply(literal_eval),
    production_companies=df['production_companies'].apply(literal_eval),
    keywords=df['keywords'].fillna(''),
)

# Title lookups: the raw title column and a title -> row index mapping
titles = df['title']
indices = pd.Series(df.index, index=titles)

In [65]:
# Build the two sparse feature matrices used for similarity scoring.
# NOTE: min_df is given as 1 rather than 0. An integer min_df must be >= 1 in
# scikit-learn >= 1.2 (0 raises InvalidParameterError), and since every
# vocabulary term occurs in at least one document the resulting vocabulary is
# the same.

# TF-IDF weighted unigram features over the keywords column
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
tfidf_matrix = tf.fit_transform(df['keywords'])
print(tfidf_matrix.shape)

# Raw unigram counts over the metadata column
cnt = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
count_matrix = cnt.fit_transform(df['metadata'])
print(count_matrix.shape)

(424378, 297494)
(424378, 111153)


In [69]:
def get_title_index(title):
    """Resolve an exact movie title to its row index.

    Looks ``title`` up in the module-level ``indices`` series.

    Parameters
    ----------
    title : str
        Title to look up; must match exactly (case-sensitive).

    Returns
    -------
    int or None
        The matching index when exactly one row has this title. ``None`` when
        there is no match (up to 10 suggestions are printed) or when several
        rows share the title (the candidates are printed so the caller can
        pick an index).
    """
    idx_list = indices.values[indices.index == title]

    if len(idx_list) == 0:
        # Drop missing titles and coerce to str so neither difflib nor the
        # .str accessor trips over NaN / non-string entries.
        known_titles = titles.dropna().astype(str)

        suggests1 = get_close_matches(title, known_titles, n=5)
        # regex=False: the title is literal text, not a pattern, so characters
        # such as "(", "+", "?" or "*" must not be interpreted as regex syntax.
        contains_mask = known_titles.str.contains(title, case=False, regex=False, na=False)
        suggests2 = known_titles[contains_mask].head(5).tolist()

        # De-duplicate while keeping order (closest matches first); a set
        # would print the suggestions in arbitrary order.
        suggests = list(dict.fromkeys(suggests1 + suggests2))
        print('Title not found, suggestion:')
        print(*suggests, sep="\n")
        return None
    elif len(idx_list) > 1:
        print('More than one title match, please specify the index:')
        print(df.iloc[idx_list][['title', 'overview', 'release_date']])
        return None
    else:
        return idx_list[0]

def get_recommendations(title, idx=None, k=10):
    """Return up to ``k`` popular movies most similar to the given movie.

    The score is the unweighted mean of two cosine similarities, one computed
    on ``tfidf_matrix`` and one on ``count_matrix``, expressed as a percentage.
    Only rows with ``vote_average > 5`` and ``vote_count > 1000`` are eligible,
    and the query movie itself is always excluded.

    Unlike the earlier version, this function does not write a ``similarity``
    column into the global ``df``; the scores only appear in the returned frame.

    Parameters
    ----------
    title : str
        Movie title; only used when ``idx`` is None.
    idx : int, optional
        Row position of the query movie. Pass it explicitly when several
        movies share the same title.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title``, ``release_date``, ``vote_average``, ``vote_count``
        and ``similarity`` (string such as ``'48.4%'``), ordered from most to
        least similar. ``None`` when the title could not be resolved.
    """
    if idx is None:
        idx = get_title_index(title)
        if idx is None:
            return None

    keyword_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]
    metadata_sim = cosine_similarity(count_matrix[idx], count_matrix)[0]
    combined = 1 * keyword_sim + 1 * metadata_sim

    # Normalise so a negative positional idx still identifies the query row.
    query_pos = int(idx) % len(combined)

    # Popularity filter, evaluated once on the full table. The query row is
    # excluded explicitly instead of assuming it is the first ranked row:
    # it may fail the vote filter or tie with an earlier identical row.
    eligible = np.array((df['vote_average'] > 5) & (df['vote_count'] > 1000), dtype=bool)
    eligible[query_pos] = False

    # Stable descending sort: ties keep their original row order.
    order = np.argsort(-combined, kind='stable')
    top_positions = order[eligible[order]][:max(k, 0)]

    percent = pd.Series(combined[top_positions] / 2 * 100)
    labels = percent.round(2).astype(str) + '%'

    columns = ['title', 'release_date', 'vote_average', 'vote_count']
    return df.iloc[top_positions][columns].assign(similarity=labels.to_numpy())

# Example query; the returned table is rendered as this cell's output.
get_recommendations("WALL·E")
# If several movies share a title, pass the row index explicitly, e.g.:
# get_recommendations('Coco', idx=170232)

Unnamed: 0,title,release_date,vote_average,vote_count,similarity
327655,Lightyear,2022-06-15,7.201,2391,48.4%
191999,Piper,2016-06-16,8.179,1419,46.47%
40228,Monsters University,2013-06-19,7.029,9402,46.3%
5158,Toy Story 3,2010-06-16,7.794,12973,42.83%
6,Finding Nemo,2003-05-30,7.825,17027,41.14%
593,Fantasia,1940-11-13,7.393,2599,40.82%
7743,Home on the Range,2004-04-02,6.026,1720,40.82%
244873,Luca,2021-06-17,7.909,6865,39.1%
755,Cars,2006-06-08,6.896,12086,38.58%
1400,Ratatouille,2007-06-28,7.789,14575,38.58%
