In [None]:
%config Completer.use_jedi = False

In [None]:
import ast
import warnings

import numpy as np
import pandas as pd

In [None]:
# Install a blanket 'ignore' filter: no warning of any category is shown after this cell runs.
# NOTE(review): this also hides pandas warnings raised by later cells — consider a narrower filter.
warnings.filterwarnings('ignore')

# Read data

In [None]:
# Load ./tmdb_5000_movies.csv into a DataFrame; the bare name on the last line
# renders the frame as the cell output.
movies = pd.read_csv('./tmdb_5000_movies.csv')
movies

In [None]:
# Keep only the columns used by the rest of the notebook.
# .copy() makes movies_df an independent frame: later cells assign new columns to it,
# and assigning to a selection of `movies` raises SettingWithCopyWarning.
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']].copy()

# Set pandas display option

In [None]:
# Show up to 100 characters per cell. The fully qualified key is used because the
# shorthand 'max_colwidth' depends on pandas' partial-name matching, which fails
# as soon as another option with a similar name exists.
pd.set_option('display.max_colwidth', 100)

In [None]:
# Preview the genres and keywords columns as loaded, before the next cell parses them.
movies_df[['genres', 'keywords']]

In [None]:
def _extract_names(raw):
    """Return the 'name' of every dict in a (possibly still stringified) list of dicts."""
    # ast.literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute code the way it could with eval().
    # Values that are already parsed are passed through, so re-running this cell is harmless.
    items = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [item['name'] for item in items]


# Replace each stringified list of dicts with a plain list of names.
for column in ('genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(_extract_names)

In [None]:
# Show the same two columns again, now holding lists of names.
movies_df[['genres', 'keywords']]

# Cosine similarity

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Join each movie's genre names into one space-separated string for the vectorizer.
movies_df['genres_literal'] = movies_df['genres'].apply(' '.join)

# Count unigrams and bigrams of the genre strings.
# min_df=1 keeps every term that occurs at all — the same vocabulary the integer 0
# was meant to give — and, unlike 0, is accepted by scikit-learn's parameter
# validation (an integer min_df must be >= 1).
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
genre_mat

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between every pair of rows of genre_mat; with a single
# argument the matrix is compared against itself.
genre_sim = cosine_similarity(genre_mat)
genre_sim.shape

In [None]:
# First row of the similarity matrix (kept 2-D by the slice).
genre_sim[:1]

In [None]:
# argsort orders each row ascending; reversing the columns gives, per row,
# the column indices from highest to lowest similarity.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=1)[:, ::-1]
genre_sim_sorted_ind[:1]

# Recommender system

In [None]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked most similar to the movie(s) titled ``title_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with a ``title`` column. Row position ``i`` of ``df`` must
        correspond to row ``i`` of ``sorted_ind``.
    sorted_ind : numpy.ndarray
        2-D integer array; row ``i`` lists row positions of ``df`` ordered from
        most to least similar to movie ``i``.
    title_name : str
        Exact title to look up.
    top_n : int, default 10
        Number of entries taken from each matching row of ``sorted_ind``.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` in the order given by ``sorted_ind``.
        Empty when no title matches. The queried movie itself is not removed.
    """
    # Work with row positions rather than index labels: both sorted_ind and
    # df.iloc are positional, and labels only coincide with positions when df
    # carries a default RangeIndex.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())

    # One row of top_n neighbours per matching title, flattened to 1-D for iloc.
    similar_positions = sorted_ind[title_positions, :top_n].reshape(-1)

    return df.iloc[similar_positions]

In [None]:
# Look up the ten entries ranked closest to 'The Godfather' by genre similarity.
similar_movies = find_sim_movie(
    movies_df,
    genre_sim_sorted_ind,
    title_name='The Godfather',
    top_n=10,
)
similar_movies[['title', 'vote_average']]

In [None]:
# Ten highest raw average ratings, shown with their vote counts.
(
    movies_df[['title', 'vote_average', 'vote_count']]
    .sort_values(by='vote_average', ascending=False)
    .head(10)
)

# Weighted rating

- v: 개별 영화에 평점을 투표한 횟수
- m: 평점을 부여하기 위한 최소 투표 횟수
- R: 개별 영화에 대한 평균 평점
- C: 전체 영화에 대한 평균 평점

weighted_rating = (v / (v + m)) * R + (m / (v + m)) * C

In [None]:
# m: 60th-percentile vote count; C: mean rating over all movies.
m = movies_df['vote_count'].quantile(q=0.6)
C = movies_df['vote_average'].mean()
print('C: ', round(C, 3), 'm: ', round(m, 3))

In [None]:
# Quantile of vote_count that serves as the minimum-votes threshold m.
percentile = 0.6

# C is the mean rating over all movies; m is the vote count at `percentile`.
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(q=percentile)

In [None]:
def weighted_vote_average(record):
    """Blend one movie's average rating with the global mean, weighted by vote count.

    ``record`` must provide ``'vote_count'`` and ``'vote_average'``. The
    module-level names ``m`` (minimum-votes threshold) and ``C`` (mean rating
    over all movies) are read at call time.

    Returns ``(v / (v + m)) * R + (m / (v + m)) * C``.
    """
    votes = record['vote_count']
    rating = record['vote_average']

    own_weight = votes / (votes + m)
    prior_weight = m / (votes + m)

    return (own_weight * rating) + (prior_weight * C)

In [None]:
# Score every row with weighted_vote_average. The function is applied to movies_df
# itself rather than the raw `movies` frame, so the scores come from the table they
# are stored in and do not depend on index alignment between two frames.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

# Ten best movies by weighted score.
(
    movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']]
    .sort_values(by='weighted_vote', ascending=False)
    .head(10)
)

In [None]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return up to ``top_n`` movies similar to ``title_name``, best weighted score first.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with ``title`` and ``weighted_vote`` columns. Row position
        ``i`` of ``df`` must correspond to row ``i`` of ``sorted_ind``.
    sorted_ind : numpy.ndarray
        2-D integer array; row ``i`` lists row positions of ``df`` ordered from
        most to least similar to movie ``i``.
    title_name : str
        Exact title to look up. Several rows may share it.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Candidate rows sorted by ``weighted_vote`` descending, excluding every
        row whose title is ``title_name``. Empty when no title matches.
    """
    # Row positions (not index labels) of the queried title: sorted_ind and
    # df.iloc are both positional.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Take 2 * top_n neighbours per match so that enough candidates remain
    # after the queried movie is removed and the rest are re-ranked by score.
    candidates = sorted_ind[title_positions, :top_n * 2].reshape(-1)

    # Remove the queried movie(s). np.isin works for any number of matching
    # rows, whereas `candidates != title_positions` only works for exactly one.
    candidates = candidates[~np.isin(candidates, title_positions)]

    # Several matching titles can nominate the same neighbour; keep each once,
    # in first-seen order.
    candidates = pd.unique(candidates)

    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]

In [None]:
# Repeat the 'The Godfather' lookup and show the weighted score next to the raw average.
similar_movies = find_sim_movie(
    movies_df,
    genre_sim_sorted_ind,
    title_name='The Godfather',
    top_n=10,
)
similar_movies[['title', 'vote_average', 'weighted_vote']]