In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import ast

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
def str_to_set(s):
    """Parse a stringified list of ``{'name': ...}`` records into a set of names.

    Parameters
    ----------
    s : str
        Text of a Python literal: a list of dicts that each have a ``'name'``
        key. Any non-string value (e.g. the NaN of a missing cell) is treated
        as "no items".

    Returns
    -------
    set
        The ``'name'`` value of every item; an empty set for non-string input.

    Raises
    ------
    ValueError, SyntaxError
        If ``s`` is a string that is not a plain Python literal.
    """
    if not isinstance(s, str):
        return set()
    # literal_eval accepts only literals, so unlike eval() it cannot execute
    # code that happens to be embedded in the data file.
    return {item['name'] for item in ast.literal_eval(s)}

# Load the movie metadata, keep only the columns used below, and give them
# shorter names.
meta = (
    pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
    [['id', 'original_title', 'original_language', 'genres']]
    .rename(columns={'id': 'movieId', 'original_title': 'title',
                     'original_language': 'language'})
)

# Keep only the rows whose language is 'en'. The explicit .copy() makes the
# filtered frame independent of the unfiltered one, so the column assignments
# below do not raise SettingWithCopyWarning.
meta = meta.loc[meta['language'] == 'en', :].copy()

# NOTE(review): to_numeric raises on any id that is not numeric; this relies on
# the language filter above having removed such rows - confirm against the data.
meta['movieId'] = pd.to_numeric(meta['movieId'])

# Turn each stringified genre list into a set of genre names.
meta['genres'] = meta['genres'].apply(str_to_set)
meta.head()


In [3]:
# Toy example of boolean-mask row selection with .loc: rows whose mask entry
# is True are kept, and all columns are returned.
a = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
    ]
)
b = pd.Series([True, False, True, False])

a.loc[b]

In [4]:
# Load the keywords table and rename its 'id' column to 'movieId' so it shares
# a key column name with `meta`.
keywords = (
    pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
    .rename(columns={'id': 'movieId'})
)

# Each 'keywords' cell is a string that str_to_set parses into a set of names.
keywords['keywords'] = keywords['keywords'].apply(str_to_set)
keywords.head()


In [5]:
# Combine the metadata and keyword tables into one frame keyed on 'movieId'.
# Join types, for reference:
#   inner - keep only the keys that are present in both tables (used here)
#   outer - keep every key that appears in at least one of the tables
#   left  - keep every row of the left table and attach the matching
#           right-table values to it
merged = meta.merge(keywords, how='inner', on='movieId')

In [6]:
# First matching row of `merged` for each of the three example titles.
dk, dkr, toy = (
    merged[merged['title'] == movie_title].iloc[0]
    for movie_title in ('The Dark Knight', 'The Dark Knight Rises', 'Toy Story')
)

# Feature set of a movie = its keywords together with its genres.
dk_set, dkr_set, toy_set = (
    movie_row.keywords | movie_row.genres
    for movie_row in (dk, dkr, toy)
)

# | is the set union and & is the set intersection -> used for Jaccard similarity

def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets.

    The value is the size of the intersection divided by the size of the
    union. When the union is empty (both sets are empty) the result is 0.
    """
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0
    shared_size = len(set1 & set2)
    return shared_size / union_size

# Compare The Dark Knight's feature set with its sequel's and with Toy Story's.
a, b = (
    jaccard_similarity(dk_set, other_set)
    for other_set in (dkr_set, toy_set)
)
print(a, b)

In [3]:
# Load the ratings and attach a title to each one via the 'movieId' key.
# NOTE(review): this assumes ratings.csv 'movieId' uses the same id scheme as
# the metadata 'id' column - confirm, since the dataset may ship a separate
# id-mapping file.
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')
ratings['movieId'] = pd.to_numeric(ratings['movieId'])
ratings = ratings.merge(meta[['movieId', 'title']], how='inner', on='movieId')

# User x title table of ratings: one row per userId, one column per title.
matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='title')

 
def pearson_similarity(u1, u2):
    """Return a Pearson-style correlation between two rating vectors.

    Each vector is centred on its own mean. The result is the sum of the
    products of the centred values divided by the square root of the product
    of the two sums of squared centred values. Returns 0 when that divisor
    is 0.
    """
    dev1 = u1 - u1.mean()
    dev2 = u2 - u2.mean()
    norm = np.sqrt(np.sum(dev1 ** 2) * np.sum(dev2 ** 2))
    if norm == 0:
        return 0
    return np.sum(dev1 * dev2) / norm

 
# Correlate the rating columns of two example titles.
dk_rating, pk_rating = matrix['The Dark Knight'], matrix['Prom Night']
print(pearson_similarity(dk_rating, pk_rating))


In [1]:
def find_similar_movies(input_title, matrix, n, alpha):
    """Rank other movies by a blend of rating correlation and content overlap.

    For every column of ``matrix`` other than ``input_title`` the score is
    ``alpha * pearson + (1 - alpha) * jaccard``, where ``pearson`` comes from
    ``pearson_similarity`` on the two rating columns and ``jaccard`` comes
    from ``jaccard_similarity`` on the two genre-plus-keyword sets.

    Parameters
    ----------
    input_title : str
        Title to find neighbours for; must be a column of ``matrix`` and a
        title in the module-level ``merged`` frame.
    matrix : pandas.DataFrame
        Ratings table with one column per title.
    n : int
        Maximum number of results to return.
    alpha : float
        Weight of the Pearson term; ``1 - alpha`` weights the Jaccard term.

    Returns
    -------
    list of tuple
        Up to ``n`` tuples ``(title, pearson, jaccard, score)`` ordered by
        descending ``score``.

    Raises
    ------
    KeyError
        If ``input_title`` has no row in ``merged`` or no column in ``matrix``.
    """
    # The genre and keyword sets live together only in `merged`; `meta` has no
    # 'keywords' column. Keep the first row per title (the row .iloc[0] would
    # select) and build the title -> feature-set lookup once instead of
    # scanning the frame for every candidate.
    first_rows = merged.drop_duplicates(subset='title')
    feature_sets = {
        title: genres | keywords
        for title, genres, keywords in zip(
            first_rows['title'], first_rows['genres'], first_rows['keywords']
        )
    }

    if input_title not in feature_sets:
        raise KeyError(f"{input_title!r} has no genre/keyword metadata")
    input_set = feature_sets[input_title]
    input_ratings = matrix[input_title]

    result = []
    for this_title in matrix.columns:
        if this_title == input_title:
            continue
        this_set = feature_sets.get(this_title)
        if this_set is None:
            # Rated title without a metadata/keywords row: it cannot be given
            # a content score, so it is left out of the ranking.
            continue

        pearson = pearson_similarity(matrix[this_title], input_ratings)
        jaccard = jaccard_similarity(this_set, input_set)

        score = alpha * pearson + (1 - alpha) * jaccard
        result.append((this_title, pearson, jaccard, score))

    result.sort(key=lambda r: r[3], reverse=True)

    return result[:n]

In [2]:
 
# Ten closest matches to The Dark Knight, weighting the rating correlation at
# 0.3 and the genre/keyword overlap at 0.7, shown as a table.
result = find_similar_movies('The Dark Knight', matrix, n=10, alpha=0.3)
pd.DataFrame(result, columns=['title', 'pearson', 'jaccard', 'score'])