# [실습] 콘텐츠 기반 필터링(using cosine similarity)
- TMDB 5000 영화 데이터 세트
- 장르 속성을 이용한 영화 콘텐츠 기반 필터링

In [36]:
import pandas as pd 
import numpy as np
import warnings; warnings.filterwarnings('ignore')

In [37]:
# Load the TMDB 5000 movie metadata from a CSV in the working directory,
# then show which columns are available.
movies = pd.read_csv('./tmdb_5000_movies.csv')
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [38]:
# Keep only the columns used by the rest of the notebook.
# Take an explicit copy: later cells assign new/overwritten columns on
# movies_df, and doing that on a slice of `movies` triggers pandas'
# SettingWithCopyWarning (chained-assignment ambiguity).
movies_df = movies[['id', 'title','genres','vote_average','vote_count','popularity','keywords','overview']].copy()

In [39]:
# The genres / keywords columns hold text that looks like a list of
# {"id": ..., "name": ...} dicts. Widen the column display so more of each
# value is visible, then preview the first row.
pd.set_option('max_colwidth',100)
movies_df[['genres','keywords']].head(1)


Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [40]:
# Use ast.literal_eval() to parse the string stored in each genres / keywords
# cell into the Python object it spells out (a list of dicts).
from ast import literal_eval
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [42]:
# Reduce every genre entry to the value stored under its 'name' key
movies_df['genres'] = movies_df['genres'].map(
    lambda entries: [entry['name'] for entry in entries]
)
# Do the same for every keyword entry
movies_df['keywords'] = movies_df['keywords'].map(
    lambda entries: [entry['name'] for entry in entries]
)

In [43]:
movies_df.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...","In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, ship...","Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t..."
2,206647,Spectre,"[Action, Adventure, Crime]",6.3,4466,107.376788,"[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]",A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.6,9106,112.31295,"[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham...","Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]",6.1,2124,43.926995,"[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edg...","John Carter is a war-weary, former military captain who's inexplicably transported to the myster..."


## 1. 장르콘텐츠 유사도 측정
- 가장 간단한 방법인 코사인 유사도 이용

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# CountVectorizer consumes text, so collapse each list of names into a single
# space-separated string.
movies_df['genres_literal'] = movies_df['genres'].apply(' '.join)
movies_df['keywords_literal'] = movies_df['keywords'].apply(' '.join)

### 1.1 피쳐 벡터화 시키기

In [57]:
# Turn the genre strings into unigram + bigram count features.
# min_df=1 (an absolute document count) keeps every term, exactly as the
# previous integer min_df=0 did, but it is also accepted by scikit-learn
# releases whose parameter validation rejects the integer 0.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
genre_mat.shape

(4803, 276)

### 1.2 피처 벡터화된 행렬에 코사인 유사도 적용

In [62]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every row of the genre matrix against every other row;
# when no second matrix is given the first one is compared with itself.
genre_sim = cosine_similarity(genre_mat)
print(genre_sim.shape)
print(genre_sim[0:1])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]


In [65]:
# For every row, list the column positions ordered from highest to lowest
# similarity (ascending argsort along each row, then reversed).
genre_sim_sorted_ind = np.argsort(genre_sim, axis=1)[:, ::-1]
genre_sim_sorted_ind[:1]
# Reading the output: for record 0, its own position 0 comes first, record 3494
# is the next entry in the ordering, and record 2401 is the last one.

array([[   0, 3494,  813, ..., 3038, 3037, 2401]])

### 1.3 장르 콘텐츠 필터링을 이용한 영화 추천
- 장르 유사도 기반 영화 추천해주는 함수 생성

In [69]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked most similar to ``title_name``.

    Args:
        df: DataFrame with a 'title' column. Row position i of ``df`` is
            expected to correspond to row i of ``sorted_ind``.
        sorted_ind: 2-D integer array; each row is expected to list row
            positions ordered from most to least similar.
        title_name: Title to look up (exact match on the 'title' column).
        top_n: How many positions to take from the front of each ranking.

    Returns:
        The rows of ``df`` at the selected positions, in ranking order.
        The looked-up movie itself is not filtered out. If several rows
        share the title, ``top_n`` rows are returned per match; if none
        match, the result is empty.
    """
    # Locate the matching rows by POSITION rather than by index label, so the
    # lookup stays aligned with sorted_ind even when df has a non-default index.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Take the first top_n ranked positions for each matching row.
    similar_indexs = sorted_ind[title_positions, :top_n]

    # Show the selected positions, then flatten the 2-D selection to 1-D
    # so it can be used for positional row lookup.
    print(similar_indexs)
    similar_indexs = similar_indexs.reshape(-1)

    return df.iloc[similar_indexs]
    

In [71]:
# Pick the 5 entries ranked most similar to 'The Godfather' by genre
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 5)
print(similar_movies[['title','vote_average']])

[[2731 1243 3636 1946 2640]]
                                               title  vote_average
2731                          The Godfather: Part II           8.3
1243                                    Mean Streets           7.2
3636                                   Light Sleeper           5.7
1946  The Bad Lieutenant: Port of Call - New Orleans           6.0
2640         Things to Do in Denver When You're Dead           6.7


### 1.4 더 퀄리티 높게 추천해보자
- 평점 고려해보기

In [72]:
# List movies by raw vote_average, highest first, next to their vote_count
movies_df[['title','vote_average','vote_count']].sort_values('vote_average',ascending=False)

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
...,...,...,...
3960,The Deported,0.0,0
4684,American Beast,0.0,0
3967,Four Single Fathers,0.0,0
4486,Naturally Native,0.0,0


In [76]:
# Take both the rating and the number of votes into account when recommending.
# c: mean of vote_average over all movies
# m: 60th percentile of vote_count
c = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6)
print('m값을 높이면 투표횟수가 많은 영화에 더 많은 가중치를 부여함')
print('c:{}, m :{}'.format(c, m))

def weighted_vote_average(record, min_votes=None, mean_vote=None):
    """Return the vote-count-weighted rating for ``record``.

    Computes ``(v / (v + m)) * R + (m / (v + m)) * c`` where ``v`` is
    ``record['vote_count']`` and ``R`` is ``record['vote_average']``.

    Args:
        record: Mapping-like object with 'vote_count' and 'vote_average'
            entries (a dict, a DataFrame row, or a whole DataFrame for a
            column-wise computation).
        min_votes: The ``m`` term of the formula. Defaults to the
            module-level ``m`` when omitted.
        mean_vote: The ``c`` term of the formula. Defaults to the
            module-level ``c`` when omitted.

    Returns:
        The weighted rating (a scalar, or a Series when ``record`` is a
        DataFrame).
    """
    # Fall back to the notebook-level constants. The original body referenced
    # an undefined name `C`; the constant is defined as lowercase `c`.
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = c

    v = record['vote_count']
    R = record['vote_average']
    total = v + min_votes
    return (v / total) * R + (min_votes / total) * mean_vote

# The whole DataFrame is passed in, so the formula is evaluated column-wise
# and yields one weighted score per movie.
movies_df['weighted_vote'] = weighted_vote_average(movies_df)

m값을 높이면 투표횟수가 많은 영화에 더 많은 가중치를 부여함
c:6.092171559442011, m :370.1999999999998


In [77]:
# Compare the raw average and vote count with the weighted score
movies_df[['title', 'vote_average', 'vote_count', 'weighted_vote']]

Unnamed: 0,title,vote_average,vote_count,weighted_vote
0,Avatar,7.2,11800,7.166301
1,Pirates of the Caribbean: At World's End,6.9,4500,6.838594
2,Spectre,6.3,4466,6.284091
3,The Dark Knight Rises,7.6,9106,7.541095
4,John Carter,6.1,2124,6.098838
...,...,...,...,...
4798,El Mariachi,6.6,238,6.290894
4799,Newlyweds,5.9,5,6.089611
4800,"Signed, Sealed, Delivered",7.0,6,6.106650
4801,Shanghai Calling,5.7,7,6.084894


In [79]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return up to ``top_n`` similar movies, best ``weighted_vote`` first.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns. Row position
            i of ``df`` is expected to correspond to row i of ``sorted_ind``.
        sorted_ind: 2-D integer array; each row is expected to list row
            positions ordered from most to least similar.
        title_name: Title to look up (exact match on the 'title' column).
        top_n: Maximum number of rows to return.

    Returns:
        Rows of ``df`` drawn from the ``2 * top_n`` highest-ranked candidates
        of each matching movie, excluding the matching movie(s) themselves,
        sorted by 'weighted_vote' descending and cut to ``top_n`` rows.
        Empty when no title matches.
    """
    # Locate the reference movie(s) by POSITION rather than by index label, so
    # the lookup stays aligned with sorted_ind for any DataFrame index.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Candidate pool: twice top_n of the most genre-similar positions.
    candidates = sorted_ind[title_positions, :top_n * 2].reshape(-1)

    # Drop the reference movie(s). np.isin also works when several movies share
    # the title, where an element-wise `!=` against the match array would not.
    candidates = candidates[~np.isin(candidates, title_positions)]

    # Several matching titles can contribute the same candidate; keep the
    # first occurrence only (a no-op when there is a single match).
    candidates = pd.unique(candidates)

    # From the candidate pool, keep the top_n rows with the highest weighted_vote.
    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]


# Pick 5 movies similar to 'The Godfather', this time ranked by weighted_vote
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 5)
print(similar_movies[['title','vote_average', 'weighted_vote']])

                       title  vote_average  weighted_vote
2731  The Godfather: Part II           8.3       8.079586
1847              GoodFellas           8.2       7.976937
3866             City of God           8.1       7.759693
883      Catch Me If You Can           7.7       7.557097
1243            Mean Streets           7.2       6.626569
