# 장르 속성을 이용한 영화 콘텐츠 기반 필터링

## 데이터 로딩 및 가공
- TMDB 5000 데이터 셋 : imdb.com 의 영화 중 주요 5000개 영화에 대한 메타 정보를 가공해서 kaggle에서 제공하는 데이터 셋
- https://www.kaggle.com

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

# Load the TMDB 5000 movie metadata CSV, then print its shape and column
# summary and show the first two rows.
# NOTE(review): display() is not defined in this file; presumably the
# IPython/Jupyter built-in - confirm when running outside a notebook.
movies = pd.read_csv("./데이터셋/tmdb_5000_movies.csv")
print(movies.shape)
movies.info()
display(movies.head(2))

(4803, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status    

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


### 분석에 사용할 주요 컬럼 추출
- id, title, genres, vote_average(평균 평점), vote_count(평점 투표 수), popularity(영화 인기도), keywords, overview(영화 개요)

In [2]:
# Keep only the columns used in the analysis. .copy() makes movies_df an
# independent frame, so the column assignments in later cells modify it
# directly instead of writing into a slice of `movies`
# (pandas SettingWithCopyWarning).
movies_df = movies[['id','title','genres','vote_average','vote_count','popularity','keywords','overview']].copy()
# info() prints its report itself and returns None, so the wrapping print()
# also emits a trailing "None".
print(movies_df.info())
pd.set_option('max_colwidth',None) # lift the column-width limit so long cell contents are shown in full
display(movies_df[['keywords']])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4803 non-null   int64  
 1   title         4803 non-null   object 
 2   genres        4803 non-null   object 
 3   vote_average  4803 non-null   float64
 4   vote_count    4803 non-null   int64  
 5   popularity    4803 non-null   float64
 6   keywords      4803 non-null   object 
 7   overview      4800 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 300.3+ KB
None


Unnamed: 0,keywords
0,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 9685, ""name"": ""futuristic""}, {""id"": 9840, ""name"": ""romance""}, {""id"": 9882, ""name"": ""space""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10148, ""name"": ""tribe""}, {""id"": 10158, ""name"": ""alien planet""}, {""id"": 10987, ""name"": ""cgi""}, {""id"": 11399, ""name"": ""marine""}, {""id"": 13065, ""name"": ""soldier""}, {""id"": 14643, ""name"": ""battle""}, {""id"": 14720, ""name"": ""love affair""}, {""id"": 165431, ""name"": ""anti war""}, {""id"": 193554, ""name"": ""power relations""}, {""id"": 206690, ""name"": ""mind and soul""}, {""id"": 209714, ""name"": ""3d""}]"
1,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic island""}, {""id"": 1319, ""name"": ""east india trading company""}, {""id"": 2038, ""name"": ""love of one's life""}, {""id"": 2052, ""name"": ""traitor""}, {""id"": 2580, ""name"": ""shipwreck""}, {""id"": 2660, ""name"": ""strong woman""}, {""id"": 3799, ""name"": ""ship""}, {""id"": 5740, ""name"": ""alliance""}, {""id"": 5941, ""name"": ""calypso""}, {""id"": 6155, ""name"": ""afterlife""}, {""id"": 6211, ""name"": ""fighter""}, {""id"": 12988, ""name"": ""pirate""}, {""id"": 157186, ""name"": ""swashbuckler""}, {""id"": 179430, ""name"": ""aftercreditsstinger""}]"
2,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret agent""}, {""id"": 9663, ""name"": ""sequel""}, {""id"": 14555, ""name"": ""mi6""}, {""id"": 156095, ""name"": ""british secret service""}, {""id"": 158431, ""name"": ""united kingdom""}]"
3,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""id"": 949, ""name"": ""terrorist""}, {""id"": 1308, ""name"": ""secret identity""}, {""id"": 1437, ""name"": ""burglar""}, {""id"": 3051, ""name"": ""hostage drama""}, {""id"": 3562, ""name"": ""time bomb""}, {""id"": 6969, ""name"": ""gotham city""}, {""id"": 7002, ""name"": ""vigilante""}, {""id"": 9665, ""name"": ""cover-up""}, {""id"": 9715, ""name"": ""superhero""}, {""id"": 9990, ""name"": ""villainess""}, {""id"": 10044, ""name"": ""tragic hero""}, {""id"": 13015, ""name"": ""terrorism""}, {""id"": 14796, ""name"": ""destruction""}, {""id"": 18933, ""name"": ""catwoman""}, {""id"": 156082, ""name"": ""cat burglar""}, {""id"": 156395, ""name"": ""imax""}, {""id"": 173272, ""name"": ""flood""}, {""id"": 179093, ""name"": ""criminal underworld""}, {""id"": 230775, ""name"": ""batman""}]"
4,"[{""id"": 818, ""name"": ""based on novel""}, {""id"": 839, ""name"": ""mars""}, {""id"": 1456, ""name"": ""medallion""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 7376, ""name"": ""princess""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10028, ""name"": ""steampunk""}, {""id"": 10539, ""name"": ""martian""}, {""id"": 10685, ""name"": ""escape""}, {""id"": 161511, ""name"": ""edgar rice burroughs""}, {""id"": 163252, ""name"": ""alien race""}, {""id"": 179102, ""name"": ""superhuman strength""}, {""id"": 190320, ""name"": ""mars civilization""}, {""id"": 195446, ""name"": ""sword and planet""}, {""id"": 207928, ""name"": ""19th century""}, {""id"": 209714, ""name"": ""3d""}]"
...,...
4798,"[{""id"": 5616, ""name"": ""united states\u2013mexico barrier""}, {""id"": 33649, ""name"": ""legs""}, {""id"": 162740, ""name"": ""arms""}, {""id"": 187891, ""name"": ""paper knife""}, {""id"": 206558, ""name"": ""guitar case""}]"
4799,[]
4800,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""name"": ""love at first sight""}, {""id"": 2398, ""name"": ""narration""}, {""id"": 5340, ""name"": ""investigation""}, {""id"": 34051, ""name"": ""team""}, {""id"": 173066, ""name"": ""postal worker""}]"
4801,[]


In [21]:
# The genres column is still raw text at this point (the printed type confirms
# it), so it has to be parsed (e.g. with eval()) into a list object.
# eval(): evaluates a string that holds a Python expression and returns the result.
print(type(movies_df.loc[0, 'genres']))

<class 'str'>


In [32]:
from ast import literal_eval

# Each genres/keywords cell is a string holding a list-of-dict literal.
# literal_eval only parses Python literals, whereas eval() would execute any
# expression found in the CSV text, which is unsafe for externally supplied data.
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)
movies_df['genres'][0]

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [None]:
- 장르와 키워드 컬럼의 name 키만 원소로 출력하여 리스트로 생성

In [33]:
# Reduce each list of {'id', 'name'} dicts to a plain list of its 'name' values.
movies_df['genres'] = movies_df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df['keywords'] = movies_df['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries]
)

In [34]:
# Show the genres column after it has been reduced to lists of names.
display(movies_df.loc[:, ['genres']])

Unnamed: 0,genres
0,"[Action, Adventure, Fantasy, Science Fiction]"
1,"[Adventure, Fantasy, Action]"
2,"[Action, Adventure, Crime]"
3,"[Action, Crime, Drama, Thriller]"
4,"[Action, Adventure, Science Fiction]"
...,...
4798,"[Action, Crime, Thriller]"
4799,"[Comedy, Romance]"
4800,"[Comedy, Drama, Romance, TV Movie]"
4801,[]


## 장르 콘텐츠 유사도 측정
- 문자열로 변환된 genres 컬럼은 카운트 기반으로 피처 벡터화 진행
- genres 문자열을 피쳐 벡터화 행렬로 변환한 데이터 셋, 코사인 유사도 통해 비교
- genre 유사도가 높은 영화 중 평점이 높은 순으로 영화를 추천

In [36]:
# Join each genre list into one space-separated string for the text vectorizer.
# NOTE: multi-word genres are not kept together; 'Science Fiction' ends up as
# two separate words in the joined string.
movies_df['genres_literal'] = movies_df['genres'].apply(' '.join)

In [37]:
# Joined genre string of the first movie.
movies_df.loc[0, 'genres_literal']

'Action Adventure Fantasy Science Fiction'

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams and bigrams of the genre words. min_df=1 keeps every term that
# occurs at all, i.e. the same vocabulary min_df=0 was meant to give; an integer
# min_df of 0 is outside the range scikit-learn's parameter validation accepts
# (an int >= 1 or a float in [0.0, 1.0]).
count_vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vectorizer.fit_transform(movies_df['genres_literal'])

In [43]:
# Matrix dimensions, then the dense count vector of the first movie
# (only the first row is densified instead of the whole matrix).
print(genre_mat.shape)
print(genre_mat[:1].toarray()[0])

(4803, 276)
[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [42]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of genre_mat; with the second
# argument omitted, the matrix is compared against itself.
genre_sim = cosine_similarity(genre_mat)
print(genre_sim.shape)
print(genre_sim[:3])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]
 [0.4472136  0.4        1.         ... 0.         0.         0.        ]]


In [44]:
# Column indices of every row ordered from most to least similar:
# argsort sorts ascending, so each row is reversed afterwards.
genre_sim_sorted_idx = np.argsort(genre_sim, axis=1)[:, ::-1]
print(genre_sim_sorted_idx[0, :10])  # top 10 for the first movie; the leading 0 is the movie itself

[   0 3494  813  870   46   14 1296 1652  419  420]


In [45]:
# Genre similarity between the first movie and the movie at index 3494,
# which ranks highest for it in the sorted index matrix.
print(genre_sim[0][3494])

1.0000000000000002


In [47]:
# Print genres of rows 0 and 3494, then their titles, one value per line.
print(
    *(movies_df.iloc[row][col] for col in ('genres', 'title') for row in (0, 3494)),
    sep='\n',
)

['Action', 'Adventure', 'Fantasy', 'Science Fiction']
['Action', 'Adventure', 'Fantasy', 'Science Fiction']
Avatar
Beastmaster 2: Through the Portal of Time


## 장르 콘텐츠 필터링을 이용한 영화 추천

In [48]:
# Row(s) whose title is exactly 'The Godfather'.
movies_df.loc[movies_df['title'] == 'The Godfather']

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal
3337,238,The Godfather,"[Drama, Crime]",8.4,5893,143.659698,"[italy, love at first sight, loss of father, patriarch, organized crime, mafia, lawyer, italian american, crime family, rise to power, mob boss, 1940s]","Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives an attempt on his life, his youngest son, Michael steps in to take care of the would-be killers, launching a campaign of bloody revenge.",Drama Crime


In [51]:
# df          : DataFrame holding the movie information
# sorted_idx  : matrix of movie indices, each row ordered by descending genre similarity
# title_names : title of the movie the recommendation is based on
# top_n       : number of movies to recommend
def find_sim_movie(df, sorted_idx, title_names, top_n=10):
    """Return the rows of ``df`` ranked most genre-similar to the given title.

    ``sorted_idx`` is assumed to be a 2-D NumPy array whose row ``i`` lists row
    positions of ``df``, most similar first. If several rows carry the title,
    the first ``top_n`` entries of each are concatenated; if no row matches,
    an empty DataFrame is returned.
    """
    # Locate the title by row position, not by index label: sorted_idx and
    # .iloc are both positional, so labels only work for a default RangeIndex.
    title_index = (df['title'] == title_names).to_numpy().nonzero()[0]
    similar_indexes = sorted_idx[title_index, :top_n].reshape(-1)

    return df.iloc[similar_indexes]

In [55]:
# Ten movies ranked closest to 'The Godfather' by genre similarity.
similar_movies = find_sim_movie(
    movies_df, genre_sim_sorted_idx, title_names='The Godfather', top_n=10
)

display(similar_movies.loc[:, ['title', 'vote_average']])

Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


# 평점이 매우 낮은 영화도 추천 영화로 선택되는 문제 발생

- 영화 평점이 높은 순으로 정렬하여 상위 10위 영화 확인

In [70]:
# Ten highest raw average ratings, with the vote count shown alongside.
(
    movies_df[['title', 'vote_average', 'vote_count']]
    .sort_values(by='vote_average', ascending=False)
    .head(10)
)

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


- 왜곡된 평점 데이터를 피하기 위해 평점에 평가 횟수를 반영한 가중 평점 방식 적용 (IMDB에서 평점을 매기는 방식)
- 가중평점(WeightedRating) = (v/(v+m)) * R + (m/(v+m)) * C
>- v : 개별 영화에 평점을 투표한 횟수
>- m : 평점을 부여하기 위한 최소 투표 횟수 (임의의 값 <전체 영화의 투표 횟수 분포에서 60% 분위수, 즉 quantile(0.6)에 해당하는 횟수를 기준으로 적용>)
>- R : 개별 영화에 대한 평균 평점 (vote_average 값)
>- C : 전체 영화에 대한 평균 평점 (vote_average 평균 값)

In [71]:
# C: mean of vote_average over all movies.
# m: 0.6 quantile of vote_count, used as the minimum-votes threshold.
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(q=0.6)
print(f'C:{C:.3f}, m :{m:.3f}')

C:6.092, m :370.200


In [72]:
def weighted_vote_average(data, min_votes=None, mean_rating=None):
    """Return the weighted rating (v/(v+m)) * R + (m/(v+m)) * C.

    Args:
        data: mapping, row, or DataFrame providing 'vote_count' (v) and
            'vote_average' (R).
        min_votes: minimum-votes threshold m; defaults to the module-level ``m``.
        mean_rating: overall mean rating C; defaults to the module-level ``C``.

    Returns:
        The weighted rating, scalar or element-wise depending on ``data``.
    """
    # Fall back to the notebook-level constants so existing one-argument calls
    # such as df.apply(weighted_vote_average, axis=1) keep working unchanged.
    if min_votes is None:
        min_votes = m
    if mean_rating is None:
        mean_rating = C

    v = data['vote_count']
    R = data['vote_average']
    return ((v / (v + min_votes)) * R) + ((min_votes / (v + min_votes)) * mean_rating)

In [73]:
# weighted_vote_average only uses column lookups and arithmetic, so it can be
# evaluated on whole columns at once instead of row by row with apply(axis=1).
movies_df['weighted_rating'] = weighted_vote_average(movies_df)

In [74]:
# Ten highest weighted ratings, next to the raw average and the vote count.
(
    movies_df[['title', 'vote_average', 'weighted_rating', 'vote_count']]
    .sort_values(by='weighted_rating', ascending=False)
    .head(10)
)

Unnamed: 0,title,vote_average,weighted_rating,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


In [78]:
def find_sim_movie(df, sorted_idx, title_names, top_n=10):
    """Recommend up to ``top_n`` genre-similar movies, best weighted rating first.

    Args:
        df: DataFrame with 'title' and 'weighted_rating' columns.
        sorted_idx: assumed to be a 2-D NumPy array whose row ``i`` lists row
            positions of ``df``, most genre-similar first.
        title_names: title of the movie the recommendation is based on.
        top_n: number of movies to return.

    Returns:
        Rows of ``df`` sorted by 'weighted_rating' (descending); empty when no
        row has the given title.
    """
    # Work with row positions, not index labels: sorted_idx and .iloc are both
    # positional, so labels only work for a default RangeIndex.
    target_positions = (df['title'] == title_names).to_numpy().nonzero()[0]
    excluded = set(target_positions.tolist())

    # Take twice as many genre-similar candidates as requested.
    ranked = sorted_idx[target_positions, :top_n * 2].reshape(-1).tolist()

    # Drop the reference movie(s). A set lookup also works when the title
    # matches several rows; dict.fromkeys removes repeated candidates while
    # keeping their order.
    candidates = list(dict.fromkeys(pos for pos in ranked if pos not in excluded))

    return df.iloc[candidates].sort_values('weighted_rating', ascending=False)[:top_n]

In [79]:
# Recommendations for 'The Godfather', shown with a fresh 0-based index.
similar_movies = find_sim_movie(
    movies_df, genre_sim_sorted_idx, title_names='The Godfather', top_n=10
)
similar_movies.loc[:, ['title', 'vote_average', 'weighted_rating']].reset_index(drop=True)

Unnamed: 0,title,vote_average,weighted_rating
0,The Godfather: Part II,8.3,8.079586
1,GoodFellas,8.2,7.976937
2,City of God,8.1,7.759693
3,Once Upon a Time in America,8.2,7.657811
4,Catch Me If You Can,7.7,7.557097
5,American Gangster,7.4,7.141396
6,This Is England,7.4,6.739664
7,American Hustle,6.8,6.717525
8,Mean Streets,7.2,6.626569
9,Rounders,6.9,6.530427


## 장르 콘텐츠 유사도 측정
- 문자열로 변환된 genres 컬럼은 카운트 기반으로 피처 벡터화 변환