In [1]:
import pandas as pd

In [2]:
# Load the movie table from a CSV path given relative to the working directory.
# No index_col is passed, so rows get the default 0..N-1 integer index.
metadata = pd.read_csv('data/IMDB-Movie-Data.csv')

In [3]:
metadata.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


# 1. Simple RS

In [4]:
# C: mean of the Rating column over every movie in the table.
# WR (defined below) uses it as the default value that scores are pulled toward.
C = metadata.Rating.mean()
print(C)

6.723200000000003


In [5]:
# m: 90th percentile of the Votes column. It serves both as the cutoff for
# "enough votes" and as the default weight of the mean rating in WR.
m = metadata.Votes.quantile(q=0.9)
print(m)

406251.10000000003


In [6]:
# Movies whose vote count reaches the cutoff m. Filter first, then copy, so only
# the selected rows are duplicated and q_movies is independent of metadata.
q_movies = metadata[metadata.Votes.ge(m)].copy()
q_movies.shape

(100, 12)

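- 가중 평점(Weighted Rating): $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$
  - $v$: 해당 영화의 투표 수(`Votes`), $m$: 투표 수 기준값(위에서 구한 90% 분위수)
  - $R$: 해당 영화의 평점(`Rating`), $C$: 전체 영화의 평균 평점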

In [7]:
def WR(x, m=m, C=C):
    """Return the weighted rating of ``x``.

    The result is a weighted average of the item's own rating and ``C``:
    ``votes / (votes + m)`` weights the own rating and ``m / (votes + m)``
    weights ``C``, so items with few votes relative to ``m`` are pulled
    toward ``C``.

    Parameters
    ----------
    x : object indexable by ``'Votes'`` and ``'Rating'``
        For example a single DataFrame row.
    m : number
        Weight given to ``C``. The default is the value of the notebook-level
        ``m`` at the time this function is defined.
    C : number
        Rating that low-vote items are pulled toward. The default is the value
        of the notebook-level ``C`` at the time this function is defined.

    Returns
    -------
    The weighted rating, computed with ordinary arithmetic on the inputs.
    """
    votes = x['Votes']
    rating = x['Rating']
    total = votes + m
    own_part = votes / total * rating
    prior_part = m / total * C
    return own_part + prior_part

In [8]:
# WR only looks up x['Votes'] / x['Rating'] and applies arithmetic, so it can be
# given the whole frame: the lookups return columns and the arithmetic runs
# element-wise. This yields the same per-row scores as apply(WR, axis=1) without
# building a Series object for every row.
metadata['score'] = WR(metadata)

In [9]:
# Columns displayed when comparing the two rankings.
col = ['Title','Rating','Votes','score']
# Ranking by the raw Rating column, highest first.
rs_original = metadata[col].sort_values(by='Rating', ascending=False)
# Ranking by the weighted 'score' column, highest first.
rs_new = metadata[col].sort_values(by='score', ascending=False)

In [10]:
rs_original.head()

Unnamed: 0,Title,Rating,Votes,score
54,The Dark Knight,9.0,1791916,8.579216
80,Inception,8.8,1583625,8.376003
117,Dangal,8.8,48969,6.946606
36,Interstellar,8.6,1047747,8.075617
96,Kimi no na wa,8.6,34110,6.868575


In [11]:
rs_new.head()

Unnamed: 0,Title,Rating,Votes,score
54,The Dark Knight,9.0,1791916,8.579216
80,Inception,8.8,1583625,8.376003
36,Interstellar,8.6,1047747,8.075617
124,The Dark Knight Rises,8.5,1222645,8.056861
99,The Departed,8.5,937414,7.962793


# 2. Content-based RS
* 아이템의 콘텐츠를 직접 분석하여, 아이템과 아이템 간 또는 아이템과 사용자 선호도 간의 유사성을 토대로 추천하는 기법

In [12]:
metadata['Description'][0]

'A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.'

In [13]:
metadata['Description'].head()

0    A group of intergalactic criminals are forced ...
1    Following clues to the origin of mankind, a te...
2    Three girls are kidnapped by a man with a diag...
3    In a city of humanoid animals, a hustling thea...
4    A secret government agency recruits some of th...
Name: Description, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF vectorizer that ignores scikit-learn's built-in English stop-word list;
# every other option is left at its default.
tfidf = TfidfVectorizer(stop_words='english')

In [16]:
# Replace missing descriptions with empty strings so the vectorizer only
# receives str values.
metadata['Description'] = metadata['Description'].fillna('')

In [17]:
# Fit the vectorizer on the descriptions and build the document-term matrix
# in one step (one row per movie, one column per vocabulary term).
tfidf_matrix = tfidf.fit_transform(metadata['Description'])
tfidf_matrix.shape

(1000, 5667)

### 코사인 유사도 구하기

In [18]:
from sklearn.metrics.pairwise import linear_kernel

In [19]:
# Pairwise dot products between all rows of the TF-IDF matrix (Y defaults to X).
# TfidfVectorizer L2-normalises each row by default, so these dot products are
# the cosine similarities between movie descriptions.
cosine_sim = linear_kernel(tfidf_matrix)

In [20]:
# Lookup table: movie title -> row label in metadata.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it would leave repeated titles in the index and make
# indices[title] return a Series for them. De-duplicate on the title index
# instead, keeping the first occurrence, so every lookup yields a single label.
indices = pd.Series(metadata.index, index=metadata['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

### 추천 리스트 생성
* 영화타이틀 인덱스화
* 인덱스를 기준으로 유사도 추출
* 유사도 기준 10개 영화 선정
* 인덱스를 영화 타이틀로 변환

In [21]:
def rs(title, n=10, sim=cosine_sim):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the notebook-level ``indices`` Series.
    n : int, default 10
        Maximum number of recommendations to return.
    sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. The default is the notebook-level ``cosine_sim``
        as it exists when this function is defined.

    Returns
    -------
    pandas.Series
        Titles from ``metadata``, ordered from most to least similar. The
        queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions; fall back to the first occurrence so a single row is used.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Use the matrix that was passed in (previously the global was always used),
    # and drop the queried movie by position rather than assuming it sorts
    # first, which fails on ties or when its similarity row is all zeros.
    scored = [(pos, score) for pos, score in enumerate(sim[idx]) if pos != idx]
    # sort() is stable, so equally similar movies keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    movie_index = [pos for pos, _ in scored[:n]]
    return metadata.Title.iloc[movie_index]

In [22]:
rs('The Dark Knight Rises')

54              The Dark Knight
144            Django Unchained
522                      Viking
471            Marie Antoinette
254               The Conjuring
920                   Centurion
253    The Amazing Spider-Man 2
604       The Rise of the Krays
615                Bastille Day
915                     Goksung
Name: Title, dtype: object

In [23]:
rs('Suicide Squad', n=5)

153                         Sicario
676                     Escape Plan
32                X-Men: Apocalypse
65     Kingsman: The Secret Service
552                  Fantastic Four
Name: Title, dtype: object