In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

# 1. Simple Recommender
## 장르를 이용한 추천

In [88]:
md =  pd.read_csv("Data_OTT/movie_inventory_change.csv", encoding="euc-kr")
md.head()

Unnamed: 0,item_id,contract_year,movie_id,title,release_year,release_date,runtime,mpa_rating,mpa_rating_origin,imdb_score,...,director,Genre_1,Genre_2,Genre_3,actor_1,actor_2,actor_3,contract_price,studio_score,price_class
0,I-1001,2015,tt0121766,Star Wars: Episode III - Revenge of the Sith,2005,05-19-2005,140,PG-13,PG-13,7.5,...,George Lucas,Action,Adventure,Fantasy,Ewan McGregor,Natalie Portman,Hayden Christensen,2700,10,10
1,I-1002,2015,tt0330373,Harry Potter and the Goblet of Fire,2005,11-18-2005,157,PG-13,PG-13,7.7,...,Mike Newell,Adventure,Family,Fantasy,Eric Sykes,Timothy Spall,David Tennant,2700,10,10
2,I-1003,2015,tt0363771,"The Chronicles of Narnia: The Lion, the Witc",2005,12-09-2005,143,PG,PG,6.9,...,Andrew Adamson,Adventure,Family,Fantasy,Georgie Henley,Skandar Keynes,William Moseley,2295,10,1
3,I-1004,2015,tt0383574,Pirates of the Caribbean: Dead Man's Chest,2006,07-07-2006,151,PG-13,PG-13,7.3,...,Gore Verbinski,Action,Adventure,Fantasy,Johnny Depp,Orlando Bloom,Keira Knightley,2295,10,1
4,I-1005,2015,tt0413300,Spider-Man 3,2007,05-04-2007,139,PG-13,PG-13,6.2,...,Sam Raimi,Action,Adventure,Sci-Fi,Tobey Maguire,Kirsten Dunst,James Franco,2295,10,1


## 여기까지 결측치 및 이상치 제거

In [89]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   item_id            106 non-null    object 
 1   contract_year      106 non-null    int64  
 2   movie_id           106 non-null    object 
 3   title              106 non-null    object 
 4   release_year       106 non-null    int64  
 5   release_date       106 non-null    object 
 6   runtime            106 non-null    int64  
 7   mpa_rating         106 non-null    object 
 8   mpa_rating_origin  106 non-null    object 
 9   imdb_score         106 non-null    float64
 10  votes              106 non-null    int64  
 11  reviews_users      106 non-null    int64  
 12  reviews_critics    106 non-null    int64  
 13  budget             106 non-null    int64  
 14  income_usa         106 non-null    int64  
 15  income_ww          106 non-null    int64  
 16  theater_opening    106 non

## 여기까지 결측치 및 이상치 제거

In [86]:
md.head()

Unnamed: 0,item_id,contract_year,movie_id,title,release_year,release_date,runtime,mpa_rating,mpa_rating_origin,imdb_score,...,director,Genre_1,Genre_2,Genre_3,actor_1,actor_2,actor_3,contract_price,studio_score,price_class
0,I-1001,2015,tt0121766,Star Wars: Episode III - Revenge of the Sith,2005,05-19-2005,140,PG-13,PG-13,7.5,...,George Lucas,Action,Adventure,Fantasy,Ewan McGregor,Natalie Portman,Hayden Christensen,2700,10,10
1,I-1002,2015,tt0330373,Harry Potter and the Goblet of Fire,2005,11-18-2005,157,PG-13,PG-13,7.7,...,Mike Newell,Adventure,Family,Fantasy,Eric Sykes,Timothy Spall,David Tennant,2700,10,10
2,I-1003,2015,tt0363771,"The Chronicles of Narnia: The Lion, the Witc",2005,12-09-2005,143,PG,PG,6.9,...,Andrew Adamson,Adventure,Family,Fantasy,Georgie Henley,Skandar Keynes,William Moseley,2295,10,1
3,I-1004,2015,tt0383574,Pirates of the Caribbean: Dead Man's Chest,2006,07-07-2006,151,PG-13,PG-13,7.3,...,Gore Verbinski,Action,Adventure,Fantasy,Johnny Depp,Orlando Bloom,Keira Knightley,2295,10,1
4,I-1005,2015,tt0413300,Spider-Man 3,2007,05-04-2007,139,PG-13,PG-13,6.2,...,Sam Raimi,Action,Adventure,Sci-Fi,Tobey Maguire,Kirsten Dunst,James Franco,2295,10,1


### IMDB's 가중치 비율
$$Weighted Rating(WR) = ({\frac{v}{v+m}\cdot R}) + ({\frac{m}{v+m}\cdot C}) $$

- v: 무비 리뷰 수
- m: 차트에 들어가기 위한 최소 리뷰 수
- R: 영화의 평균 rating
- C: 전체 리포트의 vote평균

In [90]:
md.columns

Index(['item_id', 'contract_year', 'movie_id', 'title', 'release_year',
       'release_date', 'runtime', 'mpa_rating', 'mpa_rating_origin',
       'imdb_score', 'votes', 'reviews_users', 'reviews_critics', 'budget',
       'income_usa', 'income_ww', 'theater_opening', 'theater_total',
       'country_1', 'studio', 'director', 'Genre_1', 'Genre_2', 'Genre_3',
       'actor_1', 'actor_2', 'actor_3', 'contract_price', 'studio_score',
       'price_class'],
      dtype='object')

In [69]:
vote_count = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_average = md[md['vote_average'].notnull()]['vote_average'].astype('int')

#quanitle은 4분위 데이터를 말한건데 0.95는 95% 부터의 데이터를 의미한다.
m = vote_count.quantile(0.96) 
C = vote_average.mean()

In [70]:
print(m, "/",C)

557057.64 / 6.113908872901678


In [77]:
## qualfied df생성
qualified = md[(md['vote_count'].notnull()) & (md['vote_average'].notnull()) & 
               (md['vote_count'] >= m)][['title', 'year', 'vote_count',
                                         'vote_average', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)

KeyError: "['genres'] not in index"

In [10]:
## 점수를 만드는 함수
def weight_rating(x):
    v = x['vote_count']
    r = x['vote_average']
    return (v/(v+m)*r) + (m/(m+v)*C)

In [11]:
## qualified['wr']을 만들어서 해당 칼럼을 기준으로 정렬한다.(weight_rating)
qualified['wr'] = qualified.apply(weight_rating, axis=1)
qualified = qualified.sort_values(by='wr', ascending=False).head(250)

In [12]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.891568
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.876324
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.864948
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.845075
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.832214


In [13]:
## genre라는 데이터를 빼서 이를 통해 한 데이터에 몰아넣는다.
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().\
    reset_index(level=1, drop=True)
s.name = 'genre'
# gen_md == genre movie data
gen_md = md.drop("genres", axis=1).join(s)

In [14]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) 
                   & (df['vote_average'].notnull())]\
                    [['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

### 결과

In [15]:
### top romance movie
build_chart('Horror').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1213,The Shining,1980,3890,8,19.6116,7.901294
1176,Psycho,1960,2405,8,36.8263,7.843335
1171,Alien,1979,4564,7,23.3774,6.941936
41492,Split,2016,4461,7,28.920839,6.940631
14236,Zombieland,2009,3655,7,11.063,6.927969
1158,Aliens,1986,3282,7,21.7612,6.920081
21276,The Conjuring,2013,3169,7,14.9017,6.917338
42169,Get Out,2017,2978,7,36.894806,6.912248
1338,Jaws,1975,2628,7,19.7261,6.901088
8147,Shaun of the Dead,2004,2479,7,14.9029,6.895426


# 2. 콘텐츠 기반 알고리즘

## based on
- Movie Overviews and Taglines
- Movie Cast, Crew, Keywords and Genre

In [16]:
links_small = pd.read_csv("Data_kaggle/links_small.csv")
links_small.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data_kaggle/links_small.csv'

In [None]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [None]:
links_small.head()

In [None]:
# 결측치
md = md.drop([19730, 29503, 35587])

In [None]:
# 새로운 데이터 만들기 tmdbid 연결
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]
smd.shape

In [None]:
## 문장 분석을 위한 vectorized
# 결측치 제거

smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

$$tf - idf(d, t) = tf(d, t) \cdot idf(t)$$

In [None]:
# min_df는 토큰 무시 정도
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2), 
                     min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [None]:
tfidf_matrix.shape

### Cosine Similarty
- 두 영화 사이의 유사성 계산
-$cosine(x, y) - (x.y^T)/(||x|| \cdot||y||)$

In [None]:
# 해당 코드의 변경으로 인해 업데이트 됨
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

In [None]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

## 추천 함수 (줄거리)

In [None]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:31] # 입력된 값과 비슷한 몇 개만 가져오
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [None]:
movie='3 Idiots'
print("Description of the Movie: ", movie)
print('---------------------------------------------------------------------')
print(smd[smd['title']==movie]['overview'])

In [None]:
get_recommendations('3 Idiots').head(10)

In [None]:
movie='The Dark Knight'
print("Description of the Movie: ", movie)
print('---------------------------------------------------------------------')
print(smd[smd['title']==movie]['overview'])

In [None]:
get_recommendations('The Dark Knight').head(20)

### 감독, 배우 등을 고려해서 추천

In [None]:
credits = pd.read_csv('Data_kaggle/credits.csv', error_bad_lines=False, engine = "python")
keywords = pd.read_csv('Data_kaggle/keywords.csv', error_bad_lines=False, engine = "python")

In [None]:
keywords.head()

In [None]:
credits.head()

In [None]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

## 데이터 개수 확인 
- 안에 있는 지를 확인 .isin

In [None]:
smd = md[md['id'].isin(links_small)]
smd.shape

In [None]:
#cast데이터를 ''문자열안에 있는 document를 진짜 document로 만든다.
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [None]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [None]:
# 감독 이름 넣기
smd['director'] = smd['crew'].apply(get_director)

# 배우 이름 넣기
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# 가장 영향력 있는 3명의 배우만 넣기
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

# 키워드도 넣어준다.
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# 단어들을 원형으로 만들어주고 띄어쓰기를 없앤다
## director는 강조를 위해 세 번 반복해서 적는다.

smd['cast'] = smd['cast'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype(str).apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

### Keyword 전처리

In [None]:
## keywords라는 인덱스를 기준으로 각 영화마다의 장르를 동일한 인덱스로 표현할 수 있다.
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keywords'
s = s.value_counts()

In [None]:
s[:10]

In [None]:
# 키워드가 한 개밖에 없는 것은 제거
s = s[s>1]

In [None]:
# stem을 통해 원형을 받을 수 있다.
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

In [None]:
# 키워들르 골라내 인덱스에 담는다.
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [None]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)

In [None]:
# 단어를 원형으로 만들어주고 띄어쓰기를 없애준다.

smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [None]:
smd['keywords']

In [None]:
# 전체적인 데이터들을 합쳐 soup로 만들어준다.

smd['soup'] = smd['keywords']+smd['cast']+smd['director']+smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [None]:
# 단어별로 벡터화 해주기

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [None]:
# 코사인 유사도

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [None]:
get_recommendations('The Dark Knight').head(15)

### 리뷰 수 기반으로 버림

In [None]:
## 영화 추천 중에 쓰레기 같은 영화는 버린다.

def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weight_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [None]:
improved_recommendations('The Dark Knight').head(15)

## 3. 협업 필터링

In [None]:
reader = Reader()
ratings = pd.read_csv('Data_kaggle/ratings_small.csv')
ratings.head()

In [None]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],cv=5)

- rmse가 0.8944이면 우리가 하려는 데이터에 충분하다.
- 이제 우리의 데이터 셋을 예측을 위해 train한다.

In [None]:
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
ratings[ratings['userId'] == 1]

In [None]:
svd.predict(1, 302, 3)

In [None]:
## user_평점 만든다.

user_rating=pd.merge(ratings,md,left_on='movieId',right_on='id',how='inner')
user_ratings_final=user_rating[['userId', 'movieId', 'rating','original_title']]
user_ratings=user_ratings_final.sort_values(by='userId')
user_ratings.head()

In [None]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [None]:
id_map = pd.read_csv('../input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [None]:
indices_map = id_map.set_index('id')

In [None]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [91]:
hybrid(1, 'Avatar')

NameError: name 'hybrid' is not defined