In [1]:
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
import random
import re
import matplotlib.pyplot as plt # data visualization library
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS #used to generate world cloud

In [2]:
#데이터 불러오기
movies= pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
#max 컬럼수 조정
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [4]:
#각 데이터 미리보기
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [6]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455
...,...,...,...,...
1093355,162521,66934,Neil Patrick Harris,1427311611
1093356,162521,103341,cornetto trilogy,1427311259
1093357,162534,189169,comedy,1527518175
1093358,162534,189169,disabled,1527518181


In [7]:
#영화 이름에서 출시일 추출
movies['movie_year'] = movies['title']
movies['movie_year'] = movies['movie_year'].str.extract(r"\(([0-9]+)\)", expand=False)

#영화 이름에서 이름 추출
movies['title_only'] = movies['title']
movies['title_only'] = movies['title_only'].str.extract('(.*?)\s*\(', expand=False)

movies

Unnamed: 0,movieId,title,genres,movie_year,title_only
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,Father of the Bride Part II
...,...,...,...,...,...
62418,209157,We (2018),Drama,2018,We
62419,209159,Window of the Soul (2001),Documentary,2001,Window of the Soul
62420,209163,Bad Poems (2018),Comedy|Drama,2018,Bad Poems
62421,209169,A Girl Thing (2001),(no genres listed),2001,A Girl Thing


In [8]:
#Drop all rows containing incorrect year values - such as 0, 6, 69, 500 and -2147483648, 잘못된 출시일 드랍
movies.drop(movies[movies.movie_year == 'NaN'].index, inplace=True)
movies.drop(movies[movies.movie_year == '0'].index, inplace=True)
movies.drop(movies[movies.movie_year == '6'].index, inplace=True)
movies.drop(movies[movies.movie_year == '06'].index, inplace=True)
movies.drop(movies[movies.movie_year == '69'].index, inplace=True)
movies.drop(movies[movies.movie_year == '500'].index, inplace=True)
movies.drop(movies[movies.movie_year == '-2147483648'].index, inplace=True)

movies.drop(movies[movies.movie_year == 0].index, inplace=True)
movies.drop(movies[movies.movie_year == 6].index, inplace=True)
movies.drop(movies[movies.movie_year == 69].index, inplace=True)
movies.drop(movies[movies.movie_year == 500].index, inplace=True)
movies.drop(movies[movies.movie_year == -2147483648].index, inplace=True)

movies

Unnamed: 0,movieId,title,genres,movie_year,title_only
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,Father of the Bride Part II
...,...,...,...,...,...
62418,209157,We (2018),Drama,2018,We
62419,209159,Window of the Soul (2001),Documentary,2001,Window of the Soul
62420,209163,Bad Poems (2018),Comedy|Drama,2018,Bad Poems
62421,209169,A Girl Thing (2001),(no genres listed),2001,A Girl Thing


In [9]:
#convert the string values to numeric (년도 숫자형으로 변경)
movies['movie_year'] = pd.to_numeric(movies['movie_year'])

In [10]:
#2015년 이후 출시된 영화추출
movie2015_index = movies[movies['movie_year'] < 2015].index
movie2015 = movies.drop(movie2015_index)

movie2015

Unnamed: 0,movieId,title,genres,movie_year,title_only
15036,79607,"Millions Game, The (Das Millionenspiel)",Action|Drama|Sci-Fi|Thriller,,"Millions Game, The"
21697,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation
22072,113345,Jupiter Ascending (2015),Action|Adventure|Sci-Fi,2015.0,Jupiter Ascending
22595,115713,Ex Machina (2015),Drama|Sci-Fi|Thriller,2015.0,Ex Machina
23218,117466,In the Heart of the Sea (2015),Action|Adventure|Drama,2015.0,In the Heart of the Sea
...,...,...,...,...,...
62412,209143,The Painting (2019),Animation|Documentary,2019.0,The Painting
62413,209145,Liberté (2019),Drama,2019.0,Liberté
62415,209151,Mao Zedong 1949 (2019),(no genres listed),2019.0,Mao Zedong 1949
62418,209157,We (2018),Drama,2018.0,We


In [11]:
#movie year NaN 삭제
movie2015 = movie2015.drop(index=15036, axis=0)

In [12]:
#rating에서 tumestamp 삭제
del ratings['timestamp']

In [13]:
#2015년 영화에 대한 rating
movie2015_rating = movie2015.merge(ratings, on = 'movieId', how = 'inner')
movie2015_rating

Unnamed: 0,movieId,title,genres,movie_year,title_only,userId,rating
0,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,4,3.5
1,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,119,4.5
2,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,243,2.0
3,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,248,3.5
4,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,253,3.5
...,...,...,...,...,...,...,...
832099,209143,The Painting (2019),Animation|Documentary,2019.0,The Painting,145795,1.0
832100,209145,Liberté (2019),Drama,2019.0,Liberté,145795,1.0
832101,209151,Mao Zedong 1949 (2019),(no genres listed),2019.0,Mao Zedong 1949,125475,3.5
832102,209157,We (2018),Drama,2018.0,We,119571,1.5


In [14]:
movie2015_rating.describe()

Unnamed: 0,movieId,movie_year,userId,rating
count,832104.0,820374.0,832104.0,832104.0
mean,153933.51118,2016.189887,80246.887265,3.529311
std,24292.970598,1.145163,46752.252181,1.056876
min,111781.0,2015.0,3.0,0.5
25%,134130.0,2015.0,39440.0,3.0
50%,155581.0,2016.0,78939.0,3.5
75%,173007.0,2017.0,120829.0,4.0
max,209163.0,2019.0,162538.0,5.0


In [15]:
movie2015_rating = movie2015_rating.dropna()
movie2015_rating

Unnamed: 0,movieId,title,genres,movie_year,title_only,userId,rating
0,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,4,3.5
1,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,119,4.5
2,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,243,2.0
3,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,248,3.5
4,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller,2015.0,Mission: Impossible - Rogue Nation,253,3.5
...,...,...,...,...,...,...,...
832099,209143,The Painting (2019),Animation|Documentary,2019.0,The Painting,145795,1.0
832100,209145,Liberté (2019),Drama,2019.0,Liberté,145795,1.0
832101,209151,Mao Zedong 1949 (2019),(no genres listed),2019.0,Mao Zedong 1949,125475,3.5
832102,209157,We (2018),Drama,2018.0,We,119571,1.5


In [16]:
# 82만개의 rating 중 10만개 랜덤 추출
movie2015_rating = movie2015_rating.sample(n=100000)
movie2015_rating

Unnamed: 0,movieId,title,genres,movie_year,title_only,userId,rating
453921,159858,The Conjuring 2 (2016),Horror,2016.0,The Conjuring 2,19024,3.0
220998,134170,Kung Fury (2015),Action|Comedy|Fantasy|Sci-Fi,2015.0,Kung Fury,5503,4.0
90074,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,2016.0,Deadpool,8992,2.5
443149,158966,Captain Fantastic (2016),Drama,2016.0,Captain Fantastic,28036,4.5
313062,140247,The Gift (2015),Drama|Horror,2015.0,The Gift,80571,4.0
...,...,...,...,...,...,...,...
486321,162578,Kubo and the Two Strings (2016),Adventure|Animation|Children|Fantasy,2016.0,Kubo and the Two Strings,94154,3.0
699125,180095,Wonder (2017),Drama,2017.0,Wonder,102451,4.5
493123,162606,The Accountant (2016),Crime|Drama|Thriller,2016.0,The Accountant,117985,3.5
609097,170813,Baywatch (2017),Action|Comedy,2017.0,Baywatch,84482,4.0


In [17]:
#top200 영화 추출
count_rated =movie2015_rating.groupby('title')[['userId']].count()
top200_count = count_rated.nlargest(199, 'userId')
top200_movie = top200_count.merge(movie2015, on='title', how ='inner')

top200_movie

Unnamed: 0,title,userId,movieId,genres,movie_year,title_only
0,The Martian (2015),2033,134130,Adventure|Drama|Sci-Fi,2015.0,The Martian
1,Deadpool (2016),1696,122904,Action|Adventure|Comedy|Sci-Fi,2016.0,Deadpool
2,Mad Max: Fury Road (2015),1683,122882,Action|Adventure|Sci-Fi|Thriller,2015.0,Mad Max: Fury Road
3,Inside Out (2015),1672,134853,Adventure|Animation|Children|Comedy|Drama|Fantasy,2015.0,Inside Out
4,Ex Machina (2015),1565,115713,Drama|Sci-Fi|Thriller,2015.0,Ex Machina
5,Star Wars: Episode VII - The Force Awakens (2015),1510,122886,Action|Adventure|Fantasy|Sci-Fi|IMAX,2015.0,Star Wars: Episode VII - The Force Awakens
6,Arrival (2016),1237,164179,Sci-Fi,2016.0,Arrival
7,Kingsman: The Secret Service (2015),1152,119145,Action|Adventure|Comedy|Crime,2015.0,Kingsman: The Secret Service
8,Zootopia (2016),1030,152081,Action|Adventure|Animation|Children|Comedy,2016.0,Zootopia
9,"Big Short, The (2015)",904,148626,Drama,2015.0,"Big Short, The"


In [18]:
#userid count 삭제
top200_movie = top200_movie.drop(['userId'], axis =1)


In [19]:
#top200 movie 저장
top200_movie.to_csv('top200_movie', mode='w')

In [20]:
#rating2015/100000 부가 정보 드랍
rating2015_100000 = movie2015_rating.drop(columns=['title', 'genres', 'movie_year', 'title_only'], axis = 1)
#top200 movie rating 머지
top200_movie_rating = top200_movie.merge(rating2015_100000, on = 'movieId', how = 'inner')
top200_movie_rating

Unnamed: 0,title,movieId,genres,movie_year,title_only,userId,rating
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,132215,5.0
1,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,83444,4.5
2,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,97279,5.0
3,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,155512,5.0
4,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,63066,4.0
...,...,...,...,...,...,...,...
64622,Molly's Game (2017),180045,Drama,2017.0,Molly's Game,92692,4.0
64623,Molly's Game (2017),180045,Drama,2017.0,Molly's Game,118906,1.0
64624,Molly's Game (2017),180045,Drama,2017.0,Molly's Game,77506,3.0
64625,Molly's Game (2017),180045,Drama,2017.0,Molly's Game,152866,2.0


In [21]:
#top200 전 사용자별 영화 리뷰 카운트
user_review_count_top200 = top200_movie_rating.groupby('userId')[['movieId']].count()
user_review_count_top200.describe()

Unnamed: 0,movieId
count,21668.0
mean,2.982601
std,2.975279
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,27.0


In [22]:
#10개 이상 리뷰 남긴 사용자 아이디 추출
is_review_over10 = user_review_count_top200['movieId'] >= 10
userId_review_over10 = user_review_count_top200[is_review_over10]
userId_review_over10 = userId_review_over10.drop(columns=['movieId'])

In [23]:
userId_review_over10

541
606
901
906
997
...
161919
162153
162271
162334
162349


In [24]:
#리뷰 10개 이상 남긴 사용자에 대한 데이터만 남기기
top200_movie_rating = top200_movie_rating.merge(userId_review_over10, on='userId', how='inner')
top200_movie_rating

Unnamed: 0,title,movieId,genres,movie_year,title_only,userId,rating
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7439,4.0
1,Zootopia (2016),152081,Action|Adventure|Animation|Children|Comedy,2016.0,Zootopia,7439,4.5
2,Spotlight (2015),142488,Thriller,2015.0,Spotlight,7439,5.0
3,Passengers (2016),166635,Adventure|Drama|Romance|Sci-Fi,2016.0,Passengers,7439,3.5
4,Lady Bird (2017),177615,Comedy,2017.0,Lady Bird,7439,4.0
...,...,...,...,...,...,...,...
12585,The Witch (2015),140267,Horror,2015.0,The Witch,6553,4.5
12586,War Dogs (2016),161131,Comedy,2016.0,War Dogs,6553,2.5
12587,War for the Planet of the Apes (2017),173145,Action|Adventure|Drama|Sci-Fi,2017.0,War for the Planet of the Apes,6553,1.5
12588,The Conjuring 2 (2016),159858,Horror,2016.0,The Conjuring 2,6553,0.5


In [25]:
#top200 영화에 대한 사용자 평점 행렬 저장
top200_movie_rating.to_csv('top200_movie_rating', mode='w')

In [26]:
#top200_movie_rating에서 사용자 아이디만 추출
user_list = top200_movie_rating['userId'].unique()
user_list = pd.DataFrame(user_list, columns=['userId'])
user_list

Unnamed: 0,userId
0,7439
1,144980
2,100230
3,162153
4,72265
...,...
956,123159
957,158028
958,2389
959,78281


In [27]:
#전체 태그 필터링 사용자/영화 Id
top200_tags = top200_movie.merge(tags, on = 'movieId', how = 'inner')
top200_tags = top200_tags.merge(user_list, on = 'userId', how = 'inner')
top200_tags

Unnamed: 0,title,movieId,genres,movie_year,title_only,userId,tag,timestamp
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7082,mars,1553637106
1,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7082,overrated,1553637089
2,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7082,scientific,1553637097
3,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7082,space,1553637094
4,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7082,Survival Instinct,1553637103
...,...,...,...,...,...,...,...,...
15002,The Disaster Artist (2017),180297,Comedy|Drama,2017.0,The Disaster Artist,997,hollywood,1529752734
15003,The Disaster Artist (2017),180297,Comedy|Drama,2017.0,The Disaster Artist,997,movie business,1529752732
15004,Molly's Game (2017),180045,Drama,2017.0,Molly's Game,91560,courtroom,1525139037
15005,Molly's Game (2017),180045,Drama,2017.0,Molly's Game,91560,poker,1525139009


In [28]:
#필터링된 태그 저장
top200_tags.to_csv('top200_tags', mode='w')

In [29]:
#영화-장르 (장르 분해)

In [30]:
#각 장르 나눠서 새 행 만들기
result = top200_movie['genres'].str.split('|')
result = result.apply(lambda x: pd.Series(x))
result.stack()
result.stack().reset_index(level=1, drop=True)
result = result.stack().reset_index(level=1, drop=True).to_frame('genres_single')
result

Unnamed: 0,genres_single
0,Adventure
0,Drama
0,Sci-Fi
1,Action
1,Adventure
...,...
197,Fantasy
197,Horror
197,Thriller
198,Drama


In [31]:
#top200_movie 각 장르별 표기
top200_movie_single = top200_movie.merge(result, left_index=True, right_index=True, how='left')
top200_movie_single

Unnamed: 0,title,movieId,genres,movie_year,title_only,genres_single
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,Adventure
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,Drama
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,Sci-Fi
1,Deadpool (2016),122904,Action|Adventure|Comedy|Sci-Fi,2016.0,Deadpool,Action
1,Deadpool (2016),122904,Action|Adventure|Comedy|Sci-Fi,2016.0,Deadpool,Adventure
...,...,...,...,...,...,...
197,The Mummy (2017),170827,Action|Adventure|Fantasy|Horror|Thriller,2017.0,The Mummy,Fantasy
197,The Mummy (2017),170827,Action|Adventure|Fantasy|Horror|Thriller,2017.0,The Mummy,Horror
197,The Mummy (2017),170827,Action|Adventure|Fantasy|Horror|Thriller,2017.0,The Mummy,Thriller
198,13 Hours (2016),138210,Drama,2016.0,13 Hours,Drama


In [32]:
#싱글 장르별 표기 저장
top200_movie_genre = top200_movie_single
top200_movie_genre.to_csv('top200_movie_genre', mode='w')

In [33]:
#사용자별 영화 선호도 top3 추출


In [34]:
#각 장르 나눠서 새 행 만들기
result = top200_movie_rating['genres'].str.split('|')
result = result.apply(lambda x: pd.Series(x))
result.stack()
result.stack().reset_index(level=1, drop=True)
result = result.stack().reset_index(level=1, drop=True).to_frame('genres_single')
result

Unnamed: 0,genres_single
0,Adventure
0,Drama
0,Sci-Fi
1,Action
1,Adventure
...,...
12589,Action
12589,Comedy
12589,Horror
12589,Sci-Fi


In [35]:
#top200_movie_rating 각 장르별 표기
top200_movie_rating_genre = top200_movie_rating.merge(result, left_index=True, right_index=True, how='left')
top200_movie_rating_genre

Unnamed: 0,title,movieId,genres,movie_year,title_only,userId,rating,genres_single
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7439,4.0,Adventure
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7439,4.0,Drama
0,The Martian (2015),134130,Adventure|Drama|Sci-Fi,2015.0,The Martian,7439,4.0,Sci-Fi
1,Zootopia (2016),152081,Action|Adventure|Animation|Children|Comedy,2016.0,Zootopia,7439,4.5,Action
1,Zootopia (2016),152081,Action|Adventure|Animation|Children|Comedy,2016.0,Zootopia,7439,4.5,Adventure
...,...,...,...,...,...,...,...,...
12589,Upgrade (2018),189203,Action|Comedy|Horror|Sci-Fi|Thriller,2018.0,Upgrade,6553,3.0,Action
12589,Upgrade (2018),189203,Action|Comedy|Horror|Sci-Fi|Thriller,2018.0,Upgrade,6553,3.0,Comedy
12589,Upgrade (2018),189203,Action|Comedy|Horror|Sci-Fi|Thriller,2018.0,Upgrade,6553,3.0,Horror
12589,Upgrade (2018),189203,Action|Comedy|Horror|Sci-Fi|Thriller,2018.0,Upgrade,6553,3.0,Sci-Fi


In [36]:
top200_user_genre = top200_movie_rating_genre.groupby(by=['userId', 'genres_single'], as_index=False).count()
top200_user_genre.head(200)

Unnamed: 0,userId,genres_single,title,movieId,genres,movie_year,title_only,rating
0,541,Action,7,7,7,7,7,7
1,541,Adventure,8,8,8,8,8,8
2,541,Animation,2,2,2,2,2,2
3,541,Children,1,1,1,1,1,1
4,541,Comedy,2,2,2,2,2,2
5,541,Crime,2,2,2,2,2,2
6,541,Drama,6,6,6,6,6,6
7,541,Fantasy,3,3,3,3,3,3
8,541,IMAX,1,1,1,1,1,1
9,541,Romance,1,1,1,1,1,1


In [37]:
top200_user_genre = top200_user_genre.drop(columns=['movieId', 'genres', 'movie_year', 'title_only', 'rating'], axis=1)
top200_user_genre

Unnamed: 0,userId,genres_single,title
0,541,Action,7
1,541,Adventure,8
2,541,Animation,2
3,541,Children,1
4,541,Comedy,2
...,...,...,...
10638,162349,Fantasy,2
10639,162349,Horror,2
10640,162349,Romance,2
10641,162349,Sci-Fi,5


In [38]:
#top200_user_genre 저장
top200_user_genre.to_csv('top200_user_genre', mode='w')

In [39]:
#top200_user_genre.head(200)

In [40]:
top085_user_genre['title_x'] >= top085_user_genre['title_y']

NameError: name 'top085_user_genre' is not defined

In [41]:
#0.85 분위 이상의 장르만 남기기
top085 = top200_user_genre.groupby(['userId'])['title'].quantile(q=0.85, interpolation='nearest')
top070 = top200_user_genre.groupby(['userId'])['title'].quantile(q=0.7, interpolation='nearest')
top050 = top200_user_genre.groupby(['userId'])['title'].quantile(q=0.5, interpolation='nearest')

user_genre_quantile = top200_user_genre.merge(top085, on = 'userId', how ='inner')
user_genre_quantile = user_genre_quantile.merge(top070, on = 'userId', how ='inner')
user_genre_quantile = user_genre_quantile.merge(top050, on = 'userId', how ='inner')
user_genre_quantile.columns=['userId', 'genres_single', 'title', 'title085', 'title070', 'title050']
user_genre_quantile

  return merge(


Unnamed: 0,userId,genres_single,title,title085,title070,title050
0,541,Action,7,6,5,2
1,541,Adventure,8,6,5,2
2,541,Animation,2,6,5,2
3,541,Children,1,6,5,2
4,541,Comedy,2,6,5,2
...,...,...,...,...,...,...
10638,162349,Fantasy,2,5,3,2
10639,162349,Horror,2,5,3,2
10640,162349,Romance,2,5,3,2
10641,162349,Sci-Fi,5,5,3,2


In [42]:
#0.85, 0.70 분위 이상의 장르만 남기기
top070_user_genre = user_genre_quantile[user_genre_quantile['title'] >= user_genre_quantile['title070']]
top070_user_genre.to_csv('top070_user_genre', mode='w')

In [72]:
#0.85 분위 이상의 장르만 남기기
top085_user_genre = user_genre_quantile[user_genre_quantile['title'] >= user_genre_quantile['title085']]
top085_user_genre.to_csv('top085_user_genre', mode='w')

In [73]:
#0.70 분위 이상의 장르만 남기기
top070_user_genre = user_genre_quantile[(user_genre_quantile['title'] >= user_genre_quantile['title070']) & (user_genre_quantile['title'] < user_genre_quantile['title085'])]
top070_user_genre.to_csv('top070_user_genre', mode='w')

In [74]:
#0.50 분위 이상의 장르만 남기기
top050_user_genre = user_genre_quantile[(user_genre_quantile['title'] >= user_genre_quantile['title050']) & (user_genre_quantile['title'] < user_genre_quantile['title070'])]
top050_user_genre.to_csv('top050_user_genre', mode='w')