# kaggle movie data recommender system

In [1]:
from scipy import spatial

### 1. 데이터 로드

In [1]:
rating_df = pd.read_csv("ratings_small.csv")
rating_df.drop("timestamp", axis=1, inplace=True)
rating_df.tail()

Unnamed: 0,userId,movieId,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5


In [2]:
links_df = pd.read_csv("links_small.csv")
links_df = links_df.dropna()
# pd.options.display.float_format = '{:.0f}'.format
links_df["tmdbId"] = links_df["tmdbId"].astype('int64')
links_df.tail()

Unnamed: 0,movieId,imdbId,tmdbId
9120,162672,3859980,402672
9121,163056,4262980,315011
9122,163949,2531318,391698
9123,164977,27660,137608
9124,164979,3447228,410803


In [3]:
metadata_df = pd.read_csv("movies_metadata.csv", low_memory=False)
metadata = metadata_df[["id", "original_title", "title", "runtime"]]
metadata_df.tail()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0
45465,False,,0,[],,461257,tt6980792,en,Queerama,50 years after decriminalisation of homosexual...,...,2017-06-09,0.0,75.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Queerama,False,0.0,0.0


### 2. movieId 값으로 영화 정보 출력

In [4]:
def id_to_movie(id_num):
    tmdbId = links_df.loc[links_df["movieId"] == id_num]["tmdbId"].values[0]
    movie_info = metadata_df.loc[metadata_df["id"] == str(tmdbId)]
    return movie_info

In [5]:
id_to_movie(6268)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
6130,False,,800000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.lhp.com.sg/victor/,25461,tt0316188,en,Raising Victor Vargas,"The film follows Victor, a Lower East Side tee...",...,2002-05-16,2816116.0,88.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Raising Victor Vargas,False,7.8,14.0


### 3. 데이터 살펴보기
unique count
- rating
- user
- movie

In [6]:
u_user = rating_df["userId"].unique()
u_movie = rating_df["movieId"].unique()
u_rating = rating_df["rating"].unique()

print(sorted(u_rating))

len(u_user), len(u_movie), len(u_rating)

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


(671, 9066, 10)

In [7]:
rating_df.groupby("rating").size().reset_index(name = "rating_count")

Unnamed: 0,rating,rating_count
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [8]:
user_counts_df = rating_df.groupby("userId").size().reset_index(name="user_rating_count")
user_count_df = user_counts_df.sort_values(by=["user_rating_count"], ascending=False)
user_count_df.head()

Unnamed: 0,userId,user_rating_count
546,547,2391
563,564,1868
623,624,1735
14,15,1700
72,73,1610


In [9]:
movie_counts_df = rating_df.groupby("movieId").size().reset_index(name="movie_rating_count")
movie_counts_df = movie_counts_df.sort_values(by=["movie_rating_count"], ascending=False)
movie_counts_df.head()

Unnamed: 0,movieId,movie_rating_count
321,356,341
266,296,324
284,318,311
525,593,304
232,260,291


### 4. 전처리
- 데이터셋을 줄이기

In [10]:
user_limit, movie_limit = 100, 100

In [11]:
# 671 -> 258
filtered_userId = user_counts_df[user_counts_df["user_rating_count"] > user_limit]["userId"]
filtered_userId = list(filtered_userId)
len(filtered_userId), filtered_userId[:5]

(258, [4, 8, 15, 17, 19])

In [12]:
# 9066 -> 149
filtered_movieId = movie_counts_df[movie_counts_df["movie_rating_count"] > movie_limit]["movieId"]
filtered_movieId = list(filtered_movieId)
len(filtered_movieId), filtered_movieId[:5]

(149, [356, 296, 318, 593, 260])

In [13]:
filtered_df = rating_df[rating_df["userId"].isin(filtered_userId)]
filtered_df = filtered_df[filtered_df["movieId"].isin(filtered_movieId)]

In [14]:
len(filtered_df)

15567

### pivot
- user-base로 데이터 프레임 만듦

In [15]:
user_df = filtered_df.pivot_table(values="rating", index="userId", columns="movieId",
                                 aggfunc=np.average, fill_value=0, dropna=False)
user_df.tail()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
659,0.0,0.0,3.0,0.0,5.0,4.0,0.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,5.0
665,0.0,3.0,0.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
671,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
import pandas as pd

In [18]:
user_df.to_csv("user_df.csv")

### 5. 유사도 함수 작성

In [19]:
def euclidean_similarity(vector_1, vector_2):
    
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx) == 0:
        return
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    return np.linalg.norm(vector_1 - vector_2)

In [20]:
euclidean_similarity(user_df.loc[4], user_df.loc[8])

4.8218253804964775

In [21]:
def cosine_similarity(vector_1, vector_2):
    
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx) == 0:
        return
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [22]:
cosine_similarity(user_df.loc[4], user_df.loc[8])

0.9911164579376771

### 6. 유사도 행렬 만들기

In [35]:
def similarity_matrix(user_df, similarity_func):
    
    index = user_df.index
    
    matrix = []
    
    for idx_1, value_1 in user_df.iterrows():
        row = []
        for idx_2, value_2 in user_df.iterrows():
            row.append(similarity_func(value_1, value_2))
        matrix.append(row)
        
    return pd.DataFrame(matrix, columns=index, index=index)

In [36]:
len(user_df)

258

In [37]:
sm_df = similarity_matrix(user_df, cosine_similarity)

### 7. 예측 매트릭스 만들기

In [44]:
def mean_score(df, sm_df, target, closer_count):
    
    # 유사도가 가까운 데이터를 선별하기
    ms_df = sm_df.drop(target)
    ms_df = ms_df.sort_values(target, ascending=True)
    ms_df = ms_df[target][:closer_count]
    
    # user_df의 실제 영화 평점 데이터 가져오기
    ms_df = df.loc[ms_df.index]
    
    # 결과 데이터 생성
    pred_df = pd.DataFrame(columns=df.columns)
    pred_df.loc["user"] = df.loc[target]
    pred_df.loc["mean"] = ms_df.mean()
    
    return pred_df

In [45]:
# test code (userId: 4)
pred_df = mean_score(user_df, sm_df, 4, 5)
pred_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1.0,1.1,0.0,0.0,0.0,0.0,1.1,0.0,0.5,0.5,...,1.4,2.3,0.1,0.0,0.8,0.0,1.8,0.0,0.0,0.7


### 8. Recommend - 영화 추천

In [50]:
def recommend(pred_df, r_count=10):
    recommend_df = pred_df.T
    recommend_df = recommend_df[recommend_df["user"] == 0]
    recommend_df = recommend_df.sort_values("mean", ascending=False)
    
    return list(recommend_df.index)[:5]

In [52]:
movie_ids = recommend(pred_df)
movie_ids

[2959, 4973, 6539, 318, 8961]

### 9. movieId 값으로 영화 정보 가져오기

In [53]:
def movie_info(movieIds):
    datas = []
    for movie_Id in movieIds:
        data = id_to_movie(movie_Id).to_dict('records')[0]
        datas.append(data)
    return pd.DataFrame(datas)

In [54]:
df = movie_info(movie_ids)
df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,,63000000,"[{'id': 18, 'name': 'Drama'}]",http://www.foxmovies.com/movies/fight-club,550,tt0137523,en,Fight Club,A ticking-time-bomb insomniac and a slippery s...,...,1999-10-15,100853753.0,139.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Mischief. Mayhem. Soap.,Fight Club,False,8.3,9678.0
1,False,,10000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.die-fabelhafte-welt-der-amelie.de,194,tt0211915,fr,Le fabuleux destin d'Amélie Poulain,"At a tiny Parisian café, the adorable yet pain...",...,2001-04-25,173921954.0,122.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,One person can change your life forever.,Amélie,False,7.8,3403.0
2,False,"{'id': 295, 'name': 'Pirates of the Caribbean ...",140000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://disney.go.com/disneyvideos/liveaction/p...,22,tt0325980,en,Pirates of the Caribbean: The Curse of the Bla...,"Jack Sparrow, a freewheeling 17th-century pira...",...,2003-07-09,655011224.0,143.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Prepare to be blown out of the water.,Pirates of the Caribbean: The Curse of the Bla...,False,7.5,7191.0
3,False,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,...,1994-09-23,28341469.0,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0
4,False,"{'id': 468222, 'name': 'The Incredibles Collec...",92000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://disney.go.com/disneyvideos/animatedfilm...,9806,tt0317705,en,The Incredibles,Bob Parr has given up his superhero days to lo...,...,2004-11-05,631442092.0,115.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,"No gut, no glory",The Incredibles,False,7.4,5290.0


### 10. 실행 함수로 작성

In [None]:
def run(df, similarity_func, target, closer_count):
    
    # 유사도 행렬 데이터 만들기
    sm_df = similarity_matrix(df, similarity_func)