In [119]:
# 추천 시스템
# TF-IDF와 코사인 유사도로 추천 시스템 구현
# 원리 : 유사한 내용 추천으로 유사도가 높은 것 찾기
#     영화 줄거리(overview)가 비슷한 영화를 찾아주기 

In [120]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [121]:
# 데이터 준비
dataDF1 = pd.read_csv("movies_metadata.csv", low_memory=False)
dataDF1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [122]:
dataDF1 = dataDF1.dropna(subset=["id","title","overview"]).reset_index(drop=True) # 결측치 제거 

In [123]:
# 10000개 데이터만 사용
dataDF2 = dataDF1.head(10000)[["id","title","overview"]]

In [124]:
title_to_index = dict(zip(dataDF2['title'], dataDF2.index))
# 영화 제목 Father of the Bride Part II의 인덱스를 리턴
idx = title_to_index['Father of the Bride Part II']
print(idx)

4


In [125]:
# TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(dataDF2['overview'])
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

In [126]:
print(f'TF-IDF 행렬의 크기(shape) : {tfidf_matrix.shape}')
print(f'코사인 유사도 연산 결과 : {cosine_sim.shape}')

TF-IDF 행렬의 크기(shape) : (10000, 32382)
코사인 유사도 연산 결과 : (10000, 10000)


In [127]:
import numpy as np

tfidf_matrix.toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [128]:
cosine_sim[:10], dataDF2.loc[:10, "title"]

(array([[1.        , 0.01683474, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.01683474, 1.        , 0.04873996, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.04873996, 1.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.00686403, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.1072663 , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 0                       Toy Story
 1                         Jumanji
 2                Grumpier Old Men
 3               Waiting to Exhale
 4     Father of the Bride Part II
 5                            Heat
 6                         Sabrina
 7                    Tom and Huck
 8                    Sudden Death
 9                       GoldenEye
 10         The American President
 Name: title, dtype: obj

In [137]:
sim_scores = list(enumerate(cosine_sim[idx])) # 리스트 생성
#print(sim_scores)

sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # 유사도가 제일 높은 것으로 정렬
#print(sim_scores)

sim_scores = sim_scores[1:11] # 성적 중에서 2등~11등만 뽑아옴(1등은 자기 자신이니까 제외한다)
movie_indices = [i[0] for i in sim_scores]

dataDF1[['title','genres']].iloc[movie_indices]

Unnamed: 0,title,genres
6769,Father of the Bride,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '..."
6547,Kuffs,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam..."
6282,North to Alaska,"[{'id': 37, 'name': 'Western'}]"
4984,Wendigo,"[{'id': 27, 'name': 'Horror'}]"
7073,The Out of Towners,"[{'id': 35, 'name': 'Comedy'}]"
914,It's a Wonderful Life,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n..."
5550,All Night Long,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '..."
5728,Another You,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
1500,George of the Jungle,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '..."
6789,Journeys with George,[]


In [141]:
dataDF1['genres'].iloc[movie_indices[0]]

"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]"