## 연관 분석

In [16]:
# Put the parent directory first on the import path.
# NOTE(review): presumably this is what lets the project-local `test_util`
# package imported below resolve -- confirm against the repository layout.
import sys; sys.path.insert(0, '..')
import warnings


from test_util.data_loader import DataLoader
from test_util.metric_calculator import MetricCalculator
# Ignore every warning of category FutureWarning from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)


In [17]:
# Load the MovieLens data from the ml-10m directory.
# NOTE(review): `num_users` / `num_test_items` presumably limit the number of
# users loaded and the number of held-out items per user -- confirm against
# DataLoader, which is not visible in this notebook.
data_loader = DataLoader(num_users=1000, num_test_items=5, data_path='../data/ml-10m/')
movielens = data_loader.load()

In [18]:
# Reshape the training ratings into a user x movie matrix
# (rows: user_id, columns: movie_id, cells: rating; unrated cells are NaN).
user_movie_matrix = movielens.train.pivot(
    index='user_id', columns='movie_id', values='rating'
)

# Binarise for the frequent-pattern library: ratings of 4 or higher become 1,
# ratings below 4 become 0. NaN compares False against 4, so unrated cells
# also end up as 0. Cast back to float to keep 0.0 / 1.0 values.
user_movie_matrix = user_movie_matrix.ge(4).astype(float)

user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# mlxtend is a Python library of machine-learning extensions; one of its
# best-known features is frequent-itemset mining via the Apriori algorithm.
from mlxtend.frequent_patterns import apriori

# Compute support and show the movies with the highest support.
# Example: if 1 out of 4 users rated a movie 4 or higher its support is 1/4;
# if 3 of them did, it is 3/4. Only items and item combinations whose support
# is at least `min_support` are kept.
freq_movies = apriori(user_movie_matrix, min_support=0.1, use_colnames=True)
freq_movies.sort_values(by='support', ascending=False).head(5)



Unnamed: 0,support,itemsets
42,0.415,(593)
23,0.378,(318)
21,0.368,(296)
19,0.362,(260)
25,0.32,(356)


In [20]:
# Show the metadata row for movie_id 593, the itemset with the highest support above.
movielens.item_content.loc[movielens.item_content.movie_id == 593]

Unnamed: 0,movie_id,title,genre,tag
587,593,"Silence of the Lambs, The (1991)","[Crime, Horror, Thriller]","[based on a book, anthony hopkins, demme, psyc..."


In [21]:
# Derive association rules from the frequent itemsets and rank them by lift.
from mlxtend.frequent_patterns import association_rules

min_threshold = 0.8
# Keep the rules whose lift is at least `min_threshold`, then list them from
# the highest lift down (higher lift = stronger association).
# Newer mlxtend releases require `num_itemsets`. It is the number of
# transactions in the original input data -- here the number of user rows in
# the matrix given to apriori -- not the size of an item combination.
rules = association_rules(
    freq_movies,
    metric="lift",
    min_threshold=min_threshold,
    num_itemsets=len(user_movie_matrix),
)
rules.sort_values('lift', ascending=False).head()[['antecedents', 'consequents', 'lift']]

Unnamed: 0,antecedents,consequents,lift
659,(4993),(5952),5.45977
658,(5952),(4993),5.45977
1454,"(1196, 1198)","(1291, 260)",4.669188
1455,"(1291, 260)","(1196, 1198)",4.669188
1452,"(1291, 1196)","(260, 1198)",4.171359


위 결과에서 antecedents는 조건부, consequents는 귀결부를 의미한다.  
4993: 반지의 제왕 1편(반지 원정대), 5952: 반지의 제왕 2편(두 개의 탑)  
해석하자면, 반지의 제왕 1편을 높게 평가한 사용자는 2편도 높게 평가하는 경향이 강하다(리프트값 약 5.46). 즉, 두 영화의 상관관계가 높다.

In [None]:
# Run the project's association-rule recommender on the loaded dataset.
from test_src.association import AssociationRecommender

recommender = AssociationRecommender()
# NOTE(review): the recorded output of this cell is
# "TypeError: association_rules() missing 1 required positional argument:
# 'num_itemsets'". Presumably it is raised by an association_rules() call
# inside the recommender -- confirm and fix it in test_src.association.
recommend_result = recommender.recommend(movielens)



TypeError: association_rules() missing 1 required positional argument: 'num_itemsets'

In [None]:
# Evaluation - RMSE is not used (rating values are not used).
metric_calculator = MetricCalculator()
# NOTE(review): `clac` looks like a typo for `calc` -- confirm the method name
# defined on MetricCalculator before running this cell.
# Arguments: true and predicted rating lists, then the per-user item mappings
# from the test set and from the recommender, with a cutoff of k=10.
metrics = metric_calculator.clac(
    movielens.test.rating.tolist(), recommend_result.rating.tolist(),
    movielens.test_user2items, recommend_result.user2items, k=10
)
print(metrics)