[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/Association.ipynb)

# 어소시에이션 분석(Apriori 알고리즘)

In [1]:
# This notebook is intended for Colab. It is self-contained: it covers everything from
# downloading the data through producing recommendations (prediction evaluation is not included).
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# See data_download.ipynb for an analysis of the MovieLens data.

# Download the archive and unpack it.
# wget -nc skips the download when the file is already present; --no-check-certificate
# disables TLS certificate validation for this request.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
# unzip -n never overwrites files that already exist in the target directory.
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 04:45:01--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 04:45:02 (63.8 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# MovieLens 데이터 로딩(데이터량이 많으므로, 로딩에 시간이 걸릴 수 있습니다)
import pandas as pd

# Load the movie catalogue: movie id, title and the raw genre string.
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    encoding='latin-1',
    engine='python',
)

# Store the '|'-delimited genre string as a list of genre names.
movies['genre'] = movies['genre'].map(lambda genre: genre.split('|'))

# Load the tags that users attached to movies.
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Normalise tags to lower case.
user_tagged_movies['tag'] = user_tagged_movies.tag.str.lower()

# Collect, per movie, all of its tags into a single list.
tags_by_movie = user_tagged_movies.groupby('movie_id')
movie_tags = tags_by_movie.agg({'tag': list})

# Attach the tag lists to the movie catalogue (movies without tags are kept).
movies = pd.merge(movies, movie_tags, how='left', on='movie_id')

# Load the rating records.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

# The data set is large, so restrict the experiment to the 1000 smallest user ids.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
is_valid_user = ratings['user_id'].isin(valid_user_ids)
ratings = ratings[is_valid_user]

# Join the ratings with the movie metadata.
movielens = pd.merge(ratings, movies, on='movie_id')

n_unique_users = len(movielens['user_id'].unique())
n_unique_movies = len(movielens['movie_id'].unique())
print(f'unique_users={n_unique_users}, unique_movies={n_unique_movies}')

# Split the data into a training set and a test set.
# Each user's 5 most recently rated movies form the test set; the rest is used for training.
# To do that, rank every user's ratings by timestamp, newest first (ranks start at 1).
# method='first' gives distinct ranks to equal timestamps, in order of appearance.
ratings_by_user = movielens.groupby('user_id')
movielens['timestamp_rank'] = ratings_by_user['timestamp'].rank(
    method='first', ascending=False)

is_train_row = movielens['timestamp_rank'] > 5
is_test_row = movielens['timestamp_rank'] <= 5
movielens_train = movielens[is_train_row]
movielens_test = movielens[is_test_row]

unique_users=1000, unique_movies=6736


In [3]:
# Reshape the training ratings into a user x movie matrix (missing ratings become NaN).
user_movie_matrix = movielens_train.pivot(
    index='user_id', columns='movie_id', values='rating')

# The library expects a 0/1 matrix: ratings of 4 or more become 1, while ratings
# below 4 and missing values become 0. A comparison against NaN is False, so
# unrated cells end up as 0 as well.
user_movie_matrix = user_movie_matrix.ge(4).astype(float)

user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# 어소시에이션 규칙 라이브러리 설치(설치되어 있지 않다면 주석을 해제하고 실행합니다)
# !pip install mlxtend

In [5]:
from mlxtend.frequent_patterns import apriori

# Mine the frequent itemsets (support >= 0.1), labelled by movie_id rather than column position.
freq_movies = apriori(user_movie_matrix, use_colnames=True, min_support=0.1)

# Show the itemsets with the highest support.
freq_movies.sort_values(by='support', ascending=False).head()

Unnamed: 0,support,itemsets
42,0.415,(593)
23,0.379,(318)
21,0.369,(296)
19,0.361,(260)
25,0.319,(356)


In [6]:
# Look up the title of movie_id=593 (The Silence of the Lambs)
movies.loc[movies['movie_id'] == 593]

Unnamed: 0,movie_id,title,genre,tag
587,593,"Silence of the Lambs, The (1991)","[Crime, Horror, Thriller]","[based on a book, anthony hopkins, demme, psyc..."


In [7]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, using lift as the metric
# with a minimum threshold of 1.
rules = association_rules(freq_movies, min_threshold=1, metric='lift')

# Show the rules with the highest lift.
rules[['antecedents', 'consequents', 'lift']].sort_values('lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,lift
649,(4993),(5952),5.45977
648,(5952),(4993),5.45977
1462,"(1196, 1198)","(1291, 260)",4.669188
1463,"(1291, 260)","(1196, 1198)",4.669188
1460,"(1291, 1196)","(260, 1198)",4.171359


In [8]:
# Look up the titles of movie_id=4993 and 5952 (The Lord of the Rings)
movies.loc[movies['movie_id'].isin([4993, 5952])]

Unnamed: 0,movie_id,title,genre,tag
4899,4993,"Lord of the Rings: The Fellowship of the Ring,...","[Action, Adventure, Fantasy]","[based on a book, big budget, new zealand, sce..."
5852,5952,"Lord of the Rings: The Two Towers, The (2002)","[Action, Adventure, Fantasy]","[based on a book, big budget, new zealand, sce..."


In [9]:
# Keep only the training rows whose rating is 4 or higher.
is_high_rating = movielens_train['rating'] >= 4
movielens_train_high_rating = movielens_train[is_high_rating]

In [10]:
# Movies that user_id=2 rated 4 or higher
movielens_train_high_rating.loc[movielens_train_high_rating['user_id'] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [11]:
# Movies that user_id=2 rated 4 or higher in the training data
user2_data = movielens_train_high_rating[movielens_train_high_rating.user_id==2]

# Take the user's 5 most recently rated movies (fewer if the user has fewer such ratings)
input_data = user2_data.sort_values("timestamp")["movie_id"].tolist()[-5:]

# Build the lookup set once instead of once per rule
input_movie_ids = set(input_data)

# Select the association rules whose antecedent contains at least one of those movies.
# The explicit bool cast keeps the mask usable as a boolean indexer even when `rules` is empty.
matched_flags = rules.antecedents.apply(
    lambda antecedent: not antecedent.isdisjoint(input_movie_ids)
).astype(bool)
rules[matched_flags]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,(110),(1),0.291,0.263,0.105,0.360825,1.371957,0.028467,1.153048
5,(260),(1),0.361,0.263,0.153,0.423823,1.611493,0.058057,1.279120
25,(1210),(1),0.273,0.263,0.116,0.424908,1.615621,0.044201,1.281535
31,(110),(32),0.291,0.255,0.104,0.357388,1.401523,0.029795,1.159332
33,(260),(32),0.361,0.255,0.137,0.379501,1.488241,0.044945,1.200647
...,...,...,...,...,...,...,...,...,...
1476,"(1210, 1196)","(2571, 260)",0.197,0.161,0.108,0.548223,3.405114,0.076283,1.857112
1477,"(2571, 260)","(1210, 1196)",0.161,0.197,0.108,0.670807,3.405114,0.076283,2.439302
1479,"(1196, 260)","(1210, 2571)",0.224,0.139,0.108,0.482143,3.468654,0.076864,1.662621
1480,(1210),"(1196, 2571, 260)",0.273,0.141,0.108,0.395604,2.805705,0.069507,1.421255


In [12]:
from collections import defaultdict, Counter

# Gather the consequent movies of every matched rule into one flat list.
# The same movie may appear in the consequents of several rules.
# Rules are visited in descending lift order; one possible refinement is to
# sort by lift and use only the top 10 rules.
matched_rules = rules[matched_flags].sort_values("lift", ascending=False)
consequent_movies = [
    movie_id
    for consequents in matched_rules["consequents"]
    for movie_id in consequents
]

# Count how often each movie appears among the consequents
counter = Counter(consequent_movies)
counter.most_common(10)

[(1196, 92),
 (593, 41),
 (1198, 34),
 (260, 34),
 (1210, 34),
 (318, 20),
 (296, 19),
 (2571, 18),
 (356, 17),
 (589, 16)]

In [13]:
# movie_id=1196 appears 92 times in the consequents, so movie_id=1196 (Star Wars: Episode V)
# becomes a recommendation candidate for user_id=2
# (in the training data, user_id=2 rated Star Wars Episodes 4 and 6 highly)
movies.loc[movies['movie_id'] == 1196]

Unnamed: 0,movie_id,title,genre,tag
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."


In [14]:
# 추천 방법에는 lift 값이 높은 것을 추출하는 방법 등이 있다. 몇 가지 방법을 시도해 보고 자사의 데이터에 맞는 방법을 선택한다.

In [15]:
# Use the association rules to recommend, for every user, up to 10 movies
# the user has not rated yet.
N_RECENT_MOVIES = 5      # how many of the user's latest highly rated movies seed the rule lookup
N_RECOMMENDATIONS = 10   # maximum length of each user's recommendation list

pred_user2items = defaultdict(list)
user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

for user_id, data in movielens_train_high_rating.groupby("user_id"):
    # The user's most recent highly rated movies
    input_data = data.sort_values("timestamp")["movie_id"].tolist()[-N_RECENT_MOVIES:]
    # Build both lookup sets once per user so the membership tests below are O(1)
    input_movie_ids = set(input_data)
    seen_movie_ids = set(user_evaluated_movies[user_id])

    # Select the rules whose antecedent contains at least one of those movies.
    # The explicit bool cast keeps the mask a valid boolean indexer even when `rules` is empty.
    matched_flags = rules.antecedents.apply(
        lambda antecedent: not antecedent.isdisjoint(input_movie_ids)
    ).astype(bool)

    # Flatten the consequents of the matched rules, visiting rules in descending lift order.
    # The sort is done per user, after filtering, so ties between equal-lift rules
    # resolve exactly as before.
    consequent_movies = []
    for consequents in rules[matched_flags].sort_values("lift", ascending=False)["consequents"]:
        consequent_movies.extend(consequents)

    # Rank candidate movies by how often they appear among the consequents
    counter = Counter(consequent_movies)
    for movie_id, movie_cnt in counter.most_common():
        # Recommend only movies the user has not rated in the training data
        if movie_id not in seen_movie_ids:
            pred_user2items[user_id].append(movie_id)
        # Stop as soon as the recommendation list is full
        if len(pred_user2items[user_id]) == N_RECOMMENDATIONS:
            break

# Recommendation list for each user
pred_user2items

defaultdict(list,
            {2: [1196, 593, 1198, 318, 296, 2571, 356, 589, 1240, 1291],
             6: [593, 296, 318, 541, 47, 608, 50, 589, 527, 1],
             9: [296, 318, 593, 2959, 2762, 1617, 2028, 2571, 858, 50],
             10: [858, 1196, 260, 318],
             11: [2858, 50, 296, 593],
             12: [260],
             13: [593, 318, 527, 356, 260, 47, 110, 2858, 589, 457],
             17: [1196, 296, 1200, 1240, 541, 2571, 1198, 1210],
             18: [1200, 1197, 50, 858, 1193],
             22: [318, 1196, 260, 457, 608, 2571, 1210, 1240, 1198, 541],
             23: [1196, 1210, 1198, 2571, 318, 1291, 1240, 356, 858, 110],
             24: [1198, 1196, 296, 593, 1221, 1213, 1193, 1214, 541, 2028],
             26: [593, 318, 296, 1196, 260, 50, 356, 527, 1210, 1240],
             27: [296, 593, 50, 318, 541, 858, 2858, 1, 260, 1198],
             33: [296, 593, 50, 318, 541, 858, 2858, 260, 1198, 527],
             37: [527, 356, 1196, 260, 608, 2858, 457, 8

In [16]:
# Movies that user_id=2 rated 4 or higher in the training data
movielens_train_high_rating.loc[movielens_train_high_rating['user_id'] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [17]:
# Top recommendations for user_id=2 (1196, 593, 1198)
movies.loc[movies['movie_id'].isin([1196, 593, 1198])]

Unnamed: 0,movie_id,title,genre,tag
587,593,"Silence of the Lambs, The (1991)","[Crime, Horror, Thriller]","[based on a book, anthony hopkins, demme, psyc..."
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."
1173,1198,Raiders of the Lost Ark (Indiana Jones and the...,"[Action, Adventure]","[egypt, lucas, seen more than once, dvd collec..."


In [18]:
# apriori(user_movie_matrix, min_support=0.1, use_colnames=True)
# association_rules(freq_movies, metric='lift', min_threshold=1)
# min_support와 min_threshold가 중요한 파라미터가 되므로 바꾸어 가면서 시험해봅니다.