[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/Association.ipynb)

# アソシエーション分析

In [None]:
# Notebook for Colab. This single notebook covers everything from downloading the data to producing recommendations (prediction evaluation is not included).
# If the MovieLens data has not been downloaded yet, run this cell to download it.
# For an analysis of the MovieLens data, see data_download.ipynb.

# Download and extract the data.
# -nc skips the download when the archive already exists under ../data.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for this download — confirm it is still required.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
# -n never overwrites files that were already extracted.
!unzip -n ../data/ml-10m.zip -d ../data/

In [2]:
# Load the MovieLens data (the files are large, so reading them can take a while).
import pandas as pd

# Movie master data: id, title and genre.
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    engine='python',
    encoding='latin-1',
)

# The raw genre value is '|'-separated; keep it as a list of genre names.
movies['genre'] = movies['genre'].apply(lambda genres: genres.split('|'))

# Tags that users attached to movies.
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

# Normalise tags to lower case.
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

# Collect the tags of each movie into a list ...
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})

# ... and attach them to the movie data (movies without tags are kept).
movies = movies.merge(movie_tags, how='left', on='movie_id')

# Rating data.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

# The data set is large, so restrict it to the 1000 smallest user ids.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings[ratings['user_id'].isin(valid_user_ids)]

# Join the ratings with the movie data.
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# Split into training and test data:
# each user's five most recent ratings are held out for evaluation,
# everything else is used for training.
# timestamp_rank numbers a user's ratings starting at 1 for the newest one;
# method='first' breaks timestamp ties by order of appearance.
movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)
movielens_train = movielens.loc[movielens['timestamp_rank'] > 5]
movielens_test = movielens.loc[movielens['timestamp_rank'] <= 5]

unique_users=1000, unique_movies=6736


In [4]:
# Reshape the training ratings into a user x movie matrix.
user_movie_matrix = movielens_train.pivot(
    index='user_id', columns='movie_id', values='rating')

# The library expects 0/1 input: ratings of 4 or more become 1,
# ratings below 4 and missing ratings become 0.
# A comparison against NaN is False, so unrated movies also end up as 0.
user_movie_matrix = (user_movie_matrix >= 4).astype(float)

user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# アソシエーションルールのライブラリのインストール(インストールされていなければ、コメントアウトを外して、実行してください)
# !pip install mlxtend

In [6]:
from mlxtend.frequent_patterns import apriori

# Mine the movie itemsets with a support of at least 0.1,
# then show the five itemsets with the highest support.
freq_movies = apriori(user_movie_matrix, use_colnames=True, min_support=0.1)
freq_movies.sort_values(by='support', ascending=False).head(5)

Unnamed: 0,support,itemsets
42,0.415,(593)
23,0.379,(318)
21,0.369,(296)
19,0.361,(260)
25,0.319,(356)


In [7]:
# Look up the title for movie_id=593 (The Silence of the Lambs).
movies.loc[movies['movie_id'] == 593]

Unnamed: 0,movie_id,title,genre,tag
587,593,"Silence of the Lambs, The (1991)","[Crime, Horror, Thriller]","[based on a book, anthony hopkins, demme, psyc..."


In [8]:
from mlxtend.frequent_patterns import association_rules

# Derive the association rules with a lift of at least 1
# and show the five rules with the highest lift.
rules = association_rules(freq_movies, metric='lift', min_threshold=1)
(
    rules
    .sort_values(by='lift', ascending=False)
    .head(5)
    .loc[:, ['antecedents', 'consequents', 'lift']]
)

Unnamed: 0,antecedents,consequents,lift
649,(4993),(5952),5.45977
648,(5952),(4993),5.45977
1462,"(1196, 1198)","(1291, 260)",4.669188
1463,"(1291, 260)","(1196, 1198)",4.669188
1460,"(1291, 1196)","(260, 1198)",4.171359


In [9]:
# Look up the titles for movie_id=4993 and 5952 (The Lord of the Rings).
movies.loc[movies['movie_id'].isin([4993, 5952])]

Unnamed: 0,movie_id,title,genre,tag
4899,4993,"Lord of the Rings: The Fellowship of the Ring,...","[Action, Adventure, Fantasy]","[based on a book, big budget, new zealand, sce..."
5852,5952,"Lord of the Rings: The Two Towers, The (2002)","[Action, Adventure, Fantasy]","[based on a book, big budget, new zealand, sce..."


In [10]:
# Keep only the training rows whose rating is 4 or higher.
movielens_train_high_rating = movielens_train.loc[movielens_train['rating'] >= 4]

In [11]:
# Movies that user_id=2 rated 4 or higher.
movielens_train_high_rating.loc[movielens_train_high_rating['user_id'] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [12]:
# Movies that user_id=2 rated 4 or higher.
user2_data = movielens_train_high_rating.loc[movielens_train_high_rating['user_id'] == 2]

# The (up to) five most recently rated movies among them.
input_data = user2_data.sort_values(by='timestamp')['movie_id'].tail(5).tolist()

# Flag the association rules whose antecedent contains at least one of those movies.
matched_flags = rules['antecedents'].apply(
    lambda antecedent: len(antecedent.intersection(input_data))) >= 1
rules[matched_flags]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,(110),(1),0.291,0.263,0.105,0.360825,1.371957,0.028467,1.153048
5,(260),(1),0.361,0.263,0.153,0.423823,1.611493,0.058057,1.279120
25,(1210),(1),0.273,0.263,0.116,0.424908,1.615621,0.044201,1.281535
31,(110),(32),0.291,0.255,0.104,0.357388,1.401523,0.029795,1.159332
33,(260),(32),0.361,0.255,0.137,0.379501,1.488241,0.044945,1.200647
...,...,...,...,...,...,...,...,...,...
1476,"(1210, 1196)","(2571, 260)",0.197,0.161,0.108,0.548223,3.405114,0.076283,1.857112
1477,"(2571, 260)","(1210, 1196)",0.161,0.197,0.108,0.670807,3.405114,0.076283,2.439302
1479,"(1196, 260)","(1210, 2571)",0.224,0.139,0.108,0.482143,3.468654,0.076864,1.662621
1480,(1210),"(1196, 2571, 260)",0.273,0.141,0.108,0.395604,2.805705,0.069507,1.421255


In [13]:
from collections import defaultdict, Counter

# Gather every movie found in the consequent of a matched rule.
# The same movie can appear in several consequents, so duplicates are expected.
# Rules are visited in descending lift order; one possible refinement is to
# keep only the ten rules with the highest lift.
consequent_movies = [
    movie_id
    for consequents in rules[matched_flags].sort_values(by="lift", ascending=False)["consequents"]
    for movie_id in consequents
]

# Count how often each movie shows up in a consequent and list the ten most frequent.
counter = Counter(consequent_movies)
counter.most_common(10)

[(1196, 92),
 (593, 41),
 (1198, 34),
 (260, 34),
 (1210, 34),
 (318, 20),
 (296, 19),
 (2571, 18),
 (356, 17),
 (589, 16)]

In [14]:
# movie_id=1196 appears 92 times in the consequents, which makes it
# (Star Wars: Episode V) a recommendation candidate for user_id=2.
# (In the training data user_id=2 rated Star Wars episodes 4 and 6 highly.)
movies.loc[movies['movie_id'] == 1196]

Unnamed: 0,movie_id,title,genre,tag
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."


In [15]:
# おすすめの仕方には、lift値が高いものを抽出するやり方もあり、いくつか方法を試してみて、自社のデータに適したやり方を選択ください

In [16]:
# Use the association rules to recommend, to every user, up to ten movies
# that the user has not rated yet.
#
# Loop-local names are deliberately different from `input_data`,
# `matched_flags`, `consequent_movies` and `counter`, so the values computed
# for user_id=2 in the earlier cells are not overwritten by this loop.
N_RECENT_MOVIES = 5       # highly rated movies per user used to match rules
N_RECOMMENDATIONS = 10    # maximum length of each recommendation list

pred_user2items = defaultdict(list)
# Every movie each user rated in the training data (any rating value).
user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

for user_id, data in movielens_train_high_rating.groupby("user_id"):
    # The (up to) five highly rated movies this user rated most recently.
    recent_movies = frozenset(
        data.sort_values("timestamp")["movie_id"].tolist()[-N_RECENT_MOVIES:])

    # Rules whose antecedent shares at least one movie with them.
    # The set is built once per user instead of once per rule.
    user_rule_flags = rules.antecedents.apply(
        lambda antecedent: not recent_movies.isdisjoint(antecedent)).astype(bool)

    # Consequent movies of the matched rules, visited in descending lift order,
    # counted by how many consequents they appear in.
    ranked_consequents = rules[user_rule_flags].sort_values("lift", ascending=False)["consequents"]
    consequent_counts = Counter(
        movie_id for consequents in ranked_consequents for movie_id in consequents)

    # Set lookup keeps the "already rated" test constant-time per candidate.
    already_rated = set(user_evaluated_movies[user_id])

    # Walk the candidates from most to least frequent and keep the unrated ones.
    # A user gets an entry (possibly an empty list) as soon as one candidate
    # exists; users without any matched rule get no entry.
    for movie_id, _ in consequent_counts.most_common():
        if movie_id not in already_rated:
            pred_user2items[user_id].append(movie_id)
        # Stop once the list holds N_RECOMMENDATIONS movies.
        if len(pred_user2items[user_id]) == N_RECOMMENDATIONS:
            break

# Recommendation list for each user.
pred_user2items

defaultdict(list,
            {2: [1196, 593, 1198, 318, 296, 2571, 356, 589, 1240, 1291],
             6: [593, 296, 318, 541, 47, 608, 50, 589, 527, 1],
             9: [296, 318, 593, 2959, 2762, 1617, 2028, 2571, 858, 50],
             10: [858, 1196, 260, 318],
             11: [2858, 50, 296, 593],
             12: [260],
             13: [593, 318, 527, 356, 260, 47, 110, 2858, 589, 457],
             17: [1196, 296, 1200, 1240, 541, 2571, 1198, 1210],
             18: [1200, 1197, 50, 858, 1193],
             22: [318, 1196, 260, 457, 608, 2571, 1210, 1240, 1198, 541],
             23: [1196, 1210, 1198, 2571, 318, 1291, 1240, 356, 858, 110],
             24: [1198, 1196, 296, 593, 1221, 1213, 1193, 1214, 541, 2028],
             26: [593, 318, 296, 1196, 260, 50, 356, 527, 1210, 1240],
             27: [296, 593, 50, 318, 541, 858, 2858, 1, 260, 1198],
             33: [296, 593, 50, 318, 541, 858, 2858, 260, 1198, 527],
             37: [527, 356, 1196, 260, 608, 2858, 457, 8

In [17]:
# Movies that user_id=2 rated 4 or higher in the training data.
movielens_train_high_rating.loc[movielens_train_high_rating['user_id'] == 2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
8381,2,1210,4.0,868245644,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]","[desert, fantasy, sci-fi, space, lucas, gfei o...",10.0


In [18]:
# Recommendations for user_id=2 (1196, 593, 1198).
movies.loc[movies['movie_id'].isin([1196, 593, 1198])]

Unnamed: 0,movie_id,title,genre,tag
587,593,"Silence of the Lambs, The (1991)","[Crime, Horror, Thriller]","[based on a book, anthony hopkins, demme, psyc..."
1171,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]","[lucas, george lucas, george lucas, gfei own i..."
1173,1198,Raiders of the Lost Ark (Indiana Jones and the...,"[Action, Adventure]","[egypt, lucas, seen more than once, dvd collec..."


In [19]:
# apriori(user_movie_matrix, min_support=0.1, use_colnames=True)
# association_rules(freq_movies, metric='lift', min_threshold=1)
# min_supportとmin_thresholdが重要なパラメーターとなるので、変化させてお試しください