# ▒ 연관규칙분석(Association Rule) ▒


## 0. 환경설정

In [1]:
import pandas as pd
import os
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

## 1. 데이터 준비

### 데이터 설명
#### 미국 Census Bureau의 Census Income데이터 베이스에 추출한 설문조사 자료
- 관측치의 개수 : 48842개
- 나이, 직업군, 교육정도 등의 주로 범주형인 15개의 변수 포함

In [59]:
Adult_file = os.getcwd()+'/data/Adult.csv'
Adult_df = pd.read_csv(Adult_file).transpose()

### 데이터 변환

In [60]:
Adult_list = []
for i in range(0,len(Adult_df.columns)):
    Adult_list.append(Adult_df[i].to_string().split(","))

In [61]:
oht = TransactionEncoder()
oht_ary = oht.fit(Adult_list).transform(Adult_list)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
df.head()

Unnamed: 0,...,age=Middle-aged,capital-gain=Low,capital-loss=None,edu...,educati...,educatio...,education...,education=...,education=1...,....1,relationship=Not-in-family,sex=Male,workclass=Federal-gov,workclass=Local-gov,workclass=Never-worked,workclass=Private,workclass=Self-emp-inc,workclass=Self-emp-not-inc,workclass=State-gov,workclass=Without-pay
0,False,True,True,True,False,False,True,False,False,False,...,True,True,False,False,False,False,False,True,True,False
1,False,True,True,True,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,True,False
2,False,True,True,True,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,True,False
3,False,True,True,True,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,True,False
4,False,True,True,True,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,True,False


## 2. 연관규칙분석

### 지지도가 0.4 이상인 연관규칙

In [63]:
frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,1.000000,(age=Middle-aged)
1,1.000000,(capital-gain=Low)
2,1.000000,(capital-loss=None)
3,1.000000,(education=Bachelors)
4,1.000000,(hours-per-week=Full-time)
...,...,...
12282,0.694212,"(sex=Male, relationship=Not-in-family, educati..."
12283,0.505108,"(sex=Male, relationship=Not-in-family, educati..."
12284,0.694212,"(sex=Male, relationship=Not-in-family, educati..."
12285,0.505108,"(sex=Male, relationship=Not-in-family, educati..."


### 지지도가 0.4이상이면서 신뢰도가 0.7 이상인 item

In [64]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(age=Middle-aged),(capital-gain=Low),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
1,(capital-gain=Low),(age=Middle-aged),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
2,(capital-loss=None),(age=Middle-aged),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
3,(age=Middle-aged),(capital-loss=None),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
4,(age=Middle-aged),(education=Bachelors),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
...,...,...,...,...,...,...,...,...,...
1577935,"(workclass=Private, capital-gain=Low)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
1577936,"(workclass=Private, hours-per-week=Full-time)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
1577937,"(workclass=Private, workclass=State-gov)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
1577938,"(workclass=Private, capital-loss=None)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf


### 지지도가 0.4이상이면서 신뢰도가 0.75 이상, 향상도 0.9이상인 item

In [65]:
rules[ (rules['confidence'] >= 0.75) &
       (rules['lift'] >= 0.9) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(age=Middle-aged),(capital-gain=Low),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
1,(capital-gain=Low),(age=Middle-aged),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
2,(capital-loss=None),(age=Middle-aged),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
3,(age=Middle-aged),(capital-loss=None),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
4,(age=Middle-aged),(education=Bachelors),1.000000,1.0,1.000000,1.0,1.0,0.0,inf
...,...,...,...,...,...,...,...,...,...
1577935,"(workclass=Private, capital-gain=Low)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
1577936,"(workclass=Private, hours-per-week=Full-time)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
1577937,"(workclass=Private, workclass=State-gov)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
1577938,"(workclass=Private, capital-loss=None)","(sex=Male, relationship=Not-in-family, educati...",0.694212,1.0,0.694212,1.0,1.0,0.0,inf
