In [1]:
import time

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori, association_rules

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the data: source workbook and the CSV the discretisation step writes to
datafile, processedfile = "breast_cancer_data.xls", "breast_cancer_processed_data.csv"

data = pd.read_excel(io=datafile)
data.sample(n=2)

Unnamed: 0,肝氣鬱結證型係數,熱毒蘊結證型係數,冲任失調證型係數,血氣兩虛證型係數,脾胃虛弱證型係數,肝腎陰虛證型係數,病程階段,TNM分期,轉移部位,確診候幾年發現轉移\n
919,0.165,0.132,0.125,0.144,0.344,0.219,S2,H1,R0,J0
864,0.133,0.216,0.125,0.237,0.104,0.228,S1,H3,R0,J0


In [3]:
# Inspect the data: show two randomly chosen records
data.sample(2)

Unnamed: 0,肝氣鬱結證型係數,熱毒蘊結證型係數,冲任失調證型係數,血氣兩虛證型係數,脾胃虛弱證型係數,肝腎陰虛證型係數,病程階段,TNM分期,轉移部位,確診候幾年發現轉移\n
514,0.397,0.055,0.557,0.404,0.252,0.316,S2,H2,R0,J0
754,0.326,0.157,0.225,0.119,0.127,0.392,S4,H4,R1,J3


In [4]:
# Map every syndrome-coefficient column to the one-letter prefix used for its bins.
# Insertion order (A..F) is preserved, exactly as in a dict literal.
typelabel = dict(
    zip(
        (
            "肝氣鬱結證型係數",
            "熱毒蘊結證型係數",
            "冲任失調證型係數",
            "血氣兩虛證型係數",
            "脾胃虛弱證型係數",
            "肝腎陰虛證型係數",
        ),
        "ABCDEF",
    )
)

# Data discretisation function
def Binning(k=4, random_state=0):
    """Derive bin boundaries for every coefficient column with 1-D K-Means.

    For each column listed in the module-level ``typelabel`` mapping the
    column is clustered into ``k`` groups. The cluster centres are sorted,
    and the mean of each pair of neighbouring centres is used as the lower
    boundary of the upper cluster; the first boundary is fixed at 0.0.

    The function reads the workbook named by the module-level ``datafile``
    and writes the resulting table to ``processedfile`` as CSV.

    Parameters
    ----------
    k : int, default 4
        Number of clusters (bins) per column.
    random_state : int or None, default 0
        Seed handed to ``KMeans`` so that the boundaries are reproducible
        between runs. Pass ``None`` for an unseeded run.

    Returns
    -------
    pandas.DataFrame
        Two rows per column prefix: ``<prefix>`` holds the boundaries and
        ``<prefix>n`` the number of samples per cluster. The columns are
        numbered ``1..k`` in ascending order of cluster centre.
    """
    data = pd.read_excel(datafile)
    per_column = []

    for key, item in typelabel.items():
        print(u"正在處理“%s”的分群..." % key)
        # n_init is pinned so the number of restarts does not depend on the
        # installed scikit-learn version.
        kmodel = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        # KMeans needs a 2-D array, hence the double brackets around the column.
        kmodel.fit(data[[key]].values)

        # Cluster centres, one row per cluster id.
        r1 = pd.DataFrame(kmodel.cluster_centers_, columns=[item])
        # Samples per cluster id. Renaming the Series (instead of wrapping it
        # in DataFrame(..., columns=[...])) keeps the counts even when pandas
        # names the value_counts() result itself.
        r2 = pd.Series(kmodel.labels_).value_counts().rename(item + "n")
        # Join on cluster id, then order the clusters by centre value.
        r = pd.concat([r1, r2], axis=1).sort_values(item)
        r.index = list(range(1, k + 1))
        # The mean of neighbouring centres becomes the boundary between them.
        r[item] = r[item].rolling(2).mean()
        # The rolling mean leaves NaN in the first slot; use 0.0 as the lowest boundary.
        r.loc[1, item] = 0.0
        per_column.append(r.T)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once, then
    # sort the rows alphabetically (A, An, B, Bn, ...).
    result = pd.concat(per_column).sort_index()
    result.to_csv(processedfile)

    return result

In [5]:
# Run the discretisation; `result` holds the boundaries and per-cluster counts.
# (No placeholder DataFrame is needed first: the call's return value is the only value used.)
result = Binning()

正在處理“肝氣鬱結證型係數”的分群...
正在處理“熱毒蘊結證型係數”的分群...
正在處理“冲任失調證型係數”的分群...
正在處理“血氣兩虛證型係數”的分群...
正在處理“脾胃虛弱證型係數”的分群...
正在處理“肝腎陰虛證型係數”的分群...


In [6]:
# Inspect the boundary values computed for the six coefficient groups
result

Unnamed: 0,1,2,3,4
A,0.0,0.179278,0.25796,0.351843
An,244.0,352.0,281.0,53.0
B,0.0,0.153543,0.298217,0.489954
Bn,342.0,380.0,179.0,29.0
C,0.0,0.202149,0.289061,0.423537
Cn,297.0,394.0,204.0,35.0
D,0.0,0.1744,0.253486,0.360007
Dn,298.0,367.0,221.0,44.0
E,0.0,0.152698,0.257873,0.376062
En,273.0,319.0,245.0,93.0


In [7]:
# Callback functions for discretising the coefficient columns.
# NOTE(review): the thresholds below are hard-coded. They look like the output
# of an earlier K-Means run and do not match the table printed by the
# `result` cell in this notebook -- confirm which values are intended.
def _label_by_edges(x, y, edges):
    """Return ``y`` followed by the 1-based number of the bin containing ``x``.

    ``edges`` are ascending, inclusive upper bounds. ``x`` is placed in the
    first bin whose bound it does not exceed, and in bin ``len(edges) + 1``
    when it exceeds every bound. A NaN fails every comparison and therefore
    also lands in the last bin.
    """
    for bin_no, upper in enumerate(edges, start=1):
        if x <= upper:
            return y + str(bin_no)
    return y + str(len(edges) + 1)


def funA(x, y):
    """Label coefficient ``x`` as ``y + '1'`` .. ``y + '4'`` using the A thresholds."""
    return _label_by_edges(x, y, (0.178698, 0.257724, 0.351843))


def funB(x, y):
    """Label coefficient ``x`` as ``y + '1'`` .. ``y + '4'`` using the B thresholds."""
    return _label_by_edges(x, y, (0.153543, 0.298217, 0.489954))


def funC(x, y):
    """Label coefficient ``x`` as ``y + '1'`` .. ``y + '4'`` using the C thresholds."""
    return _label_by_edges(x, y, (0.201910, 0.288684, 0.423325))


def funD(x, y):
    """Label coefficient ``x`` as ``y + '1'`` .. ``y + '4'`` using the D thresholds.

    The second bin starts directly above 0.176505. Previously its lower bound
    was 0.178698, which sent values in (0.176505, 0.178698] to bin 4.
    """
    return _label_by_edges(x, y, (0.176505, 0.257279, 0.367217))


def funE(x, y):
    """Label coefficient ``x`` as ``y + '1'`` .. ``y + '4'`` using the E thresholds."""
    return _label_by_edges(x, y, (0.152698, 0.257873, 0.376062))


def funF(x, y):
    """Label coefficient ``x`` as ``y + '1'`` .. ``y + '4'`` using the F thresholds."""
    return _label_by_edges(x, y, (0.179143, 0.261386, 0.354643))
    
# Replace every coefficient column by its bin label (prefix letter + bin number).
# The numeric columns are overwritten in place, so `data` must be reloaded
# before this cell can be executed a second time.
for column, binner, prefix in (
    ('肝氣鬱結證型係數', funA, 'A'),
    ('熱毒蘊結證型係數', funB, 'B'),
    ('冲任失調證型係數', funC, 'C'),
    ('血氣兩虛證型係數', funD, 'D'),
    ('脾胃虛弱證型係數', funE, 'E'),
    ('肝腎陰虛證型係數', funF, 'F'),
):
    data[column] = data[column].apply(binner, args=(prefix,))

In [8]:
# Inspect the data set after discretisation: ten randomly chosen records
data.sample(n=10)

Unnamed: 0,肝氣鬱結證型係數,熱毒蘊結證型係數,冲任失調證型係數,血氣兩虛證型係數,脾胃虛弱證型係數,肝腎陰虛證型係數,病程階段,TNM分期,轉移部位,確診候幾年發現轉移\n
61,A3,B2,C3,D3,E3,F4,S2,H4,R1R3,J2
97,A1,B3,C2,D1,E2,F4,S3,H3,R0,J0
727,A1,B1,C1,D1,E2,F3,S4,H1,R0,J0
536,A3,B2,C1,D3,E2,F1,S1,H2,R0,J0
63,A3,B2,C2,D2,E3,F2,S2,H4,R2,J1
209,A3,B1,C2,D2,E2,F3,S3,H4,R3,J1
304,A3,B2,C2,D3,E1,F2,S1,H3,R0,J0
280,A1,B2,C2,D2,E2,F1,S4,H3,R0,J0
153,A3,B1,C2,D3,E1,F1,S1,H2,R0,J0
352,A3,B1,C4,D3,E1,F3,S4,H1,R0,J0


In [9]:
# Keep only the columns used for modelling: the six discretised coefficients
# plus the TNM stage. The index round-trip leaves the row labels unchanged
# and names the index 'index'.
data = (
    data.loc[:, [
        '肝氣鬱結證型係數',
        '熱毒蘊結證型係數',
        '冲任失調證型係數',
        '血氣兩虛證型係數',
        '脾胃虛弱證型係數',
        '肝腎陰虛證型係數',
        'TNM分期',
    ]]
    .reset_index()
    .set_index('index')
)
data.sample(n=2)

Unnamed: 0_level_0,肝氣鬱結證型係數,熱毒蘊結證型係數,冲任失調證型係數,血氣兩虛證型係數,脾胃虛弱證型係數,肝腎陰虛證型係數,TNM分期
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
156,A3,B2,C2,D2,E1,F1,H2
11,A2,B2,C1,D2,E2,F3,H4


In [10]:
# Collect every distinct item in `data`, column by column, in order of first appearance
cols = [item for column in data.columns for item in data[column].unique()]

In [11]:
# Create a new frame filled with 0: one row per record in `data`, one column per item
data1 = pd.DataFrame(data=0, index=np.arange(len(data)), columns=cols)
data1.sample(n=5)

Unnamed: 0,A1,A4,A3,A2,B3,B1,B2,B4,C2,C3,C1,C4,D3,D1,D2,D4,E1,E2,E3,E4,F3,F1,F4,F2,H4,H3,H2,H1
31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
659,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Update the indicator frame: set the column of every item that occurs in a record to 1.
# This replaces 28 hand-written branches executed once per row: each
# (column, item) pair is now handled with a single label-based assignment.
for column in data.columns:
    for item in data[column].unique():
        if item not in data1.columns:
            # No indicator column exists for this value; leave data1 unchanged
            # rather than letting .loc create a new column.
            continue
        # Row labels of the records whose `column` holds this item.
        matching_rows = data.loc[data[column] == item].index
        # An integer 1 keeps the indicator columns at their integer dtype.
        data1.loc[matching_rows, item] = 1

In [13]:
# Spot-check five random rows of the filled indicator frame
data1.sample(n=5)

Unnamed: 0,A1,A4,A3,A2,B3,B1,B2,B4,C2,C3,C1,C4,D3,D1,D2,D4,E1,E2,E3,E4,F3,F1,F4,F2,H4,H3,H2,H1
4,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0
321,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0
565,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
183,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1
376,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0


In [14]:
# Build the model
# Mine frequent itemsets with a minimum support of 0.07 (min_support=0.07).
# apriori is documented for boolean one-hot input; the cast returns a new
# frame, so data1 itself is left unchanged.
frq_items = apriori(data1.astype(bool), min_support=0.07, use_colnames=True)

In [15]:
# Derive association rules using lift with a threshold of 1.5, then order them
# by confidence and, within equal confidence, by lift (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1.5)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,"(F4, A3)",(H4),0.090323,0.446237,0.079570,0.880952,1.974182,0.039265,4.651613
49,"(F4, C3)",(H4),0.086022,0.446237,0.075269,0.875000,1.960843,0.036883,4.430108
16,(F4),(H4),0.245161,0.446237,0.180645,0.736842,1.651237,0.071245,2.104301
58,"(F4, D2)",(H4),0.105376,0.446237,0.076344,0.724490,1.623555,0.029321,2.009956
56,"(F3, D2)",(H4),0.104301,0.446237,0.075269,0.721649,1.617190,0.028726,1.989446
...,...,...,...,...,...,...,...,...,...
23,(H4),"(F4, A3)",0.446237,0.090323,0.079570,0.178313,1.974182,0.039265,1.107085
61,(H4),"(F4, D2)",0.446237,0.105376,0.076344,0.171084,1.623555,0.029321,1.079270
19,(H4),"(A3, D2)",0.446237,0.109677,0.076344,0.171084,1.559887,0.027402,1.074081
52,(H4),"(F4, C3)",0.446237,0.086022,0.075269,0.168675,1.960843,0.036883,1.099423


In [16]:
# Convert the `consequents` column to text
def _consequent_label(items):
    """Return the only item of a one-item consequent as text, otherwise None.

    Taking "the first element" of a multi-item set is arbitrary, so such
    consequents are mapped to None and can never match a single label.
    A value that is already a string (cell executed again) is passed through.
    """
    if isinstance(items, str):
        return items
    members = tuple(items)
    return str(members[0]) if len(members) == 1 else None


# assign() builds a new frame, so re-running this cell stays safe.
rules = rules.assign(consequents=rules["consequents"].apply(_consequent_label))
# Keep only the rules that predict breast cancer stage IV (H4)
rules = rules[
    (rules['consequents'] == 'H4')
    & (rules['support'] >= 0.07)
    & (rules['confidence'] >= 0.8)
]

In [17]:
# Display the rules that remain after filtering
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,"(F4, A3)",H4,0.090323,0.446237,0.07957,0.880952,1.974182,0.039265,4.651613
49,"(F4, C3)",H4,0.086022,0.446237,0.075269,0.875,1.960843,0.036883,4.430108
