In [1]:
import pymysql
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.fpm import PrefixSpan

In [2]:
# Crwaling data from the database
db = pymysql.connect(
    host='127.0.0.1',
    user='root',
    passwd='',
    port=3306,
    db='patientdb',
    charset='utf8')
cur = db.cursor()
cur.execute(
    'SELECT MASTERPATIENTID, DISPENSECALENDARDATE, PBSDISEASEGROUP FROM nd_patient_history')
res = cur.fetchall()
df = pd.DataFrame(list(res))
df = df.rename(
    columns={
        0: 'MASTERPATIENTID',
        1: 'DISPENSECALENDARDATE',
        2: 'PBSDISEASEGROUP'})

In [107]:
# if not from database, then simply read the file into pandas
df = pd.read_csv("/Users/lina/desktop/CAPSTONE/final_result.csv")

In [108]:
# Order by name and date 
data = df.sort_values(
    ['MASTERPATIENTID', 'DISPENSECALENDARDATE'], ascending=[1, 1])

# drop duplicate records and leaves the first occurrence
data.drop_duplicates(
    subset=[
        'MASTERPATIENTID',
        'PBSDISEASEGROUP'],
    keep='first',
    inplace=True)

# Total Transactions
N = len(data)

In [109]:
def toli(disea):
    li = []
    li.append(disea)
    return li


def search_freq(disease,freqTable):
    try:
        num = int(freqTable[freqTable.itemset == disease].freq)
    except TypeError:
        num = 0
    return num


def calculate_lift(joint_count,ant_count,cons_count):
    if ant_count*cons_count==0:
        num = 0
        return num
    else: 
        return (joint_count*N)/(ant_count*cons_count)


def calculate_confidence(joint_count,ant_count):
    if ant_count ==0:
        num = 0
        return num
    else:
        return joint_count/ant_count

In [110]:
def prefix_rule(data,num):
    data = pd.DataFrame(data)
    # Transform to spark dataframe
    spark = SparkSession.builder.getOrCreate()
    data = spark.createDataFrame(data)
    data = data.drop('MASTERPATIENTID')
    #Create Model
    print (num)
    prefixSpan = PrefixSpan(minSupport=0.01, maxPatternLength=num,sequenceCol='PBSDISEASEGROUP') # 2=1:1 3=2:1
    freq = prefixSpan.findFrequentSequentialPatterns(data).sort("freq",ascending=False)
    df = freq.toPandas()
    
    return df

In [111]:
def create_table(df):
    
    AssoRule = pd.DataFrame(columns=('antecedent', 'consequent', 'joint_freq'))
    freqTable = pd.DataFrame(columns=('itemset', 'freq'))

    idx1 = 0
    idx2 = 0

    for i in range(len(df)):
    
        if len(df.loc[i]['sequence']) > 1:
            AssoRule.loc[idx1] = [df.loc[i]['sequence'][0], df.loc[i]['sequence'][1:], df.loc[i]['freq'] ]
            idx1 +=1
        
        else:
            freqTable.loc[idx2] = [df.loc[i]['sequence'][0], df.loc[i]['freq']]
            idx2 += 1
    
    return AssoRule,freqTable

In [112]:
def generate_statistics(AssoRule,freqTable):
    
    freqTable['itemset'] = freqTable['itemset'].apply(lambda x: ', '.join(each for each in x))
    AssoRule['antecedent'] = AssoRule['antecedent'].apply(lambda x: ', '.join(each for each in x))
    AssoRule['consequent'] = AssoRule['consequent'].apply(lambda x: ', '.join(each for item in x for each in item))
    AssoRule['ant_freq'] = AssoRule.apply(lambda row: search_freq(row['antecedent'],freqTable),axis=1)
    AssoRule['cons_freq'] = AssoRule.apply(lambda row: search_freq(row['consequent'],freqTable),axis=1)
    AssoRule['confidence'] = AssoRule.apply(lambda row: calculate_confidence(row['joint_freq'], row['ant_freq']), axis=1)
    AssoRule['lift'] = AssoRule.apply(lambda row: calculate_lift(row['joint_freq'], row['ant_freq'], row['cons_freq']), axis=1)
    
    return AssoRule

Confidence is a conditional probability $P(Y|X) =C(X,Y)/C(X) $ which means if a patient got disease X first, then what is the probability this patient got disease B next.

The Lift measures the probability of X and Y occurring together divided by the probability of X and Y occurring if they were independent events.That is, $P(Y|X) =P(X,Y)/P(X)*P(Y)$. If X and Y are independent then the Lift == 1. If they occur together more often than if they were independent, then Lift > 1.

## Not Group Concurrent Disease

In [9]:
# Not group concurrent disease means if multiple disease happens concurrently, we think it is happened in seqence.
# Grouping concurrent disease is preferable.
data['PBSDISEASEGROUP'] = data['PBSDISEASEGROUP'].apply(toli)
data = data.groupby(['MASTERPATIENTID'])['PBSDISEASEGROUP'].apply(list).reset_index()
df = prefix_rule(data)
AssoRule,freqTable = create_table(df)
AssoRule = generate_statistics(AssoRule,freqTable)

## Group Concurrent Disease

In [113]:
# groupby the diseases of the same date
data = data.groupby(['MASTERPATIENTID', 'DISPENSECALENDARDATE'])['PBSDISEASEGROUP'].apply(list).reset_index()
# group all the records for each person in sequence
data = data.groupby(['MASTERPATIENTID'])['PBSDISEASEGROUP'].apply(list).reset_index()

In [54]:
df3 = prefix_rule(data,3)
AssoRule3,freqTable3 = create_table(df3)
AssoRule3 = generate_statistics(AssoRule3,freqTable3)
AssoRule3.sort_values(by="confidence" , ascending=False)[1:10]

Unnamed: 0,antecedent,consequent,joint_freq,ant_freq,cons_freq,confidence,lift
2405,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Bacterial infection,3126,5665,152087,0.551809,9.054478
2356,"Lipid-lowering drugs, Hypertension",Bacterial infection,3171,5983,152087,0.530002,8.696642
1434,"Oesophageal, Lipid-lowering drugs",Severe pain,4168,8145,123948,0.511725,10.302996
2516,"Lipid-lowering drugs, Hypertension",Arthritis,3057,5983,127796,0.510948,9.977589
2788,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Severe pain,2881,5665,123948,0.508561,10.2393
1493,"Oesophageal, Lipid-lowering drugs",Arthritis,4081,8145,127796,0.501044,9.784186
2914,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Arthritis,2811,5665,127796,0.496205,9.689695
2695,"Lipid-lowering drugs, Hypertension",Severe pain,2933,5983,123948,0.490222,9.870064
1566,"Oesophageal, Lipid-lowering drugs",Corticosteroid-responsive dermatoses,3972,8145,110714,0.487661,10.992135


In [55]:
df4 = prefix_rule(data,4)
AssoRule4,freqTable4 = create_table(df4)
AssoRule4 = generate_statistics(AssoRule4,freqTable4)
AssoRule4.sort_values(by="confidence" , ascending=False)[1:10]

Unnamed: 0,antecedent,consequent,joint_freq,ant_freq,cons_freq,confidence,lift
2754,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Bacterial infection,3126,5665,152087,0.551809,9.054478
2689,"Lipid-lowering drugs, Hypertension",Bacterial infection,3171,5983,152087,0.530002,8.696642
1569,"Oesophageal, Lipid-lowering drugs",Severe pain,4168,8145,123948,0.511725,10.302996
2887,"Lipid-lowering drugs, Hypertension",Arthritis,3057,5983,127796,0.510948,9.977589
3228,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Severe pain,2881,5665,123948,0.508561,10.2393
1635,"Oesophageal, Lipid-lowering drugs",Arthritis,4081,8145,127796,0.501044,9.784186
3383,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Arthritis,2811,5665,127796,0.496205,9.689695
3112,"Lipid-lowering drugs, Hypertension",Severe pain,2933,5983,123948,0.490222,9.870064
1721,"Oesophageal, Lipid-lowering drugs",Corticosteroid-responsive dermatoses,3972,8145,110714,0.487661,10.992135


In [60]:
df5 = prefix_rule(data,5)
AssoRule5,freqTable5 = create_table(df5)
AssoRule5 = generate_statistics(AssoRule5,freqTable5)
AssoRule5.sort_values(by="confidence" , ascending=False)[1:10]

5


Unnamed: 0,antecedent,consequent,joint_freq,ant_freq,cons_freq,confidence,lift
2821,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Bacterial infection,3126,5665,152087,0.551809,9.054478
2753,"Lipid-lowering drugs, Hypertension",Bacterial infection,3171,5983,152087,0.530002,8.696642
1587,"Oesophageal, Lipid-lowering drugs",Severe pain,4168,8145,123948,0.511725,10.302996
2962,"Lipid-lowering drugs, Hypertension",Arthritis,3057,5983,127796,0.510948,9.977589
3313,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Severe pain,2881,5665,123948,0.508561,10.2393
1654,"Oesophageal, Lipid-lowering drugs",Arthritis,4081,8145,127796,0.501044,9.784186
3477,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Arthritis,2811,5665,127796,0.496205,9.689695
3197,"Lipid-lowering drugs, Hypertension",Severe pain,2933,5983,123948,0.490222,9.870064
1744,"Oesophageal, Lipid-lowering drugs",Corticosteroid-responsive dermatoses,3972,8145,110714,0.487661,10.992135


In [114]:
df11 = prefix_rule(data,11)
AssoRule11,freqTable11 = create_table(df11)
AssoRule11 = generate_statistics(AssoRule11,freqTable11)
AssoRule11.sort_values(by="confidence" , ascending=False)[1:10]

11


Unnamed: 0,antecedent,consequent,joint_freq,ant_freq,cons_freq,confidence,lift
2962,"Oesophageal, Lipid-lowering drugs, Zollinger-E...",Bacterial infection,3058,5482,152087,0.557826,9.153197
2822,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Bacterial infection,3126,5665,152087,0.551809,9.054478
2753,"Lipid-lowering drugs, Hypertension",Bacterial infection,3171,5983,152087,0.530002,8.696642
3448,"Oesophageal, Lipid-lowering drugs, Zollinger-E...",Severe pain,2821,5482,123948,0.514593,10.360745
1587,"Oesophageal, Lipid-lowering drugs",Severe pain,4168,8145,123948,0.511725,10.302996
2964,"Lipid-lowering drugs, Hypertension",Arthritis,3057,5983,127796,0.510948,9.977589
3316,"Lipid-lowering drugs, Zollinger-Ellison syndrome",Severe pain,2881,5665,123948,0.508561,10.2393
3598,"Oesophageal, Lipid-lowering drugs, Zollinger-E...",Arthritis,2762,5482,127796,0.503831,9.838612
1654,"Oesophageal, Lipid-lowering drugs",Arthritis,4081,8145,127796,0.501044,9.784186


In [115]:
top50 = AssoRule11.sort_values(by="confidence" , ascending=False)[1:51]

In [118]:
outputpath="/Users/lina/desktop/CAPSTONE/top50_rules_disease.csv"
top50.to_csv(outputpath,sep=',',index=False,header=True)