In [1]:
import pymysql
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.fpm import PrefixSpan

In [2]:
# Crwaling data from the database
db = pymysql.connect(
    host='127.0.0.1',
    user='root',
    passwd='',
    port=3306,
    db='patientdb',
    charset='utf8')
cur = db.cursor()
cur.execute(
    'SELECT MASTERPATIENTID, DISPENSECALENDARDATE, PBSDISEASEGROUP FROM nd_patient_history')
res = cur.fetchall()
df = pd.DataFrame(list(res))
df = df.rename(
    columns={
        0: 'MASTERPATIENTID',
        1: 'DISPENSECALENDARDATE',
        2: 'PBSDISEASEGROUP'})

In [3]:
# Simple processing
df['PBSDISEASEGROUP'] = df.PBSDISEASEGROUP.str.replace(
    'Infection where resistance to amoxycillin is suspected',
    'Infections where resistance to amoxycillin is suspected')

df['PBSDISEASEGROUP'] = df.PBSDISEASEGROUP.str.replace(
    'who',
    'that')

# Order by name and date 
data = df.sort_values(
    ['MASTERPATIENTID', 'DISPENSECALENDARDATE'], ascending=[1, 1])

# drop duplicate records and leaves the first occurrence
data.drop_duplicates(
    subset=[
        'MASTERPATIENTID',
        'PBSDISEASEGROUP'],
    keep='first',
    inplace=True)

# Total Transactions
N = len(data)

In [4]:
def toli(disea):
    li = []
    li.append(disea)
    return li


def search_freq(disease):
    return int(freqTable[freqTable.itemset == disease].freq)


def calculate_lift(joint_count,ant_count,cons_count):
    # support(joint)/support(antecedent)*support(consequent)
    return (joint_count*N)/(ant_count*cons_count)


def calculate_confidence(joint_count,ant_count):
    # P(Y|X) = C(X,Y) / C(X)
    return joint_count/ant_count

In [5]:
def prefix_rule(data):
    
    data = pd.DataFrame(data)
    # Transform to spark dataframe
    spark = SparkSession.builder.getOrCreate()
    data = spark.createDataFrame(data)
    data = data.drop('MASTERPATIENTID')
    #Create Model
    prefixSpan = PrefixSpan(minSupport=0.01, maxPatternLength=2,sequenceCol='PBSDISEASEGROUP')
    freq = prefixSpan.findFrequentSequentialPatterns(data).sort("freq",ascending=False)
    df = freq.toPandas()
    
    return df

In [6]:
def create_table(df):
    
    AssoRule = pd.DataFrame(columns=('antecedent', 'consequent', 'joint_freq'))
    freqTable = pd.DataFrame(columns=('itemset', 'freq'))

    idx1 = 0
    idx2 = 0

    for i in range(len(df)):
    
        if len(df.loc[i]['sequence']) > 1:
            AssoRule.loc[idx1] = [df.loc[i]['sequence'][0], df.loc[i]['sequence'][1:], df.loc[i]['freq'] ]
            idx1 +=1
        
        else:
            freqTable.loc[idx2] = [df.loc[i]['sequence'][0], df.loc[i]['freq']]
            idx2 += 1
    
    return AssoRule,freqTable

In [7]:
def generate_statistics(AssoRule,freqTable):
    
    freqTable['itemset'] = freqTable['itemset'].apply(lambda x: ', '.join(each for each in x))
    AssoRule['antecedent'] = AssoRule['antecedent'].apply(lambda x: ', '.join(each for each in x))
    AssoRule['consequent'] = AssoRule['consequent'].apply(lambda x: ', '.join(each for item in x for each in item))
    AssoRule['ant_freq'] = AssoRule.apply(lambda row: search_freq(row['antecedent']),axis=1)
    AssoRule['cons_freq'] = AssoRule.apply(lambda row: search_freq(row['consequent']),axis=1)
    AssoRule['confidence'] = AssoRule.apply(lambda row: calculate_confidence(row['joint_freq'], row['ant_freq']), axis=1)
    AssoRule['lift'] = AssoRule.apply(lambda row: calculate_lift(row['joint_freq'], row['ant_freq'], row['cons_freq']), axis=1)
    
    return AssoRule

Confidence is a conditional probability $P(Y|X) =C(X,Y)/C(X) $ which means if a patient got disease X first, then what is the probability this patient got disease B next.

The Lift measures the probability of X and Y occurring together divided by the probability of X and Y occurring if they were independent events.That is, $P(Y|X) =P(X,Y)/P(X)*P(Y)$. If X and Y are independent then the Lift == 1. If they occur together more often than if they were independent, then Lift > 1.

## Not Group Concurrent Disease

In [None]:
# groupby the diseases of the same date
#data_new = data_sorted.groupby(['MASTERPATIENTID', 'DISPENSECALENDARDATE'])['PBSDISEASEGROUP'].apply(list).reset_index()
# group all the records for each person in sequence
data['PBSDISEASEGROUP'] = data['PBSDISEASEGROUP'].apply(toli)
data = data.groupby(['MASTERPATIENTID'])['PBSDISEASEGROUP'].apply(list).reset_index()
df = prefix_rule(data)
AssoRule,freqTable = create_table(df)
AssoRule = generate_statistics(AssoRule,freqTable)

In [None]:
AssoRule

## Group Concurrent Disease

In [8]:
# groupby the diseases of the same date
data = data.groupby(['MASTERPATIENTID', 'DISPENSECALENDARDATE'])['PBSDISEASEGROUP'].apply(list).reset_index()
# group all the records for each person in sequence
data = data.groupby(['MASTERPATIENTID'])['PBSDISEASEGROUP'].apply(list).reset_index()
df = prefix_rule(data)
AssoRule,freqTable = create_table(df)
AssoRule = generate_statistics(AssoRule,freqTable)

In [9]:
AssoRule

Unnamed: 0,antecedent,consequent,joint_freq,ant_freq,cons_freq,confidence,lift
0,Corticosteroid-responsive dermatoses,Infections where resistance to amoxycillin is ...,561,2553,3312,0.219741,3.975118
1,Oesophageal,Infections where resistance to amoxycillin is ...,559,2318,3312,0.241156,4.362509
2,Corticosteroid-responsive dermatoses,Infections where resistance to amoxycillin is ...,559,2553,3316,0.218958,3.956169
3,Oesophageal,Infections where resistance to amoxycillin is ...,555,2318,3316,0.239431,4.326068
4,Infections where resistance to amoxycillin is ...,Corticosteroid-responsive dermatoses,533,3316,2553,0.160736,3.772161
5,Infections where resistance to amoxycillin is ...,Corticosteroid-responsive dermatoses,531,3312,2553,0.160326,3.762545
6,Arthritis,Infections where resistance to amoxycillin is ...,513,2323,3312,0.220835,3.994902
7,Arthritis,Infections where resistance to amoxycillin is ...,510,2323,3316,0.219544,3.966749
8,Major depressive disorders,Infections where resistance to amoxycillin is ...,494,1850,3312,0.267027,4.830512
9,Major depressive disorders,Infections where resistance to amoxycillin is ...,491,1850,3316,0.265405,4.795386
