In [1]:
import pandas as pd
# load the bank transaction dataset
df = pd.read_csv('D1.csv')
# info and the first 10 transactions
df

Unnamed: 0,patient_id,global_num,date,location,latitude,longitude
0,1000000001,2.0,22/01/2020,Gyeonggi-do_Gimpo-si,37.615246,126.715632
1,1000000001,2.0,24/01/2020,Seoul_Jung-gu,37.567241,127.005659
2,1000000002,5.0,26/01/2020,Seoul_Seongdong-gu,37.563992,127.029534
3,1000000002,5.0,27/01/2020,Seoul_Dongdaemun-gu,37.566262,127.065815
4,1000000002,5.0,28/01/2020,Seoul_Gangnam-gu,37.523674,127.046543
...,...,...,...,...,...,...
1504,6100000083,,6/03/2020,Daegu_Buk-gu,35.891794,128.588890
1505,6100000085,,16/03/2020,Gyeongsangnam-do_Changwon-si,35.227956,128.685595
1506,6100000086,,14/03/2020,Daegu_Dalseong-gun,35.857185,128.466686
1507,6100000090,,24/03/2020,Incheon_Jung-gu,37.460191,126.440696


##Task 1
# 1. What pre-processing was required on the dataset before building the association mining 
model? What variables did you include in the analysis? Justify your choice.
- convert date to correct format (todatetime)
- fill the missing values in "global_num"
- sort data by date column

In [2]:
# Converting date from OBJECT to DATETIME

df['date'] = pd.to_datetime(df['date'],infer_datetime_format=True)

#Replace null values with mode of blood_type

global_mode = df['global_num'].mode()[0]
df['global_num'].fillna(global_mode, inplace = True)

#sort data by date column
df= df.sort_values(by='date', ascending=True)


In [3]:
print(df.info())
print(df.head(20))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1509 entries, 0 to 748
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   patient_id  1509 non-null   int64         
 1   global_num  1509 non-null   float64       
 2   date        1509 non-null   datetime64[ns]
 3   location    1509 non-null   object        
 4   latitude    1509 non-null   float64       
 5   longitude   1509 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 82.5+ KB
None
      patient_id  global_num       date                 location   latitude  \
0     1000000001         2.0 2020-01-22     Gyeonggi-do_Gimpo-si  37.615246   
1095  2000000001         3.0 2020-01-24    Gyeonggi-do_Goyang-si  37.677860   
1098  2000000006        17.0 2020-01-24       Incheon_Namdong-gu  37.456256   
1     1000000001         2.0 2020-01-24            Seoul_Jung-gu  37.567241   
958   1300000001        16.0 

In [4]:
# group by account, then list all services
transactions = df.groupby(['patient_id'])['location'].apply(list)
print(transactions.head(5))


patient_id
1000000001                [Gyeonggi-do_Gimpo-si, Seoul_Jung-gu]
1000000002    [Seoul_Seongdong-gu, Seoul_Dongdaemun-gu, Seou...
1000000004                                  [Seoul_Jungnang-gu]
1000000005                                  [Seoul_Jungnang-gu]
1000000006                              [Gyeonggi-do_Goyang-si]
Name: location, dtype: object


In [5]:
pip install apyori

Note: you may need to restart the kernel to use updated packages.


In [6]:
from apyori import apriori
# type cast the transactions from pandas into normal list format and run apriori
transaction_list = list(transactions)
results = list(apriori(transaction_list, min_support=0.002, min_confidence=0.05))
# print first 5 rules
print(results[:5])


[RelationRecord(items=frozenset({'Busan_Yeonje-gu'}), support=0.05723905723905724, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Busan_Yeonje-gu'}), confidence=0.05723905723905724, lift=1.0)]), RelationRecord(items=frozenset({'Daegu_Jung-gu'}), support=0.05499438832772166, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Daegu_Jung-gu'}), confidence=0.05499438832772166, lift=1.0)]), RelationRecord(items=frozenset({'Incheon_Jung-gu'}), support=0.14927048260381592, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Incheon_Jung-gu'}), confidence=0.14927048260381592, lift=1.0)]), RelationRecord(items=frozenset({'Seoul_Dongjak-gu'}), support=0.08866442199775533, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Seoul_Dongjak-gu'}), confidence=0.08866442199775533, lift=1.0)]), RelationRecord(items=frozenset({'Seoul_Gangnam-gu'}), support=0.05611672278338945

In [7]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    for rule_set in results:
         for rule in rule_set.ordered_statistics:
     # items_base = left side of rules, items_add = right side
     # support, confidence and lift for respective rules
             rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                 rule_set.support, rule.confidence, rule.lift])
 
     # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])
result_df = convert_apriori_results_to_pandas_df(results)
print(result_df.head(20))


            Left_side          Right_side   Support  Confidence        Lift
0                         Busan_Yeonje-gu  0.057239    0.057239    1.000000
1                           Daegu_Jung-gu  0.054994    0.054994    1.000000
2                         Incheon_Jung-gu  0.149270    0.149270    1.000000
3                        Seoul_Dongjak-gu  0.088664    0.088664    1.000000
4                        Seoul_Gangnam-gu  0.056117    0.056117    1.000000
5                           Seoul_Jung-gu  0.063973    0.063973    1.000000
6                       Seoul_Jungnang-gu  0.088664    0.088664    1.000000
7                      Seoul_Yangcheon-gu  0.060606    0.060606    1.000000
8        Busan_Buk-gu    Busan_Gangseo-gu  0.003367    0.600000   76.371429
9    Busan_Gangseo-gu        Busan_Buk-gu  0.003367    0.428571   76.371429
10       Busan_Buk-gu     Busan_Yeonje-gu  0.003367    0.600000   10.482353
11    Busan_Yeonje-gu        Busan_Buk-gu  0.003367    0.058824   10.482353
12       Bus

In [8]:
# sort all acquired rules descending by lift
result_df = result_df.sort_values(by='Lift', ascending=False)
print(result_df.head(5))

                                 Left_side  \
368             Seoul_Jung-gu,Daegu_Buk-gu   
288         Chungcheongbuk-do_Jincheon-gun   
369          Seoul_Dongjak-gu,Daegu_Seo-gu   
293  Daegu_Suseong-gu,Gyeonggi-do_Suwon-si   
370             Seoul_Jung-gu,Daegu_Seo-gu   

                                Right_side   Support  Confidence   Lift  
368          Seoul_Dongjak-gu,Daegu_Seo-gu  0.002245    1.000000  445.5  
288  Daegu_Suseong-gu,Gyeonggi-do_Suwon-si  0.002245    1.000000  445.5  
369             Seoul_Jung-gu,Daegu_Buk-gu  0.002245    1.000000  445.5  
293         Chungcheongbuk-do_Jincheon-gun  0.002245    1.000000  445.5  
370          Daegu_Buk-gu,Seoul_Dongjak-gu  0.002245    0.666667  297.0  


3. List four most interesting routes taken by individuals who have tested positive for 
COVID19 and have travelled from Buk-gu City in Busan Province. 



In [9]:
result_df.loc[(result_df['Left_side'] == 'Busan_Buk-gu')]

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
260,Busan_Buk-gu,"Busan_Yeonje-gu,Gwangju_Buk-gu",0.002245,0.4,118.8
12,Busan_Buk-gu,Gwangju_Buk-gu,0.002245,0.4,118.8
255,Busan_Buk-gu,"Busan_Gangseo-gu,Busan_Yeonje-gu",0.002245,0.4,118.8
8,Busan_Buk-gu,Busan_Gangseo-gu,0.003367,0.6,76.371429
10,Busan_Buk-gu,Busan_Yeonje-gu,0.003367,0.6,10.482353


## Sequantial  

In [10]:

transactions = df.groupby(['patient_id'])['location'].apply(list)
sequences = transactions.values.tolist()
# show the first 5 sequences
print(sequences[:5])

[['Gyeonggi-do_Gimpo-si', 'Seoul_Jung-gu'], ['Seoul_Seongdong-gu', 'Seoul_Dongdaemun-gu', 'Seoul_Gangnam-gu'], ['Seoul_Jungnang-gu'], ['Seoul_Jungnang-gu'], ['Gyeonggi-do_Goyang-si']]


In [29]:
from collections import defaultdict
import subprocess
import re

''' Uses SPMF to find association rules in supplied transactions '''
def get_association_rules(sequences, min_sup, min_conf):
    # step 1: create required input for SPMF
    
    # prepare a dict to uniquely assign each item in the transactions to an int ID
    item_dict = defaultdict(int)
    output_dict = defaultdict(str)
    item_id = 1
    
    # write your sequences in SPMF format
    with open('seq_rule_input.txt', 'w+') as f:
        for sequence in sequences:
            z = []
            for itemset in sequence:
                # if there are multiple items in one itemset
                if isinstance(itemset, list):
                    for item in itemset:
                        if item not in item_dict:
                            item_dict[item] = item_id
                            item_id += 1

                        z.append(item_dict[item])
                else:
                    if itemset not in item_dict:
                        item_dict[itemset] = item_id
                        output_dict[str(item_id)] = itemset
                        item_id += 1
                    z.append(item_dict[itemset])
                    
                # end of itemset
                z.append(-1)
            
            # end of a sequence
            z.append(-2)
            f.write(' '.join([str(x) for x in z]))
            f.write('\n')
    
    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth', 
                     'seq_rule_input.txt', 'seq_rule_output.txt', 
                     supp_param, conf_param], shell=True)
    
    # read back the output rules
    outputs = open('seq_rule_output.txt', 'r').read().strip().split('\n')
    output_rules = []
    for rule in outputs:
        left, right, sup, conf = re.search(pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)', string=rule).groups()
        sup = int(sup) / len(sequences)
        conf = float(conf)
        output_rules.append([[output_dict[x] for x in left.split(',')], [output_dict[x] for x in right.split(',')], sup, conf])
    
    print(outputs)
    return pd.DataFrame(output_rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'])


In [25]:
# ...

def get_association_rules(sequences, min_sup, min_conf):
    # ...

    # Update the file path to the absolute file path of seq_rule_output.txt
    output_file_path = "Documents/Documents - Naman’s MacBook Pro/Sem 1 Courses/IFN509 Data Exploration & Mining/Assessment/Assessment 2/Project datasets/seq_rule_input.txt"

    # ...

    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth', 
                     'seq_rule_input.txt', output_file_path, 
                     supp_param, conf_param], shell=True)

    # ...

    # read back the output rules
    outputs = open(output_file_path, 'r').read().strip().split('\n')
    
    # ...

    return pd.DataFrame(output_rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'])


Three sequences 12 ==>5, 12==>6, 17==>45

In [26]:
get_association_rules(sequences, 0.01, 0.1)

Usage: java [-options] class [args...]
           (to execute a class)
   or  java [-options] -jar jarfile [args...]
           (to execute a jar file)
where options include:
    -d32	  use a 32-bit data model if available
    -d64	  use a 64-bit data model if available
    -server	  to select the "server" VM
                  The default VM is server,
                  because you are running on a server-class machine.


    -cp <class search path of directories and zip/jar files>
    -classpath <class search path of directories and zip/jar files>
                  A : separated list of directories, JAR archives,
                  and ZIP archives to search for class files.
    -D<name>=<value>
                  set a system property
    -verbose:[class|gc|jni]
                  enable verbose output
    -version      print product version and exit
    -version:<value>
                  in a future release.
                  require the specified version to run
    -showversion  print

FileNotFoundError: [Errno 2] No such file or directory: 'Documents/Documents - Naman’s MacBook Pro/Sem 1 Courses/IFN509 Data Exploration & Mining/Assessment/Assessment 2/Project datasets/seq_rule_input.txt'