# BPIC 2017 event log preparation for experiments

## Dependencies

In [1]:
import pandas as pd
from preparation import save_csv, read_csv, Preparation
import networkx as nx
import numpy as np
from sklearn.model_selection import train_test_split

## Constants

In [2]:
# Single machine-specific root. Every other path is derived from it so that
# all reads and writes of this notebook land in the same data directory
# (previously some paths pointed at a Windows checkout and others at a macOS
# checkout). Forward slashes are accepted by Python on Windows as well.
DATA_DIR = r'/Users/lukas/Desktop/CI_Experiments/CI_Experiments/data'
PREPARED_DIR = DATA_DIR + '/prepared_process_logs/BPIC_17'

# Input: raw (unprepared) BPIC17 event log.
DATASET_PATH = DATA_DIR + '/unprepared_process_logs/BPIC17_O_Accepted.csv'

# Outputs: activity graph plus train/test files for both preparation variants.
TRUE_GRAPH_PATH = PREPARED_DIR + '/true_graph.csv'
DATA_1_TRAIN_PATH = PREPARED_DIR + '/activities/train.csv'
DATA_1_TEST_PATH = PREPARED_DIR + '/activities/test.csv'
DATA_2_TRAIN_PATH = PREPARED_DIR + '/activities_numeric2/train.csv'
DATA_2_TEST_PATH = PREPARED_DIR + '/activities_numeric2/test.csv'

# Column names used throughout the notebook.
CASE_ID = 'Case ID'
ACTIVITY = 'Activity'
LABEL = 'label'          # raw textual label column ('deviant' / 'regular')
OUTCOME = 'Outcome'      # binary target derived from LABEL
TIMESTAMP = 'time:timestamp'

## Code

In [3]:
# Load the raw event log from DATASET_PATH using the project's read_csv helper.
data = read_csv(path=DATASET_PATH)
# Preview the first rows (one row per event, see the Activity / Case ID columns).
data.head()

Unnamed: 0,ApplicationType,LoanGoal,RequestedAmount,Case ID,label,Activity,org:resource,Action,EventOrigin,lifecycle:transition,...,CreditScore,timesincelastevent,timesincecasestart,timesincemidnight,event_nr,month,weekday,hour,open_cases,time:timestamp
0,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Create Application,User_1,Created,Application,complete,...,0.0,0.0,0.0,591,1,1,4,9,0,2016-01-01 09:51:15.304
1,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Submitted,User_1,statechange,Application,complete,...,0.0,0.0008,0.0008,591,2,1,4,9,1,2016-01-01 09:51:15.352
2,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Created,Workflow,schedule,...,0.0,0.007033,0.007833,591,3,1,4,9,1,2016-01-01 09:51:15.774
3,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Deleted,Workflow,withdraw,...,0.0,1.343633,1.351467,592,4,1,4,9,1,2016-01-01 09:52:36.392
4,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Complete application,User_1,Created,Workflow,schedule,...,0.0,0.000183,1.35165,592,5,1,4,9,1,2016-01-01 09:52:36.403


In [4]:
# Order events case by case and, within each case, by timestamp.
# NOTE(review): the timestamp column is listed as object dtype further down,
# so this is a string sort; it is chronological only as long as every
# timestamp uses the same ISO-like format - confirm.
data = data.sort_values(by=[CASE_ID, TIMESTAMP])

### 1 Preparation (Only Activities (v1) + Outcome)

In [5]:
# Work on an independent deep copy so the sorted log in `data` stays untouched.
data1 = data.copy(deep=True)
# List the available columns.
data1.columns

Index(['ApplicationType', 'LoanGoal', 'RequestedAmount', 'Case ID', 'label',
       'Activity', 'org:resource', 'Action', 'EventOrigin',
       'lifecycle:transition', 'Accepted', 'Selected', 'FirstWithdrawalAmount',
       'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'CreditScore',
       'timesincelastevent', 'timesincecasestart', 'timesincemidnight',
       'event_nr', 'month', 'weekday', 'hour', 'open_cases', 'time:timestamp'],
      dtype='object')

In [6]:
# Variant 1 only needs the case identifier, the activity name and the raw label.
data1 = data1[
    [CASE_ID, ACTIVITY, LABEL]
]
data1.head()

Unnamed: 0,Case ID,Activity,label
686058,Application_1000086665,A_Create Application,regular
686059,Application_1000086665,A_Submitted,regular
686060,Application_1000086665,W_Handle leads,regular
686061,Application_1000086665,W_Handle leads,regular
686062,Application_1000086665,W_Complete application,regular


In [7]:
# Derive the binary target: 1 for 'deviant' events, 0 for everything else.
# `assign` returns a new frame instead of writing a column into the slice
# produced by the previous column selection, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
data1 = data1.assign(**{OUTCOME: np.where(data1[LABEL] == 'deviant', 1, 0)})
data1.head()

Unnamed: 0,Case ID,Activity,label,Outcome
686058,Application_1000086665,A_Create Application,regular,0
686059,Application_1000086665,A_Submitted,regular,0
686060,Application_1000086665,W_Handle leads,regular,0
686061,Application_1000086665,W_Handle leads,regular,0
686062,Application_1000086665,W_Complete application,regular,0


In [10]:
# The textual label is now redundant: keep case id, activity and the 0/1 target.
data1 = data1[
    [CASE_ID, ACTIVITY, OUTCOME]
]
data1.head()

Unnamed: 0,Case ID,Activity,Outcome
686058,Application_1000086665,A_Create Application,0
686059,Application_1000086665,A_Submitted,0
686060,Application_1000086665,W_Handle leads,0
686061,Application_1000086665,W_Handle leads,0
686062,Application_1000086665,W_Complete application,0


In [11]:
# Hand the event-level frame (Case ID, Activity, Outcome) to the project's
# Preparation helper, which is used below for graph creation and encoding.
preparation = Preparation(data1)

In [12]:
# Presumably derives the activity graph from the log and writes it to
# TRUE_GRAPH_PATH - TODO confirm against the preparation module.
preparation.create_activity_graph(path=TRUE_GRAPH_PATH)

In [13]:
# Encode the Activity column. Judging by the preview below, the result is
# indexed by Case ID (one row per case) with one Activity_<name> column of
# 0/1 values per activity, next to the Outcome column.
preparation.encode(columns_to_encode=[ACTIVITY])
preparation.encoded_data.head()

Unnamed: 0_level_0,Outcome,Activity_A_Accepted,Activity_A_Cancelled,Activity_A_Complete,Activity_A_Concept,Activity_A_Create Application,Activity_A_Denied,Activity_A_Incomplete,Activity_A_Pending,Activity_A_Submitted,...,Activity_O_Sent (mail and online),Activity_O_Sent (online only),Activity_W_Assess potential fraud,Activity_W_Call after offers,Activity_W_Call incomplete files,Activity_W_Complete application,Activity_W_Handle leads,Activity_W_Personal Loan collection,Activity_W_Shortened completion,Activity_W_Validate application
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_1000086665,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
Application_1000158214,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
Application_1000311556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
Application_1000334415,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
Application_1000339879,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0


In [14]:
# Hold out 20% of the encoded rows as a test set; the fixed seed keeps the
# split reproducible. The whole frame is split, so both parts still carry
# the Outcome column.
encoded_data = preparation.encoded_data
X_train, X_test = train_test_split(
    encoded_data,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Persist the variant-1 train and test parts via the project's save_csv helper.
save_csv(X_train, DATA_1_TRAIN_PATH)
save_csv(X_test, DATA_1_TEST_PATH)

### 2 Preparation (Activities + Numeric Attributes + Outcome)

In [5]:
# Start variant 2 from a fresh deep copy of the sorted event log.
data2 = data.copy(deep=True)
data2.head()

Unnamed: 0,ApplicationType,LoanGoal,RequestedAmount,Case ID,label,Activity,org:resource,Action,EventOrigin,lifecycle:transition,...,CreditScore,timesincelastevent,timesincecasestart,timesincemidnight,event_nr,month,weekday,hour,open_cases,time:timestamp
686058,New credit,"Other, see explanation",5000.0,Application_1000086665,regular,A_Create Application,User_1,Created,Application,complete,...,0.0,0.0,0.0,957,1,8,2,15,2249,2016-08-03 15:57:21.673
686059,New credit,"Other, see explanation",5000.0,Application_1000086665,regular,A_Submitted,User_1,statechange,Application,complete,...,0.0,0.001017,0.001017,957,2,8,2,15,2250,2016-08-03 15:57:21.734
686060,New credit,"Other, see explanation",5000.0,Application_1000086665,regular,W_Handle leads,User_1,Created,Workflow,schedule,...,0.0,0.003817,0.004833,957,3,8,2,15,2250,2016-08-03 15:57:21.963
686061,New credit,"Other, see explanation",5000.0,Application_1000086665,regular,W_Handle leads,User_1,Deleted,Workflow,withdraw,...,0.0,1.105383,1.110217,958,4,8,2,15,2250,2016-08-03 15:58:28.286
686062,New credit,"Other, see explanation",5000.0,Application_1000086665,regular,W_Complete application,User_1,Created,Workflow,schedule,...,0.0,0.000117,1.110333,958,5,8,2,15,2250,2016-08-03 15:58:28.293


In [6]:
# List the available columns before choosing the ones to keep.
data2.columns

Index(['ApplicationType', 'LoanGoal', 'RequestedAmount', 'Case ID', 'label',
       'Activity', 'org:resource', 'Action', 'EventOrigin',
       'lifecycle:transition', 'Accepted', 'Selected', 'FirstWithdrawalAmount',
       'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'CreditScore',
       'timesincelastevent', 'timesincecasestart', 'timesincemidnight',
       'event_nr', 'month', 'weekday', 'hour', 'open_cases', 'time:timestamp'],
      dtype='object')

In [7]:
# Inspect column dtypes to identify the numeric attributes.
data2.dtypes

ApplicationType           object
LoanGoal                  object
RequestedAmount          float64
Case ID                   object
label                     object
Activity                  object
org:resource              object
Action                    object
EventOrigin               object
lifecycle:transition      object
Accepted                  object
Selected                  object
FirstWithdrawalAmount    float64
MonthlyCost              float64
NumberOfTerms            float64
OfferedAmount            float64
CreditScore              float64
timesincelastevent       float64
timesincecasestart       float64
timesincemidnight          int64
event_nr                   int64
month                      int64
weekday                    int64
hour                       int64
open_cases                 int64
time:timestamp            object
dtype: object

In [8]:
# Numeric attributes kept as features in addition to the activities.
numeric_columns = [
    'RequestedAmount',
    'FirstWithdrawalAmount',
    'MonthlyCost',
    'NumberOfTerms',
    'OfferedAmount',
    'CreditScore',
    'open_cases',
]
# Full selection: identifiers and raw label first, then the numeric attributes.
columns_to_take = [CASE_ID, ACTIVITY, LABEL, *numeric_columns]

In [9]:
# Keep the selected columns, derive the binary target (1 for 'deviant',
# 0 otherwise) and drop the textual label.
# Written as one chain with `assign` so that no column is written into the
# slice returned by the column selection (avoids SettingWithCopyWarning).
# The resulting column order is unchanged: selected columns minus the label,
# followed by Outcome.
# NOTE: the cell overwrites `data2`, so re-run the `data2 = data.copy()` cell
# before executing it a second time (the label column is gone afterwards).
data2 = (
    data2[columns_to_take]
    .assign(**{OUTCOME: np.where(data2[LABEL] == 'deviant', 1, 0)})
    .drop(columns=[LABEL])
)
data2.head()

Unnamed: 0,Case ID,Activity,RequestedAmount,FirstWithdrawalAmount,MonthlyCost,NumberOfTerms,OfferedAmount,CreditScore,open_cases,Outcome
686058,Application_1000086665,A_Create Application,5000.0,0.0,0.0,0.0,0.0,0.0,2249,0
686059,Application_1000086665,A_Submitted,5000.0,0.0,0.0,0.0,0.0,0.0,2250,0
686060,Application_1000086665,W_Handle leads,5000.0,0.0,0.0,0.0,0.0,0.0,2250,0
686061,Application_1000086665,W_Handle leads,5000.0,0.0,0.0,0.0,0.0,0.0,2250,0
686062,Application_1000086665,W_Complete application,5000.0,0.0,0.0,0.0,0.0,0.0,2250,0


In [10]:
# Hand the variant-2 frame (activities, numeric attributes, Outcome) to the
# project's Preparation helper.
preparation2 = Preparation(data2)

In [11]:
# Target-encode the Activity column. Judging by the preview below, the result
# is indexed by Case ID with one fractional Activity_<name> column per activity.
# NOTE(review): this runs on the full data set, before the train/test split
# in a later cell. If target_encode derives its values from Outcome, the test
# rows' labels leak into the encoded features - confirm, and if so fit the
# encoding on the training part only.
preparation2.target_encode(columns_to_encode=[ACTIVITY])
preparation2.encoded_data.head()

Unnamed: 0_level_0,RequestedAmount,FirstWithdrawalAmount,MonthlyCost,NumberOfTerms,OfferedAmount,CreditScore,open_cases,Outcome,Activity_A_Accepted,Activity_A_Cancelled,...,Activity_O_Sent (mail and online),Activity_O_Sent (online only),Activity_W_Assess potential fraud,Activity_W_Call after offers,Activity_W_Call incomplete files,Activity_W_Complete application,Activity_W_Handle leads,Activity_W_Personal Loan collection,Activity_W_Shortened completion,Activity_W_Validate application
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_1000086665,5000.0,5000.0,241.28,22.0,5000.0,0.0,2306.0,0.0,0.400061,0.0,...,0.0,0.35426,0.0,0.400061,0.40722,0.409135,0.315888,0.0,0.0,0.0
Application_1000158214,12500.0,0.0,250.0,57.0,12500.0,929.0,1659.0,1.0,0.0,0.0,...,0.0,0.394822,0.0,0.40722,0.742512,0.0,0.0,0.0,0.0,0.742512
Application_1000311556,45000.0,0.0,500.0,111.0,45000.0,0.0,1564.0,0.0,0.0,0.0,...,0.0,0.40722,0.0,0.0,0.40722,0.0,0.0,0.0,0.0,0.400061
Application_1000334415,5000.0,5000.0,152.82,36.0,5000.0,0.0,2679.0,0.0,0.400061,0.0,...,0.566506,0.0,0.0,0.568225,0.40722,0.576641,0.315888,0.0,0.0,0.576641
Application_1000339879,37500.0,0.0,375.0,126.0,37500.0,798.0,1720.0,1.0,0.55203,0.409135,...,0.566506,0.0,0.0,0.566506,0.0,0.566506,0.742512,0.0,0.0,0.576641


In [12]:
# Same 80/20 split with the same seed as in variant 1. Note that this rebinds
# `encoded_data`, `X_train` and `X_test` to the variant-2 data.
encoded_data = preparation2.encoded_data
X_train, X_test = train_test_split(
    encoded_data,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Persist the variant-2 train and test parts via the project's save_csv helper.
save_csv(X_train, DATA_2_TRAIN_PATH)
save_csv(X_test, DATA_2_TEST_PATH)