In [1]:
# A notebook to prepare the BPIC 2012 log

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
# Names of the key columns in the event log.
case_id_col, activity_col, resource_col = "Case ID", "Activity", "Resource"
timestamp_col, label_col = "start_time", "label"

# Outcome labels written into the label column.
pos_label = "deviant"  # positive outcome that will be predicted
neg_label = "regular"  # negative outcome

# Feature groups: "dynamic" columns vary per event, "static" columns are per case.
dynamic_cat_cols = [activity_col, resource_col]
static_cat_cols = []
dynamic_num_cols = [
    "timesincelastevent", "timesincecasestart", "timesincemidnight",
    "event_nr", "month", "weekday", "hour", "open_cases",
]
static_num_cols = ["NumberOfOffers", "AMOUNT_REQ"]

# Derived column collections used by the imputation step.
static_cols = [*static_cat_cols, *static_num_cols, case_id_col, label_col]
dynamic_cols = [*dynamic_cat_cols, *dynamic_num_cols, timestamp_col]
cat_cols = [*dynamic_cat_cols, *static_cat_cols]

# Categorical levels occurring fewer times than this are collapsed into "other".
freq_threshold = 10

<IPython.core.display.Javascript object>

In [4]:
datasets = ["bpic2012"]
for dataset_name in datasets:
    # Each dataset is read from ./data/<name>/<name>.csv. `data` and
    # `dataset_name` keep the values of the last iteration and are used by
    # the cells below.
    data = pd.read_csv("./data/%s/%s.csv" % (dataset_name, dataset_name), sep=",")

    # Trim surrounding whitespace from the values of every object-dtype column.
    df_obj = data.select_dtypes(include=["object"])
    data[df_obj.columns] = df_obj.apply(lambda column: column.str.strip())

<IPython.core.display.Javascript object>

In [5]:
# Time Features

<IPython.core.display.Javascript object>

In [6]:
# Parse the event timestamps once; everything below relies on the datetime dtype.
data[timestamp_col] = pd.to_datetime(data[timestamp_col])

# Fill a missing resource with the most recent known resource of the same case.
# The frame is ordered by time only for the fill (stable sort, so rows with equal
# timestamps keep their current order); the assignment re-aligns the result on
# the original row index. `.ffill()` replaces the deprecated
# `fillna(method="ffill")`.
data[resource_col] = (
    data.sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)[resource_col]
    .ffill()
)

# Drop the "(case) " prefix from any column name that carries it.
data = data.rename(columns=lambda name: name.replace("(case) ", ""))

# Calendar features derived from the event timestamp.
event_time = data[timestamp_col].dt
data["timesincemidnight"] = event_time.hour * 60 + event_time.minute  # minutes
data["month"] = event_time.month
data["weekday"] = event_time.weekday
data["hour"] = event_time.hour

<IPython.core.display.Javascript object>

In [7]:
# Display the frame to inspect the calendar columns
# (timesincemidnight, month, weekday, hour).
data

Unnamed: 0,Case ID,start_time,end_time,AMOUNT_REQ,REG_DATE,Activity,Resource,timesincemidnight,month,weekday,hour
0,173688,2011-09-30 22:38:44.546,2011-09-30T22:38:44.546,20000,2011-09-30T22:38:44.546Z,A_SUBMITTED,112.0,1358,9,4,22
1,173688,2011-09-30 22:38:44.880,2011-09-30T22:38:44.880,20000,2011-09-30T22:38:44.546Z,A_PARTLYSUBMITTED,112.0,1358,9,4,22
2,173688,2011-09-30 22:39:37.906,2011-09-30T22:39:37.906,20000,2011-09-30T22:38:44.546Z,A_PREACCEPTED,112.0,1359,9,4,22
3,173688,2011-10-01 09:36:46.437,2011-10-01T09:45:13.917,20000,2011-09-30T22:38:44.546Z,W_Completeren aanvraag,112.0,576,10,5,9
4,173688,2011-10-01 09:42:43.308,2011-10-01T09:42:43.308,20000,2011-09-30T22:38:44.546Z,A_ACCEPTED,10862.0,582,10,5,9
...,...,...,...,...,...,...,...,...,...,...,...
164504,214373,2012-03-10 11:46:22.700,2012-03-10T11:50:54.881,8500,2012-02-29T22:43:09.766Z,W_Nabellen offertes,11119.0,706,3,5,11
164505,214376,2012-02-29 22:51:16.799,2012-02-29T22:51:16.799,15000,2012-02-29T22:51:16.799Z,A_SUBMITTED,112.0,1371,2,2,22
164506,214376,2012-02-29 22:51:17.423,2012-02-29T22:51:17.423,15000,2012-02-29T22:51:16.799Z,A_PARTLYSUBMITTED,112.0,1371,2,2,22
164507,214376,2012-03-01 08:26:46.736,2012-03-01T08:27:41.325,15000,2012-02-29T22:51:16.799Z,W_Afhandelen leads,11169.0,506,3,3,8


<IPython.core.display.Javascript object>

In [8]:
# data[data["Case ID"] == 214310]

<IPython.core.display.Javascript object>

In [9]:
def extract_timestamp_features_2(group):
    """Add timing, position and time-to-outcome features to the events of one case.

    Reads the notebook-level names ``timestamp_col``, ``activity_col`` and
    ``case_id_col``.

    Columns added (all durations are in minutes unless suffixed):
      * ``timesincelastevent`` - gap to the chronologically preceding event
        (0 for the earliest event), plus ``_days`` and ``_wks`` variants.
      * ``timesincecasestart`` - time since the earliest event of the case,
        plus ``_days`` and ``_wks`` variants.
      * ``event_nr`` - 1-based position of the event in chronological order.
      * ``time_to_event_m`` - minutes from the event to the first (in
        chronological order) outcome activity of the case; negative for
        events that occur after that activity.
      * ``case_length`` - number of events in the case.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``group`` sorted by timestamp with the columns above, or
        ``None`` (after printing a notice) when the case contains none of the
        outcome activities. Any other problem (e.g. a missing column) now
        raises instead of being swallowed by a bare ``except``.
    """
    outcome_activities = [
        "A_APPROVED",
        "A_REGISTERED",
        "A_ACTIVATED",
        "A_CANCELLED",
        "A_DECLINED",
    ]
    one_minute = np.timedelta64(1, "m")

    # Work on a copy so the caller's frame (the groupby slice) is never mutated.
    group = group.copy()
    group[timestamp_col] = pd.to_datetime(group[timestamp_col])

    # Newest first: shift(-1) then yields the chronologically previous event.
    # NOTE(review): for events sharing one timestamp, the stable descending sort
    # attributes the gap to the LAST row of the tied run and 0 to the others;
    # this is kept as in the original implementation.
    group = group.sort_values(timestamp_col, ascending=False, kind="mergesort")

    gap = group[timestamp_col] - group[timestamp_col].shift(-1)
    gap = gap.fillna(pd.Timedelta(seconds=0))  # earliest event has no predecessor
    group["timesincelastevent"] = (gap / one_minute).astype(float)

    # In descending order the last row is the earliest event of the case.
    elapsed = group[timestamp_col] - group[timestamp_col].iloc[-1]
    elapsed = elapsed.fillna(pd.Timedelta(seconds=0))
    group["timesincecasestart"] = (elapsed / one_minute).astype(float)

    group = group.sort_values(timestamp_col, ascending=True, kind="mergesort")
    group["event_nr"] = range(1, len(group) + 1)

    # 1440 minutes per day, 7 days per week.
    group["timesincecasestart_days"] = group["timesincecasestart"] / 1440
    group["timesincecasestart_wks"] = group["timesincecasestart_days"] / 7

    group["timesincelastevent_days"] = group["timesincelastevent"] / 1440
    group["timesincelastevent_wks"] = group["timesincelastevent_days"] / 7

    outcome_times = group.loc[
        group[activity_col].isin(outcome_activities), timestamp_col
    ]
    if outcome_times.empty:
        # Case without an outcome activity: report it and return None so the
        # caller can leave it out.
        print(f"False, continue, {set(group[case_id_col])}")
        return None

    last_e = outcome_times.iloc[0]
    minutes_to_outcome = (last_e - group[timestamp_col]).dt.total_seconds() / 60.0
    # The column is float minutes, so missing values are filled with 0.0
    # (the original filled with a Timedelta, which does not match the dtype).
    group["time_to_event_m"] = minutes_to_outcome.fillna(0.0)

    group["case_length"] = group["event_nr"].max()

    return group


def get_open_cases(date, dt_first_last_timestamps):
    """Count the cases that are open at ``date``.

    A case counts as open when its ``start_time`` is at or before ``date`` and
    its ``end_time`` is strictly after ``date``.

    Parameters
    ----------
    date : timestamp-like
        Moment to evaluate; must be comparable with the two timestamp columns.
    dt_first_last_timestamps : pandas.DataFrame
        One row per case with the columns ``"start_time"`` and ``"end_time"``.

    Returns
    -------
    integer
        Number of rows satisfying both conditions.
    """
    started = dt_first_last_timestamps["start_time"] <= date
    still_running = dt_first_last_timestamps["end_time"] > date
    # Vectorised sum instead of the Python builtin, which iterates element-wise.
    return (started & still_running).sum()


# data = data.groupby(case_id_col).apply(extract_timestamp_features_2)
# data

<IPython.core.display.Javascript object>

In [10]:
# def extract_timestamp_features(group):
#     group[timestamp_col] = pd.to_datetime(group[timestamp_col])

#     group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')

#     tmp = group[timestamp_col] - group[timestamp_col].shift(-1)
#     tmp = tmp.fillna(pd.Timedelta(seconds=0))
#     group["timesincelastevent"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm')))  # m is for minutes

#     tmp = group[timestamp_col] - group[timestamp_col].iloc[-1]
#     tmp = tmp.fillna(pd.Timedelta(seconds=0))
#     group["timesincecasestart"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm')))  # m is for minutes

#     group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
#     group["event_nr"] = range(1, len(group) + 1)
    
#     group["timesincecasestart_days"] = group["timesincecasestart"] / 1440
#     group["timesincecasestart_wks"] = group["timesincecasestart_days"] / 7

#     group["timesincelastevent_days"] = group["timesincelastevent"] / 1440
#     group["timesincelastevent_wks"] = group["timesincelastevent_days"] / 7

#     last_e = pd.to_datetime(
#     group[
#         group[activity_col].isin(
#             ["A_APPROVED", "A_REGISTERED", "A_ACTIVATED", "A_CANCELLED", "A_DECLINED",]
#         )
#     ][timestamp_col].values[0]
#     )
#     group["time_to_event_m"] = (
#         last_e - pd.to_datetime(group[timestamp_col])
#     ).dt.total_seconds() / 60.0
#     group["time_to_event_m"] = group["time_to_event_m"].fillna(pd.Timedelta(seconds=0))
# #     group["time_to_event_days"] = group["time_to_event_m"] / 1440
# #     group["time_to_event_wks"] = group["time_to_event_days"] / 7
    
#     group['case_length'] = group["event_nr"].max()

#     return group

# def get_open_cases(date, dt_first_last_timestamps):
#     return sum((dt_first_last_timestamps["start_time"] <= date) & (dt_first_last_timestamps["end_time"] > date))


<IPython.core.display.Javascript object>

In [11]:
print("Extracting timestamp features...")
# extract_timestamp_features_2 returns None for cases without an outcome
# activity and prints one line per such case.
data = data.groupby(case_id_col).apply(extract_timestamp_features_2)

print("Extracting open cases...")
data = data.sort_values([timestamp_col], ascending=True, kind="mergesort").reset_index(
    drop=True
)
# First and last event timestamp per case. The aggregations are named by string:
# passing the builtins `min`/`max` is deprecated in recent pandas releases.
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg(["min", "max"])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
# For every event, count the cases open at that event's timestamp.
data["open_cases"] = data[timestamp_col].apply(
    lambda x: get_open_cases(x, dt_first_last_timestamps)
)

Extracting timestamp features...
False, continue, {197219}
False, continue, {197437}
False, continue, {198017}
False, continue, {198996}
False, continue, {199678}
False, continue, {200389}
False, continue, {200725}
False, continue, {201042}
False, continue, {202038}
False, continue, {202307}
False, continue, {202358}
False, continue, {202521}
False, continue, {203362}
False, continue, {203428}
False, continue, {203504}
False, continue, {203630}
False, continue, {204131}
False, continue, {204496}
False, continue, {205199}
False, continue, {205334}
False, continue, {205604}
False, continue, {205694}
False, continue, {205990}
False, continue, {206002}
False, continue, {206177}
False, continue, {206192}
False, continue, {206264}
False, continue, {206558}
False, continue, {206600}
False, continue, {206636}
False, continue, {206721}
False, continue, {206886}
False, continue, {207011}
False, continue, {207032}
False, continue, {207389}
False, continue, {207434}
False, continue, {207777}
False

False, continue, {213534}
False, continue, {213543}
False, continue, {213572}
False, continue, {213575}
False, continue, {213615}
False, continue, {213621}
False, continue, {213630}
False, continue, {213639}
False, continue, {213651}
False, continue, {213660}
False, continue, {213669}
False, continue, {213690}
False, continue, {213696}
False, continue, {213699}
False, continue, {213714}
False, continue, {213729}
False, continue, {213738}
False, continue, {213750}
False, continue, {213768}
False, continue, {213771}
False, continue, {213783}
False, continue, {213795}
False, continue, {213807}
False, continue, {213813}
False, continue, {213837}
False, continue, {213849}
False, continue, {213855}
False, continue, {213858}
False, continue, {213870}
False, continue, {213879}
False, continue, {213888}
False, continue, {213891}
False, continue, {213894}
False, continue, {213918}
False, continue, {213921}
False, continue, {213927}
False, continue, {213939}
False, continue, {213942}
False, conti

<IPython.core.display.Javascript object>

In [12]:
# Order all events chronologically (stable sort: rows with equal timestamps keep
# their current relative order), then display the frame.
data = data.sort_values(by=timestamp_col, kind="mergesort")
data

Unnamed: 0,Case ID,start_time,end_time,AMOUNT_REQ,REG_DATE,Activity,Resource,timesincemidnight,month,weekday,...,timesincelastevent,timesincecasestart,event_nr,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length,open_cases
0,173688,2011-09-30 22:38:44.546,2011-09-30T22:38:44.546,20000,2011-09-30T22:38:44.546Z,A_SUBMITTED,112.0,1358,9,4,...,0.000000,0.000000,1,0.000000,0.000000e+00,0.000000,0.000000e+00,17878.744667,18,1
1,173688,2011-09-30 22:38:44.880,2011-09-30T22:38:44.880,20000,2011-09-30T22:38:44.546Z,A_PARTLYSUBMITTED,112.0,1358,9,4,...,0.005567,0.005567,2,0.000004,5.522487e-07,0.000004,5.522487e-07,17878.739100,18,1
2,173688,2011-09-30 22:39:37.906,2011-09-30T22:39:37.906,20000,2011-09-30T22:38:44.546Z,A_PREACCEPTED,112.0,1359,9,4,...,0.883767,0.889333,3,0.000618,8.822751e-05,0.000614,8.767526e-05,17877.855333,18,1
3,173691,2011-10-01 06:08:58.256,2011-10-01T06:08:58.256,5000,2011-10-01T06:08:58.256Z,A_SUBMITTED,112.0,368,10,5,...,0.000000,0.000000,1,0.000000,0.000000e+00,0.000000,0.000000e+00,13328.467517,26,2
4,173691,2011-10-01 06:09:02.195,2011-10-01T06:09:02.195,5000,2011-10-01T06:08:58.256Z,A_PARTLYSUBMITTED,112.0,369,10,5,...,0.065650,0.065650,2,0.000046,6.512897e-06,0.000046,6.512897e-06,13328.401867,26,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156957,208901,2012-03-14 14:31:28.859,2012-03-14T14:31:28.859,11000,2012-02-13T14:50:41.425Z,A_REGISTERED,11339.0,871,3,2,...,19.133400,43180.790567,20,29.986660,4.283809e+00,0.013287,1.898155e-03,0.000000,20,1
156958,206615,2012-03-14 14:33:57.651,2012-03-14T14:33:57.651,22000,2012-02-05T15:42:18.178Z,A_APPROVED,10809.0,873,3,2,...,0.000000,54651.657883,45,37.952540,5.421791e+00,0.000000,0.000000e+00,0.000000,48,0
156959,206615,2012-03-14 14:33:57.651,2012-03-14T14:33:57.651,22000,2012-02-05T15:42:18.178Z,A_REGISTERED,10809.0,873,3,2,...,0.000000,54651.657883,46,37.952540,5.421791e+00,0.000000,0.000000e+00,0.000000,48,0
156960,206615,2012-03-14 14:33:57.651,2012-03-14T14:33:57.651,22000,2012-02-05T15:42:18.178Z,O_ACCEPTED,10809.0,873,3,2,...,0.000000,54651.657883,47,37.952540,5.421791e+00,0.000000,0.000000e+00,0.000000,48,0


<IPython.core.display.Javascript object>

In [13]:
# Add label
# Positive, i.e., we need to predict : "A_CANCELLED", "A_DECLINED": pos_label
# negative : "A_APPROVED", "A_REGISTERED", "A_ACTIVATED": neg_label

<IPython.core.display.Javascript object>

In [14]:
def add_lable(group):
    """Attach the outcome label to every event of one case.

    The "label" column is set to ``neg_label`` when the case contains any of
    A_APPROVED / A_REGISTERED / A_ACTIVATED; otherwise to ``pos_label`` when it
    contains A_CANCELLED or A_DECLINED. The column is written onto ``group``
    itself, which is then returned.

    Returns ``None`` (after printing a notice) when the case contains none of
    these activities.
    """
    activity = group["Activity"]
    has_approval = activity.isin(["A_APPROVED", "A_REGISTERED", "A_ACTIVATED"]).any()

    if has_approval:
        outcome = neg_label
    elif activity.isin(["A_CANCELLED", "A_DECLINED"]).any():
        outcome = pos_label
    else:
        print("\nNot Complete\n")
        return None

    group["label"] = outcome
    return group


print("Add Label ...")
# Label the events case by case; add_lable returns None for a case that has no
# outcome activity.
data = (
    data.groupby(case_id_col, as_index=False)
    .apply(add_lable)
    .reset_index(drop=True)
)
data

Add Label ...


Unnamed: 0,Case ID,start_time,end_time,AMOUNT_REQ,REG_DATE,Activity,Resource,timesincemidnight,month,weekday,...,timesincecasestart,event_nr,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length,open_cases,label
0,173688,2011-09-30 22:38:44.546,2011-09-30T22:38:44.546,20000,2011-09-30T22:38:44.546Z,A_SUBMITTED,112.0,1358,9,4,...,0.000000,1,0.000000,0.000000e+00,0.000000,0.000000e+00,17878.744667,18,1,regular
1,173688,2011-09-30 22:38:44.880,2011-09-30T22:38:44.880,20000,2011-09-30T22:38:44.546Z,A_PARTLYSUBMITTED,112.0,1358,9,4,...,0.005567,2,0.000004,5.522487e-07,0.000004,5.522487e-07,17878.739100,18,1,regular
2,173688,2011-09-30 22:39:37.906,2011-09-30T22:39:37.906,20000,2011-09-30T22:38:44.546Z,A_PREACCEPTED,112.0,1359,9,4,...,0.889333,3,0.000618,8.822751e-05,0.000614,8.767526e-05,17877.855333,18,1,regular
3,173691,2011-10-01 06:08:58.256,2011-10-01T06:08:58.256,5000,2011-10-01T06:08:58.256Z,A_SUBMITTED,112.0,368,10,5,...,0.000000,1,0.000000,0.000000e+00,0.000000,0.000000e+00,13328.467517,26,2,regular
4,173691,2011-10-01 06:09:02.195,2011-10-01T06:09:02.195,5000,2011-10-01T06:08:58.256Z,A_PARTLYSUBMITTED,112.0,369,10,5,...,0.065650,2,0.000046,6.512897e-06,0.000046,6.512897e-06,13328.401867,26,2,regular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156957,208901,2012-03-14 14:31:28.859,2012-03-14T14:31:28.859,11000,2012-02-13T14:50:41.425Z,A_REGISTERED,11339.0,871,3,2,...,43180.790567,20,29.986660,4.283809e+00,0.013287,1.898155e-03,0.000000,20,1,regular
156958,206615,2012-03-14 14:33:57.651,2012-03-14T14:33:57.651,22000,2012-02-05T15:42:18.178Z,A_APPROVED,10809.0,873,3,2,...,54651.657883,45,37.952540,5.421791e+00,0.000000,0.000000e+00,0.000000,48,0,regular
156959,206615,2012-03-14 14:33:57.651,2012-03-14T14:33:57.651,22000,2012-02-05T15:42:18.178Z,A_REGISTERED,10809.0,873,3,2,...,54651.657883,46,37.952540,5.421791e+00,0.000000,0.000000e+00,0.000000,48,0,regular
156960,206615,2012-03-14 14:33:57.651,2012-03-14T14:33:57.651,22000,2012-02-05T15:42:18.178Z,O_ACCEPTED,10809.0,873,3,2,...,54651.657883,47,37.952540,5.421791e+00,0.000000,0.000000e+00,0.000000,48,0,regular


<IPython.core.display.Javascript object>

In [15]:
# Add treatment:
# Offer prepared and transmitted to applicant:
# O_SENT


<IPython.core.display.Javascript object>

In [16]:
# Distinct activity names present in the log.
set(data["Activity"])

{'A_ACCEPTED',
 'A_ACTIVATED',
 'A_APPROVED',
 'A_CANCELLED',
 'A_DECLINED',
 'A_FINALIZED',
 'A_PARTLYSUBMITTED',
 'A_PREACCEPTED',
 'A_REGISTERED',
 'A_SUBMITTED',
 'O_ACCEPTED',
 'O_CANCELLED',
 'O_CREATED',
 'O_DECLINED',
 'O_SELECTED',
 'O_SENT',
 'O_SENT_BACK',
 'W_Afhandelen leads',
 'W_Beoordelen fraude',
 'W_Completeren aanvraag',
 'W_Nabellen incomplete dossiers',
 'W_Nabellen offertes',
 'W_Valideren aanvraag'}

<IPython.core.display.Javascript object>

In [17]:
def add_Offere(group):
    """Add a per-case "NumberOfOffers" column counting the case's O_SENT events.

    ``group`` is the event frame; it is not modified. The counts are joined back
    onto the events with ``pd.merge`` on the case id using its default join, so
    the result has the case id first, then "NumberOfOffers", then the remaining
    columns of ``group``, and contains only cases that appear in the count table
    (i.e. cases with at least one O_SENT event).
    """
    # Keep only the O_SENT events; each one represents a sent offer.
    sent_events = group.loc[group[activity_col] == "O_SENT"]

    # One row per case: the case id and how many O_SENT events it has.
    offers_per_case = (
        sent_events.groupby([case_id_col])[activity_col].count().reset_index()
    )
    offers_per_case.columns = [case_id_col, "NumberOfOffers"]

    return pd.merge(offers_per_case, group, on=case_id_col)


print("Add no of offeres ...")
# Attach the per-case offer count to every event, then display the result.
data_offeres = add_Offere(group=data)
data_offeres

Add no of offeres ...


Unnamed: 0,Case ID,NumberOfOffers,start_time,end_time,AMOUNT_REQ,REG_DATE,Activity,Resource,timesincemidnight,month,...,timesincecasestart,event_nr,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length,open_cases,label
0,173688,1,2011-09-30 22:38:44.546,2011-09-30T22:38:44.546,20000,2011-09-30T22:38:44.546Z,A_SUBMITTED,112.0,1358,9,...,0.000000,1,0.000000,0.000000e+00,0.000000,0.000000e+00,17878.744667,18,1,regular
1,173688,1,2011-09-30 22:38:44.880,2011-09-30T22:38:44.880,20000,2011-09-30T22:38:44.546Z,A_PARTLYSUBMITTED,112.0,1358,9,...,0.005567,2,0.000004,5.522487e-07,0.000004,5.522487e-07,17878.739100,18,1,regular
2,173688,1,2011-09-30 22:39:37.906,2011-09-30T22:39:37.906,20000,2011-09-30T22:38:44.546Z,A_PREACCEPTED,112.0,1359,9,...,0.889333,3,0.000618,8.822751e-05,0.000614,8.767526e-05,17877.855333,18,1,regular
3,173688,1,2011-10-01 09:36:46.437,2011-10-01T09:45:13.917,20000,2011-09-30T22:38:44.546Z,W_Completeren aanvraag,112.0,576,10,...,658.031517,4,0.456966,6.528090e-02,0.456349,6.519268e-02,17220.713150,18,10,regular
4,173688,1,2011-10-01 09:42:43.308,2011-10-01T09:42:43.308,20000,2011-09-30T22:38:44.546Z,A_ACCEPTED,10862.0,582,10,...,663.979367,5,0.461097,6.587097e-02,0.004130,5.900645e-04,17214.765300,18,10,regular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115120,214361,1,2012-03-07 08:01:59.877,2012-03-07T08:01:59.877,5000,2012-02-29T22:11:37.974Z,O_SENT_BACK,10899.0,481,3,...,9230.365033,14,6.409976,9.157108e-01,0.000773,1.104878e-04,8779.704183,18,276,deviant
115121,214361,1,2012-03-12 15:12:08.468,2012-03-12T15:12:34.634,5000,2012-02-29T22:11:37.974Z,W_Valideren aanvraag,10809.0,912,3,...,16860.508217,15,11.708686,1.672669e+00,5.298711,7.569586e-01,1149.561000,18,85,deviant
115122,214361,1,2012-03-13 10:03:00.346,2012-03-13T10:21:45.661,5000,2012-02-29T22:11:37.974Z,W_Valideren aanvraag,10138.0,603,3,...,17991.372850,16,12.494009,1.784858e+00,0.785323,1.121890e-01,18.696367,18,62,deviant
115123,214361,1,2012-03-13 10:21:42.128,2012-03-13T10:21:42.128,5000,2012-02-29T22:11:37.974Z,A_DECLINED,10138.0,621,3,...,18010.069217,17,12.506993,1.786713e+00,0.000000,0.000000e+00,0.000000,18,61,deviant


<IPython.core.display.Javascript object>

In [18]:
# Distinct offer counts that occur across cases.
set(data_offeres["NumberOfOffers"])

{1, 2, 3, 4, 5, 6, 7}

<IPython.core.display.Javascript object>

In [19]:
# function to add treatment based on the number of offers
def add_treatment(df):
    """Add a "treatment" column derived from each case's number of offers.

    Every row of a case gets ``"treat"`` when the "NumberOfOffers" value on the
    case's first row is at most 1, and ``"noTreat"`` otherwise.

    The rule is evaluated with a vectorised lookup instead of a per-case
    ``groupby.apply``, so the result does not depend on how the installed
    pandas version treats grouping columns and group keys in ``apply``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event frame containing the case id column (``case_id_col``) and
        "NumberOfOffers". It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows in their input order, a fresh 0..n-1 index
        and the extra "treatment" column appended.
    """
    # "NumberOfOffers" as found on the first row of every case.
    offers_on_first_row = df.drop_duplicates(subset=case_id_col).set_index(
        case_id_col
    )["NumberOfOffers"]
    offers_of_case = df[case_id_col].map(offers_on_first_row)

    # At most one offer -> treated (T=1); more than one -> not treated (T=0).
    treatment = np.where(offers_of_case <= 1, "treat", "noTreat")
    return df.assign(treatment=treatment).reset_index(drop=True)


print("Add Treatment...")
# Derive the treatment flag from the per-case offer count, then display the result.
data_treat = add_treatment(df=data_offeres)
data_treat

Add Treatment...


Unnamed: 0,Case ID,NumberOfOffers,start_time,end_time,AMOUNT_REQ,REG_DATE,Activity,Resource,timesincemidnight,month,...,event_nr,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length,open_cases,label,treatment
0,173688,1,2011-09-30 22:38:44.546,2011-09-30T22:38:44.546,20000,2011-09-30T22:38:44.546Z,A_SUBMITTED,112.0,1358,9,...,1,0.000000,0.000000e+00,0.000000,0.000000e+00,17878.744667,18,1,regular,treat
1,173688,1,2011-09-30 22:38:44.880,2011-09-30T22:38:44.880,20000,2011-09-30T22:38:44.546Z,A_PARTLYSUBMITTED,112.0,1358,9,...,2,0.000004,5.522487e-07,0.000004,5.522487e-07,17878.739100,18,1,regular,treat
2,173688,1,2011-09-30 22:39:37.906,2011-09-30T22:39:37.906,20000,2011-09-30T22:38:44.546Z,A_PREACCEPTED,112.0,1359,9,...,3,0.000618,8.822751e-05,0.000614,8.767526e-05,17877.855333,18,1,regular,treat
3,173688,1,2011-10-01 09:36:46.437,2011-10-01T09:45:13.917,20000,2011-09-30T22:38:44.546Z,W_Completeren aanvraag,112.0,576,10,...,4,0.456966,6.528090e-02,0.456349,6.519268e-02,17220.713150,18,10,regular,treat
4,173688,1,2011-10-01 09:42:43.308,2011-10-01T09:42:43.308,20000,2011-09-30T22:38:44.546Z,A_ACCEPTED,10862.0,582,10,...,5,0.461097,6.587097e-02,0.004130,5.900645e-04,17214.765300,18,10,regular,treat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115120,214361,1,2012-03-07 08:01:59.877,2012-03-07T08:01:59.877,5000,2012-02-29T22:11:37.974Z,O_SENT_BACK,10899.0,481,3,...,14,6.409976,9.157108e-01,0.000773,1.104878e-04,8779.704183,18,276,deviant,treat
115121,214361,1,2012-03-12 15:12:08.468,2012-03-12T15:12:34.634,5000,2012-02-29T22:11:37.974Z,W_Valideren aanvraag,10809.0,912,3,...,15,11.708686,1.672669e+00,5.298711,7.569586e-01,1149.561000,18,85,deviant,treat
115122,214361,1,2012-03-13 10:03:00.346,2012-03-13T10:21:45.661,5000,2012-02-29T22:11:37.974Z,W_Valideren aanvraag,10138.0,603,3,...,16,12.494009,1.784858e+00,0.785323,1.121890e-01,18.696367,18,62,deviant,treat
115123,214361,1,2012-03-13 10:21:42.128,2012-03-13T10:21:42.128,5000,2012-02-29T22:11:37.974Z,A_DECLINED,10138.0,621,3,...,17,12.506993,1.786713e+00,0.000000,0.000000e+00,0.000000,18,61,deviant,treat


<IPython.core.display.Javascript object>

In [20]:
# Work on a copy so that re-running this cell starts from an unmodified data_treat.
data_all = data_treat.copy()
# impute missing values
print("impute missing values...")
grouped = data_all.sort_values(timestamp_col, ascending=True, kind="mergesort").groupby(
    case_id_col
)
for col in static_cols + dynamic_cols:
    if col == case_id_col:
        # The grouping key itself has nothing to forward-fill within its own group.
        continue
    # Forward-fill within each case in chronological order. The result is written
    # to data_all (the original wrote it to `data`, so the frame that gets saved
    # was never forward-filled); assignment re-aligns on the row index.
    data_all[col] = grouped[col].ffill()

# Whatever is still missing gets a sentinel: "missing" for categorical columns,
# 0 for everything else.
data_all[cat_cols] = data_all[cat_cols].fillna("missing")
data_all = data_all.fillna(0)

# set infrequent factor levels to "other"
for col in cat_cols:
    counts = data_all[col].value_counts()
    mask = data_all[col].isin(counts[counts >= freq_threshold].index)
    data_all.loc[~mask, col] = "other"

impute missing values...


<IPython.core.display.Javascript object>

In [23]:
import os

# Drop end_time before exporting; errors="ignore" keeps the cell re-runnable
# after the column has already been removed.
data_all = data_all.drop(columns=["end_time"], errors="ignore")
print("Saving csv file...")
results_dir = "./prepared_data/%s/" % dataset_name

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(results_dir, exist_ok=True)

# Constant boolean "event" column, True on every row.
data_all["event"] = True

data_all.to_csv(
    os.path.join(
        results_dir, "prepared_treatment_outcome_time_to_event_%s.csv" % dataset_name
    ),
    index=False,
    sep=";",
)

Saving csv file...


<IPython.core.display.Javascript object>