In [1]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
# --- Core event-log columns -------------------------------------------------
case_id_col, activity_col = "Case ID", "Activity"
resource_col, timestamp_col = "org:resource", "time:timestamp"

# --- Outcome (label) column and its two values ------------------------------
label_col = "label"
neg_label, pos_label = "regular", "deviant"  # negative outcome / positive outcome that will be predicted

# --- Treatment column and its two values ------------------------------------
treatment_col = "treatment"
pos_treatment, neg_treatment = "treat", "noTreat"  # do treatment / do not treat

# --- Classifier features -----------------------------------------------------
# Categorical attributes that can change from event to event.
dynamic_cat_cols = [
    activity_col,
    resource_col,
    "Action",
    "EventOrigin",
    "lifecycle:transition",
    "Accepted",
    "Selected",
]
# Categorical case attributes (static: no need to predict them in suffix predictions).
static_cat_cols = ["ApplicationType", "LoanGoal"]
# Numeric attributes that can change from event to event.
dynamic_num_cols = [
    "FirstWithdrawalAmount",
    "MonthlyCost",
    "NumberOfTerms",
    "OfferedAmount",
    "CreditScore",
    "timesincelastevent",
    "timesincecasestart",
    "timesincemidnight",
    "event_nr",
    "month",
    "weekday",
    "hour",
    "open_cases",
]
# Numeric case attributes (static: no need to predict them in suffix predictions).
static_num_cols = ["NumberOfOffers", "RequestedAmount"]

# --- Derived column groups ----------------------------------------------------
static_cols = [*static_cat_cols, *static_num_cols, case_id_col, label_col]
dynamic_cols = [*dynamic_cat_cols, *dynamic_num_cols, timestamp_col]
cat_cols = [*dynamic_cat_cols, *static_cat_cols]

# Categorical levels occurring fewer than this many times are later set to "other".
freq_threshold = 10

<IPython.core.display.Javascript object>

In [8]:
datasets = ["bpic2017"]
for dataset_name in datasets:
    # The bpic2017 file is read as semicolon-delimited; any other dataset as comma-delimited.
    sep = ";" if dataset_name == "bpic2017" else ","
    csv_path = "./data/%s/%s.csv" % (dataset_name, dataset_name)
    data = pd.read_csv(csv_path, sep=sep)

    # Strip leading/trailing whitespace from the values of every object-dtype column.
    text_cols = data.select_dtypes(["object"]).columns
    data[text_cols] = data[text_cols].apply(lambda col: col.str.strip())

<IPython.core.display.Javascript object>

In [9]:
# Preview the first rows of the freshly loaded event log.
data.head()

Unnamed: 0,ApplicationType,LoanGoal,RequestedAmount,Case ID,label,Activity,org:resource,Action,EventOrigin,lifecycle:transition,...,CreditScore,timesincelastevent,timesincecasestart,timesincemidnight,event_nr,month,weekday,hour,open_cases,time:timestamp
0,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Create Application,User_1,Created,Application,complete,...,0.0,0.0,0.0,591,1,1,4,9,0,2016-01-01 09:51:15.304
1,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Submitted,User_1,statechange,Application,complete,...,0.0,0.0008,0.0008,591,2,1,4,9,1,2016-01-01 09:51:15.352
2,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Created,Workflow,schedule,...,0.0,0.007033,0.007833,591,3,1,4,9,1,2016-01-01 09:51:15.774
3,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Deleted,Workflow,withdraw,...,0.0,1.343633,1.351467,592,4,1,4,9,1,2016-01-01 09:52:36.392
4,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Complete application,User_1,Created,Workflow,schedule,...,0.0,0.000183,1.35165,592,5,1,4,9,1,2016-01-01 09:52:36.403


<IPython.core.display.Javascript object>

In [10]:
# Parse the timestamps once; every statement below relies on datetime semantics.
data[timestamp_col] = pd.to_datetime(data[timestamp_col])

# Forward-fill missing resources within each case, in chronological order.
# GroupBy.ffill is used rather than fillna(method="ffill"), which pandas has
# deprecated. The stable sort keeps the fill order deterministic when several
# events share a timestamp. The result keeps the original row index, so the
# assignment aligns back onto the unsorted frame.
data[resource_col] = (
    data.sort_values(timestamp_col, ascending=True, kind="mergesort")
    .groupby(case_id_col)[resource_col]
    .ffill()
)

# Strip the "(case) " prefix from column names (returns a new frame instead of
# renaming in place).
data = data.rename(columns=lambda name: name.replace("(case) ", ""))

# Calendar features derived from the event timestamp.
data["timesincemidnight"] = (
    data[timestamp_col].dt.hour * 60 + data[timestamp_col].dt.minute
)  # whole minutes since midnight
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour

<IPython.core.display.Javascript object>

In [11]:
# Preview the first rows after the calendar columns were recomputed.
data.head()

Unnamed: 0,ApplicationType,LoanGoal,RequestedAmount,Case ID,label,Activity,org:resource,Action,EventOrigin,lifecycle:transition,...,CreditScore,timesincelastevent,timesincecasestart,timesincemidnight,event_nr,month,weekday,hour,open_cases,time:timestamp
0,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Create Application,User_1,Created,Application,complete,...,0.0,0.0,0.0,591,1,1,4,9,0,2016-01-01 09:51:15.304
1,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Submitted,User_1,statechange,Application,complete,...,0.0,0.0008,0.0008,591,2,1,4,9,1,2016-01-01 09:51:15.352
2,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Created,Workflow,schedule,...,0.0,0.007033,0.007833,591,3,1,4,9,1,2016-01-01 09:51:15.774
3,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Deleted,Workflow,withdraw,...,0.0,1.343633,1.351467,592,4,1,4,9,1,2016-01-01 09:52:36.392
4,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Complete application,User_1,Created,Workflow,schedule,...,0.0,0.000183,1.35165,592,5,1,4,9,1,2016-01-01 09:52:36.403


<IPython.core.display.Javascript object>

In [12]:
# Distinct activity names present in the event log.
set(data.Activity)

{'A_Accepted',
 'A_Cancelled',
 'A_Complete',
 'A_Concept',
 'A_Create Application',
 'A_Denied',
 'A_Incomplete',
 'A_Pending',
 'A_Submitted',
 'A_Validating',
 'O_Accepted',
 'O_Cancelled',
 'O_Create Offer',
 'O_Created',
 'O_Refused',
 'O_Returned',
 'O_Sent (mail and online)',
 'O_Sent (online only)',
 'W_Assess potential fraud',
 'W_Call after offers',
 'W_Call incomplete files',
 'W_Complete application',
 'W_Handle leads',
 'W_Personal Loan collection',
 'W_Shortened completion',
 'W_Validate application'}

<IPython.core.display.Javascript object>

In [13]:
def extract_timestamp_features_2(group):
    """Compute per-event timing features for the events of a single case.

    The input is never modified; a copy is enriched and returned.

    Added columns:
        timesincelastevent      minutes since the chronologically previous event
                                (0 for the earliest event)
        timesincecasestart      minutes since the earliest event of the case
        event_nr                1-based position in chronological order
        timesincecasestart_days / timesincecasestart_wks
        timesincelastevent_days / timesincelastevent_wks
                                the two durations above in days and weeks
        time_to_event_m         minutes from each event to the first
                                A_Pending / A_Denied / A_Cancelled event
                                (negative for events after it)
        case_length             number of events in the case

    Parameters
    ----------
    group : pandas.DataFrame
        Events of one case; must contain ``timestamp_col`` and ``activity_col``.

    Returns
    -------
    pandas.DataFrame or None
        The enriched events sorted chronologically, or ``None`` (after printing
        a notice) when the case has no A_Pending / A_Denied / A_Cancelled event.
    """
    terminal_activities = ("A_Pending", "A_Denied", "A_Cancelled")
    one_minute = pd.Timedelta(minutes=1)
    no_gap = pd.Timedelta(seconds=0)

    # Copy first: the caller (e.g. groupby.apply) may hand in a slice of a larger frame.
    group = group.copy()
    group[timestamp_col] = pd.to_datetime(group[timestamp_col])

    # Newest event first, so shift(-1) yields the chronologically previous event
    # and iloc[-1] the earliest one.
    group = group.sort_values(timestamp_col, ascending=False, kind="mergesort")

    since_previous = group[timestamp_col] - group[timestamp_col].shift(-1)
    group["timesincelastevent"] = since_previous.fillna(no_gap) / one_minute

    since_start = group[timestamp_col] - group[timestamp_col].iloc[-1]
    group["timesincecasestart"] = since_start.fillna(no_gap) / one_minute

    # Back to chronological order for numbering.
    group = group.sort_values(timestamp_col, ascending=True, kind="mergesort")
    group["event_nr"] = range(1, len(group) + 1)

    # 1440 minutes per day, 7 days per week.
    group["timesincecasestart_days"] = group["timesincecasestart"] / 1440
    group["timesincecasestart_wks"] = group["timesincecasestart_days"] / 7

    group["timesincelastevent_days"] = group["timesincelastevent"] / 1440
    group["timesincelastevent_wks"] = group["timesincelastevent_days"] / 7

    # Only the "no terminal activity" situation is treated as skippable; any
    # other error (missing column, bad dtype, ...) now surfaces instead of being
    # swallowed by a bare except.
    terminal_times = group.loc[
        group[activity_col].isin(terminal_activities), timestamp_col
    ]
    if terminal_times.empty:
        print(f"False, continue, {set(group[case_id_col])}")
        return None

    # First terminal event in chronological order; iloc keeps the pandas
    # Timestamp (including any timezone) intact.
    last_e = terminal_times.iloc[0]
    minutes_to_event = (last_e - group[timestamp_col]).dt.total_seconds() / 60.0
    # The column is numeric, so missing values are filled with a number.
    group["time_to_event_m"] = minutes_to_event.fillna(0.0)

    group["case_length"] = group["event_nr"].max()

    return group


def get_open_cases(date, dt_first_last_timestamps):
    """Count the cases that are open at ``date``.

    A case counts as open when ``start_time <= date < end_time``.

    Parameters
    ----------
    date : timestamp-like
        Point in time to evaluate.
    dt_first_last_timestamps : pandas.DataFrame
        One row per case with ``start_time`` and ``end_time`` columns.

    Returns
    -------
    int
        Number of rows whose interval contains ``date``.
    """
    is_open = (dt_first_last_timestamps["start_time"] <= date) & (
        dt_first_last_timestamps["end_time"] > date
    )
    # Series.sum is vectorised; the builtin sum would iterate element by element.
    return int(is_open.sum())


<IPython.core.display.Javascript object>

In [14]:
print("Extracting timestamp features...")
data = data.groupby(case_id_col).apply(extract_timestamp_features_2)

print("Extracting open cases...")
data = data.sort_values([timestamp_col], ascending=True, kind="mergesort").reset_index(
    drop=True
)
# First and last event time of every case. String aggregation names are used
# because passing the builtins min/max to agg is deprecated in pandas.
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg(["min", "max"])
dt_first_last_timestamps.columns = ["start_time", "end_time"]

# A case is open at time t when start_time <= t < end_time. Because
# start_time <= end_time for every case, that count equals
#   #(start_time <= t) - #(end_time <= t),
# and both terms are binary searches on sorted arrays. This gives the same
# numbers as calling get_open_cases for every event, without comparing each
# event against every case.
event_times = data[timestamp_col].to_numpy()
sorted_starts = np.sort(dt_first_last_timestamps["start_time"].to_numpy())
sorted_ends = np.sort(dt_first_last_timestamps["end_time"].to_numpy())
data["open_cases"] = np.searchsorted(
    sorted_starts, event_times, side="right"
) - np.searchsorted(sorted_ends, event_times, side="right")

Extracting timestamp features...
False, continue, {'Application_174895313'}
False, continue, {'Application_2103458006'}
Extracting open cases...


<IPython.core.display.Javascript object>

In [15]:
# Stable (mergesort) chronological sort: rows sharing a timestamp keep their relative order.
data = data.sort_values(timestamp_col, ascending=True, kind="mergesort")
data.head()

Unnamed: 0,ApplicationType,LoanGoal,RequestedAmount,Case ID,label,Activity,org:resource,Action,EventOrigin,lifecycle:transition,...,weekday,hour,open_cases,time:timestamp,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length
0,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Create Application,User_1,Created,Application,complete,...,4,9,1,2016-01-01 09:51:15.304,0.0,0.0,0.0,0.0,19077.935267,40
1,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,A_Submitted,User_1,statechange,Application,complete,...,4,9,1,2016-01-01 09:51:15.352,5.555556e-07,7.936508e-08,5.555556e-07,7.936508e-08,19077.934467,40
2,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Created,Workflow,schedule,...,4,9,1,2016-01-01 09:51:15.774,5.439815e-06,7.771164e-07,4.884259e-06,6.977513e-07,19077.927433,40
3,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Handle leads,User_1,Deleted,Workflow,withdraw,...,4,9,1,2016-01-01 09:52:36.392,0.0009385185,0.0001340741,0.0009330787,0.000133297,19076.5838,40
4,New credit,Existing loan takeover,20000.0,Application_652823628,deviant,W_Complete application,User_1,Created,Workflow,schedule,...,4,9,1,2016-01-01 09:52:36.403,0.0009386458,0.0001340923,1.273148e-07,1.818783e-08,19076.583617,40


<IPython.core.display.Javascript object>

In [16]:
def add_lable(group):
    """Attach the case-level outcome label to every event of one case.

    * The case contains ``A_Pending``: label is ``neg_label``.
    * Otherwise it contains ``A_Denied`` or ``A_Cancelled``: label is
      ``pos_label`` (the outcome that will be predicted).
    * Otherwise the case has no outcome activity: a notice is printed and
      ``None`` is returned.

    ``A_Pending`` takes precedence when both kinds of activity occur.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of one case; must contain ``activity_col``.

    Returns
    -------
    pandas.DataFrame or None
        A new frame with ``label_col`` set (the input is not modified), or
        ``None`` for a case without an outcome activity.
    """
    regular_activities = ["A_Pending"]
    deviant_activities = ["A_Denied", "A_Cancelled"]

    activities = group[activity_col]
    if activities.isin(regular_activities).any():
        # assign() returns a new frame, so the groupby slice is never written to.
        return group.assign(**{label_col: neg_label})
    if activities.isin(deviant_activities).any():
        return group.assign(**{label_col: pos_label})

    print("\nNot Complete\n")
    return None


# Label every case. pandas leaves out groups for which add_lable returns None,
# and the index is rebuilt afterwards.
print("Add Label ...")
data = data.groupby(case_id_col, as_index=False).apply(add_lable).reset_index(drop=True)
data

Add Label ...


Unnamed: 0,ApplicationType,LoanGoal,RequestedAmount,Case ID,label,Activity,org:resource,Action,EventOrigin,lifecycle:transition,...,weekday,hour,open_cases,time:timestamp,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length
0,New credit,Existing loan takeover,20000.0,Application_652823628,regular,A_Create Application,User_1,Created,Application,complete,...,4,9,1,2016-01-01 09:51:15.304,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,19077.935267,40
1,New credit,Existing loan takeover,20000.0,Application_652823628,regular,A_Submitted,User_1,statechange,Application,complete,...,4,9,1,2016-01-01 09:51:15.352,5.555556e-07,7.936508e-08,5.555556e-07,7.936508e-08,19077.934467,40
2,New credit,Existing loan takeover,20000.0,Application_652823628,regular,W_Handle leads,User_1,Created,Workflow,schedule,...,4,9,1,2016-01-01 09:51:15.774,5.439815e-06,7.771164e-07,4.884259e-06,6.977513e-07,19077.927433,40
3,New credit,Existing loan takeover,20000.0,Application_652823628,regular,W_Handle leads,User_1,Deleted,Workflow,withdraw,...,4,9,1,2016-01-01 09:52:36.392,9.385185e-04,1.340741e-04,9.330787e-04,1.332970e-04,19076.583800,40
4,New credit,Existing loan takeover,20000.0,Application_652823628,regular,W_Complete application,User_1,Created,Workflow,schedule,...,4,9,1,2016-01-01 09:52:36.403,9.386458e-04,1.340923e-04,1.273148e-07,1.818783e-08,19076.583617,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198314,New credit,Car,20000.0,Application_1836826811,regular,O_Cancelled,User_134,statechange,Offer,complete,...,2,10,1,2017-02-01 10:15:04.991,3.462904e+01,4.947005e+00,6.944444e-08,9.920635e-09,-0.000433,124
1198315,New credit,Existing loan takeover,25250.0,Application_1433215421,deviant,A_Denied,User_90,statechange,Application,complete,...,2,10,1,2017-02-01 10:46:32.670,4.184302e+01,5.977574e+00,6.276365e-02,8.966235e-03,0.000000,37
1198316,New credit,Existing loan takeover,25250.0,Application_1433215421,deviant,O_Refused,User_90,statechange,Offer,complete,...,2,10,1,2017-02-01 10:46:32.711,4.184302e+01,5.977574e+00,4.745370e-07,6.779101e-08,-0.000683,37
1198317,New credit,Existing loan takeover,25250.0,Application_1433215421,deviant,O_Refused,User_90,statechange,Offer,complete,...,2,10,1,2017-02-01 10:46:32.722,4.184302e+01,5.977574e+00,1.273148e-07,1.818783e-08,-0.000867,37


<IPython.core.display.Javascript object>

In [17]:
# Distinct activity names present after the labelling step.
set(data.Activity)

{'A_Accepted',
 'A_Cancelled',
 'A_Complete',
 'A_Concept',
 'A_Create Application',
 'A_Denied',
 'A_Incomplete',
 'A_Pending',
 'A_Submitted',
 'A_Validating',
 'O_Accepted',
 'O_Cancelled',
 'O_Create Offer',
 'O_Created',
 'O_Refused',
 'O_Returned',
 'O_Sent (mail and online)',
 'O_Sent (online only)',
 'W_Assess potential fraud',
 'W_Call after offers',
 'W_Call incomplete files',
 'W_Complete application',
 'W_Handle leads',
 'W_Personal Loan collection',
 'W_Shortened completion',
 'W_Validate application'}

<IPython.core.display.Javascript object>

In [18]:
def add_Offere(group):
    """Add a per-case ``NumberOfOffers`` column counting ``O_Created`` events.

    The counts are joined back onto the events with ``pd.merge`` (default inner
    join), so the result starts with the ``case_id_col`` and ``NumberOfOffers``
    columns, and cases without any ``O_Created`` event do not appear in it.

    Parameters
    ----------
    group : pandas.DataFrame
        Event log containing ``case_id_col`` and ``activity_col``.

    Returns
    -------
    pandas.DataFrame
        The events of every case with at least one offer, plus its offer count.
    """
    is_offer_creation = group[activity_col] == "O_Created"
    # One row per case: how many O_Created events it has.
    offers_per_case = (
        group[is_offer_creation]
        .groupby([case_id_col])[activity_col]
        .count()
        .reset_index(name="NumberOfOffers")
    )
    return pd.merge(offers_per_case, group, on=case_id_col)


# Count the O_Created events of every case and attach the count as NumberOfOffers.
print("Add no of offeres ...")
data_offeres = add_Offere(data)
data_offeres

Add no of offeres ...


Unnamed: 0,Case ID,NumberOfOffers,ApplicationType,LoanGoal,RequestedAmount,label,Activity,org:resource,Action,EventOrigin,...,weekday,hour,open_cases,time:timestamp,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length
0,Application_1000086665,1,New credit,Other,5000.0,deviant,A_Create Application,User_1,Created,Application,...,2,15,2250,2016-08-03 15:57:21.673,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,46923.250617,22
1,Application_1000086665,1,New credit,Other,5000.0,deviant,A_Submitted,User_1,statechange,Application,...,2,15,2250,2016-08-03 15:57:21.734,7.060185e-07,1.008598e-07,7.060185e-07,1.008598e-07,46923.249600,22
2,Application_1000086665,1,New credit,Other,5000.0,deviant,W_Handle leads,User_1,Created,Workflow,...,2,15,2250,2016-08-03 15:57:21.963,3.356481e-06,4.794974e-07,2.650463e-06,3.786376e-07,46923.245783,22
3,Application_1000086665,1,New credit,Other,5000.0,deviant,W_Handle leads,User_1,Deleted,Workflow,...,2,15,2250,2016-08-03 15:58:28.286,7.709838e-04,1.101405e-04,7.676273e-04,1.096610e-04,46922.140400,22
4,Application_1000086665,1,New credit,Other,5000.0,deviant,W_Complete application,User_1,Created,Workflow,...,2,15,2250,2016-08-03 15:58:28.293,7.710648e-04,1.101521e-04,8.101852e-08,1.157407e-08,46922.140283,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198314,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,W_Call incomplete files,User_41,Obtained,Workflow,...,3,8,2320,2016-10-20 08:19:28.812,1.376556e+01,1.966509e+00,8.158991e-01,1.165570e-01,5645.020783,35
1198315,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,W_Call incomplete files,User_41,Released,Workflow,...,3,8,2321,2016-10-20 08:21:59.667,1.376731e+01,1.966759e+00,1.746007e-03,2.494296e-04,5642.506533,35
1198316,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,O_Accepted,User_68,statechange,Offer,...,0,6,2106,2016-10-24 06:24:30.056,1.768572e+01,2.526531e+00,3.918407e+00,5.597725e-01,0.000050,35
1198317,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,A_Pending,User_68,statechange,Application,...,0,6,2106,2016-10-24 06:24:30.059,1.768572e+01,2.526531e+00,3.472222e-08,4.960317e-09,0.000000,35


<IPython.core.display.Javascript object>

In [19]:
# Distinct per-case offer counts.
set(data_offeres.NumberOfOffers)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

<IPython.core.display.Javascript object>

In [20]:
# Derive the treatment flag of every case from its number of offers.
def add_treatment(df):
    """Add the treatment column based on each case's ``NumberOfOffers``.

    The decision uses the ``NumberOfOffers`` value of the first row of each
    case: a value ``<= 1`` gives ``pos_treatment``, anything else (including a
    missing value) gives ``neg_treatment``. Every row of the case receives the
    same flag.

    The flag is computed with vectorised operations instead of a per-case
    ``groupby.apply`` over the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log containing ``case_id_col`` and ``NumberOfOffers``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in its input row order, with ``treatment_col`` added
        and a fresh 0..n-1 index. The input frame is not modified.
    """
    # NumberOfOffers of the first row of every case, broadcast to all its rows.
    first_rows = df.drop_duplicates(subset=case_id_col, keep="first")
    offers_by_case = first_rows.set_index(case_id_col)["NumberOfOffers"]
    offers_per_row = df[case_id_col].map(offers_by_case)

    # Cases with at most one offer are treated; all others are not.
    is_treated = (offers_per_row <= 1).to_numpy()

    out = df.copy()
    out[treatment_col] = np.where(is_treated, pos_treatment, neg_treatment)
    return out.reset_index(drop=True)


# Assign the per-case treatment flag derived from NumberOfOffers.
print("Add Treatment...")
data_treat = add_treatment(data_offeres)
data_treat

Add Treatment...


Unnamed: 0,Case ID,NumberOfOffers,ApplicationType,LoanGoal,RequestedAmount,label,Activity,org:resource,Action,EventOrigin,...,hour,open_cases,time:timestamp,timesincecasestart_days,timesincecasestart_wks,timesincelastevent_days,timesincelastevent_wks,time_to_event_m,case_length,treatment
0,Application_1000086665,1,New credit,Other,5000.0,deviant,A_Create Application,User_1,Created,Application,...,15,2250,2016-08-03 15:57:21.673,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,46923.250617,22,treat
1,Application_1000086665,1,New credit,Other,5000.0,deviant,A_Submitted,User_1,statechange,Application,...,15,2250,2016-08-03 15:57:21.734,7.060185e-07,1.008598e-07,7.060185e-07,1.008598e-07,46923.249600,22,treat
2,Application_1000086665,1,New credit,Other,5000.0,deviant,W_Handle leads,User_1,Created,Workflow,...,15,2250,2016-08-03 15:57:21.963,3.356481e-06,4.794974e-07,2.650463e-06,3.786376e-07,46923.245783,22,treat
3,Application_1000086665,1,New credit,Other,5000.0,deviant,W_Handle leads,User_1,Deleted,Workflow,...,15,2250,2016-08-03 15:58:28.286,7.709838e-04,1.101405e-04,7.676273e-04,1.096610e-04,46922.140400,22,treat
4,Application_1000086665,1,New credit,Other,5000.0,deviant,W_Complete application,User_1,Created,Workflow,...,15,2250,2016-08-03 15:58:28.293,7.710648e-04,1.101521e-04,8.101852e-08,1.157407e-08,46922.140283,22,treat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198314,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,W_Call incomplete files,User_41,Obtained,Workflow,...,8,2320,2016-10-20 08:19:28.812,1.376556e+01,1.966509e+00,8.158991e-01,1.165570e-01,5645.020783,35,treat
1198315,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,W_Call incomplete files,User_41,Released,Workflow,...,8,2321,2016-10-20 08:21:59.667,1.376731e+01,1.966759e+00,1.746007e-03,2.494296e-04,5642.506533,35,treat
1198316,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,O_Accepted,User_68,statechange,Offer,...,6,2106,2016-10-24 06:24:30.056,1.768572e+01,2.526531e+00,3.918407e+00,5.597725e-01,0.000050,35,treat
1198317,Application_999993812,1,New credit,Caravan / Camper,30000.0,regular,A_Pending,User_68,statechange,Application,...,6,2106,2016-10-24 06:24:30.059,1.768572e+01,2.526531e+00,3.472222e-08,4.960317e-09,0.000000,35,treat


<IPython.core.display.Javascript object>

In [21]:
# Work on a copy so that the imputation below does not write through to
# data_treat and the cell can be re-run safely.
data_all = data_treat.copy()

# impute missing values
print("impute missing values...")
grouped = data_all.sort_values(timestamp_col, ascending=True, kind="mergesort").groupby(
    case_id_col
)
# Forward-fill every feature column within its case, in chronological order.
# The result is written to data_all (the frame the groups were built from);
# GroupBy.ffill keeps the row index, so the assignment aligns row by row.
for col in static_cols + dynamic_cols:
    if col == case_id_col:
        continue  # the grouping key itself has nothing to fill
    data_all[col] = grouped[col].ffill()

# Whatever is still missing becomes "missing" (categorical) or 0 (everything else).
data_all[cat_cols] = data_all[cat_cols].fillna("missing")
data_all = data_all.fillna(0)

# set infrequent factor levels to "other"
for col in cat_cols:
    counts = data_all[col].value_counts()
    frequent_levels = counts[counts >= freq_threshold].index
    is_frequent = data_all[col].isin(frequent_levels)
    data_all.loc[~is_frequent, col] = "other"

impute missing values...


<IPython.core.display.Javascript object>

In [24]:
# Column names of data_all after imputation.
data_all.columns

Index(['Case ID', 'NumberOfOffers', 'ApplicationType', 'LoanGoal',
       'RequestedAmount', 'label', 'Activity', 'org:resource', 'Action',
       'EventOrigin', 'lifecycle:transition', 'Accepted', 'Selected',
       'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms',
       'OfferedAmount', 'CreditScore', 'timesincelastevent',
       'timesincecasestart', 'timesincemidnight', 'event_nr', 'month',
       'weekday', 'hour', 'open_cases', 'time:timestamp',
       'timesincecasestart_days', 'timesincecasestart_wks',
       'timesincelastevent_days', 'timesincelastevent_wks', 'time_to_event_m',
       'case_length', 'treatment'],
      dtype='object')

<IPython.core.display.Javascript object>

In [25]:
import os

# Drop the end_time helper column if it exists; errors="ignore" covers the
# "column absent" case without a bare except hiding unrelated failures.
data_all = data_all.drop(columns=["end_time"], errors="ignore")

print("Saving csv file...")
# dataset_name is the loop variable left over from the loading cell.
results_dir = "./prepared_data/%s/" % dataset_name
# exist_ok=True avoids the check-then-create race of exists() + makedirs().
os.makedirs(results_dir, exist_ok=True)

# Constant boolean flag written with every row.
# NOTE(review): presumably the event-observed indicator for time-to-event
# models - confirm with the consumer of this file.
data_all["event"] = True

data_all.to_csv(
    os.path.join(
        results_dir, "prepared_treatment_outcome_time_to_event_%s.csv" % dataset_name
    ),
    index=False,
    sep=";",
)

Saving csv file...


<IPython.core.display.Javascript object>