## **Libraries**

In [None]:
# Import libraries
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the figures drawn in this notebook
sns.set()

from sklearn.model_selection import train_test_split

## **PD Model**

### **Dependent Variable**

In [None]:
# Read the preprocessed loan records from the "data/processed" folder one level up
processed_file = os.path.join("..", "data", "processed", "loan_data_0714.csv")
loan_data = pd.read_csv(processed_file)

In [None]:
# List the distinct values found in the "loan_status" column
loan_data.loc[:, "loan_status"].unique()

In [None]:
# Count how many observations carry each "loan_status" value
loan_data.loc[:, "loan_status"].value_counts()

In [None]:
# Share of observations per "loan_status" value: each count divided by the number of non-missing statuses
status = loan_data["loan_status"]
status.value_counts().div(status.count())

In [None]:
# Build the binary target: 0 for loans whose status is one of the "bad" statuses below, 1 for all others
bad_statuses = [
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off",
    "Late (31-120 days)",
]
is_bad = loan_data["loan_status"].isin(bad_statuses)
loan_data["loan_performance"] = np.where(is_bad, 0, 1)
loan_data["loan_performance"].head()

### **Splitting Data**

In [None]:
# Separate the target from the remaining columns, then hold out 20% of the rows as a test set
model_inputs = loan_data.drop("loan_performance", axis = 1)
model_targets = loan_data["loan_performance"]
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    model_inputs,
    model_targets,
    test_size = 0.2,
    random_state = 42,  # fixed seed keeps the split reproducible
)
# Report the shape of each of the four pieces
print(f"Inputs (Train): {inputs_train.shape}")
print(f"Target (Train): {targets_train.shape}\n")
print(f"Inputs (Test): {inputs_test.shape}")
print(f"Target (Test): {targets_test.shape}")

### **Discrete Variable**

In [None]:
# Define a function to calculate the Weight of Evidence (WoE) for a discrete variable
def woe_discrete(df, variable_name, df_good_bad):
    """Build a Weight of Evidence (WoE) table for a discrete variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``variable_name``.
    variable_name : str
        Name of the discrete column whose categories are evaluated.
    df_good_bad : pandas.Series or single-column pandas.DataFrame
        Binary target sharing the index of ``df``; its mean per category is
        used as the proportion of "good" cases (1 = good, 0 = bad).

    Returns
    -------
    pandas.DataFrame
        One row per category, sorted by ascending WoE, with the columns
        ``<variable_name>``, ``n_obs``, ``prop_good`` (share of all good cases
        that fall in the category), ``prop_obs``, ``n_good``, ``n_bad``,
        ``prop_bad``, ``WoE``, ``diff_prop_good``, ``diff_WoE`` and ``IV``
        (the same total repeated on every row).

    Notes
    -----
    A category without any good or any bad case has an infinite WoE. Such
    categories keep their infinite WoE in the table but are left out of the
    Information Value sum, so a single degenerate category no longer turns
    the IV of the whole variable into ``inf`` or ``NaN``.
    """
    # Put the variable and the target side by side (aligned on the index)
    paired = pd.concat([df[variable_name], df_good_bad], axis = 1)
    category_col, target_col = paired.columns.values[0], paired.columns.values[1]
    # A single grouped pass gives the number of observations and the good rate per category
    df = paired.groupby(category_col)[target_col].agg(["count", "mean"]).reset_index()
    df.columns = [category_col, "n_obs", "prop_good"]
    # Calculate the proportion of observations for each category
    df["prop_obs"] = df["n_obs"] / df["n_obs"].sum()
    # Compute the number of "good" cases and "bad" cases for each category
    df["n_good"] = df["prop_good"] * df["n_obs"]
    df["n_bad"] = (1 - df["prop_good"]) * df["n_obs"]
    # From here on "prop_good"/"prop_bad" hold the share of all good/bad cases found in the category
    df["prop_good"] = df["n_good"] / df["n_good"].sum()
    df["prop_bad"] = df["n_bad"] / df["n_bad"].sum()
    # Compute the Weight of Evidence (WoE); log(0) is expected for degenerate categories
    with np.errstate(divide = "ignore"):
        df["WoE"] = np.log(df["prop_good"] / df["prop_bad"])
    # Sort the categories by WoE and renumber the rows
    df = df.sort_values(["WoE"]).reset_index(drop = True)
    # Calculate the absolute difference in "prop_good" and "WoE" between consecutive categories
    df["diff_prop_good"] = df["prop_good"].diff().abs()
    df["diff_WoE"] = df["WoE"].diff().abs()
    # Compute the Information Value (IV) of the variable from the categories with a finite WoE
    iv_terms = (df["prop_good"] - df["prop_bad"]) * df["WoE"]
    df["IV"] = iv_terms[np.isfinite(df["WoE"])].sum()
    return df

In [None]:
# Define a function that plots the Weight of Evidence of every category
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
    """Plot WoE against the categories of a WoE table.

    Parameters
    ----------
    df_WoE : pandas.DataFrame
        Table whose first column holds the categories and which has a "WoE" column.
    rotation_of_x_axis_labels : int, optional
        Rotation, in degrees, applied to the x-axis tick labels (default 0).
    """
    category_col = df_WoE.columns[0]
    # The categories are converted to text so that they are drawn as labels
    category_labels = np.asarray(df_WoE.iloc[:, 0].apply(str))
    woe_values = df_WoE["WoE"]
    plt.figure(figsize = (12, 3))
    sns.set_style("white")
    # Dashed black line with a circle marker on every category
    plt.plot(category_labels, woe_values, marker = "o", linestyle = "--", color = "k")
    plt.xlabel(category_col, fontsize = 12)
    plt.ylabel("Weight of Evidence", fontsize = 12)
    plt.xticks(rotation = rotation_of_x_axis_labels)

In [None]:
# Create the necessary arguments
# Work on a copy: the cells below add many columns to "df_inputs", and an alias would
# silently modify the frame returned by "train_test_split" at the same time
df_inputs = inputs_train.copy()
df_targets = targets_train

#### **Grade**

In [None]:
# Variable: "grade" - WoE table and plot
df_temp = woe_discrete(df = df_inputs, variable_name = "grade", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

#### **Home Ownership**

In [None]:
# Variable: "home_ownership" - WoE table and plot
df_temp = woe_discrete(df = df_inputs, variable_name = "home_ownership", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

In [None]:
# Merge the RENT, OTHER, NONE and ANY dummies into one combined category
merged_ownership = ["RENT", "OTHER", "NONE", "ANY"]
df_inputs["home_ownership:" + "_".join(merged_ownership)] = sum(
    df_inputs["home_ownership:" + category] for category in merged_ownership
)

#### **State Address**

In [None]:
# Variable: "addr_state" - WoE table and plot
df_temp = woe_discrete(df = df_inputs, variable_name = "addr_state", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

In [None]:
# Make sure the "addr_state:ND" dummy exists; when it is absent, add it as an all-zero column
# so that the state grouping below can always reference it
if "addr_state:ND" not in df_inputs.columns:
    df_inputs["addr_state:ND"] = 0

In [None]:
# Plot the WoE values again without the first two and the last two rows of the table
plot_by_woe(df_WoE = df_temp.iloc[2:-2])

In [None]:
# Group the state dummies into coarser categories based on their WoE values;
# "addr_state:ND_NE_IA_NV_FL_HI_AL" is the first group and the intended reference category.
# Each new column is named after its member states joined with "_" and holds the sum of their dummies.
state_groups = [
    ["ND", "NE", "IA", "NV", "FL", "HI", "AL"],
    ["NM", "VA"],
    ["OK", "TN", "MO", "LA", "MD", "NC"],
    ["UT", "KY", "AZ", "NJ"],
    ["AR", "MI", "PA", "OH", "MN"],
    ["RI", "MA", "DE", "SD", "IN"],
    ["GA", "WA", "OR"],
    ["WI", "MT"],
    ["IL", "CT"],
    ["KS", "SC", "CO", "VT", "AK", "MS"],
    ["WV", "NH", "WY", "DC", "ME", "ID"],
]
for states in state_groups:
    df_inputs["addr_state:" + "_".join(states)] = sum(
        df_inputs["addr_state:" + state] for state in states
    )

#### **Verification Status**

In [None]:
# Variable: "verification_status" - WoE table and plot
df_temp = woe_discrete(df = df_inputs, variable_name = "verification_status", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

#### **Purpose**

In [None]:
# Variable: "purpose" - WoE table and plot (tick labels rotated by 90 degrees)
df_temp = woe_discrete(df = df_inputs, variable_name = "purpose", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp, rotation_of_x_axis_labels = 90)

In [None]:
# Group the purpose dummies by WoE; "purpose:e_sb_w_re_m_h" is the intended reference category.
# Each new column holds the sum of the dummies of its member purposes.
purpose_groups = {
    "e_sb_w_re_m_h": ["educational", "small_business", "wedding", "renewable_energy", "moving", "house"],
    "o_m_v": ["other", "medical", "vacation"],
    "mp_c_hi": ["major_purchase", "car", "home_improvement"],
}
for group_label, purposes in purpose_groups.items():
    df_inputs["purpose:" + group_label] = sum(
        df_inputs["purpose:" + purpose] for purpose in purposes
    )

#### **List Status**

In [None]:
# Variable: "initial_list_status" - WoE table and plot
df_temp = woe_discrete(df = df_inputs, variable_name = "initial_list_status", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

### **Continuous Variable**

In [None]:
# Define a function for ordered discrete and continuous variables
def woe_ordered_continuous(df, variable_name, df_good_bad):
    """Build a Weight of Evidence (WoE) table for an ordered or binned variable.

    The categories are kept in the order produced by the grouping (they are
    not re-sorted by WoE), so the consecutive differences follow the natural
    order of the variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``variable_name``.
    variable_name : str
        Name of the ordered column (numeric values or bins) to evaluate.
    df_good_bad : pandas.Series or single-column pandas.DataFrame
        Binary target sharing the index of ``df``; its mean per category is
        used as the proportion of "good" cases (1 = good, 0 = bad).

    Returns
    -------
    pandas.DataFrame
        One row per category with the columns ``<variable_name>``, ``n_obs``,
        ``prop_good`` (share of all good cases that fall in the category),
        ``prop_obs``, ``n_good``, ``n_bad``, ``prop_bad``, ``WoE``,
        ``diff_prop_good``, ``diff_WoE`` and ``IV`` (the same total repeated
        on every row).

    Notes
    -----
    A category without any good or any bad case has an infinite WoE. Such
    categories keep their infinite WoE in the table but are left out of the
    Information Value sum, so a single degenerate category no longer turns
    the IV of the whole variable into ``inf`` or ``NaN``.
    """
    # Put the variable and the target side by side (aligned on the index)
    paired = pd.concat([df[variable_name], df_good_bad], axis = 1)
    category_col, target_col = paired.columns.values[0], paired.columns.values[1]
    # A single grouped pass gives the number of observations and the good rate per category
    df = paired.groupby(category_col)[target_col].agg(["count", "mean"]).reset_index()
    df.columns = [category_col, "n_obs", "prop_good"]
    # Calculate the proportion of observations for each category
    df["prop_obs"] = df["n_obs"] / df["n_obs"].sum()
    # Compute the number of "good" cases and "bad" cases for each category
    df["n_good"] = df["prop_good"] * df["n_obs"]
    df["n_bad"] = (1 - df["prop_good"]) * df["n_obs"]
    # From here on "prop_good"/"prop_bad" hold the share of all good/bad cases found in the category
    df["prop_good"] = df["n_good"] / df["n_good"].sum()
    df["prop_bad"] = df["n_bad"] / df["n_bad"].sum()
    # Compute the Weight of Evidence (WoE); log(0) is expected for degenerate categories
    with np.errstate(divide = "ignore"):
        df["WoE"] = np.log(df["prop_good"] / df["prop_bad"])
    # Calculate the absolute difference in "prop_good" and "WoE" between consecutive categories
    df["diff_prop_good"] = df["prop_good"].diff().abs()
    df["diff_WoE"] = df["WoE"].diff().abs()
    # Compute the Information Value (IV) of the variable from the categories with a finite WoE
    iv_terms = (df["prop_good"] - df["prop_bad"]) * df["WoE"]
    df["IV"] = iv_terms[np.isfinite(df["WoE"])].sum()
    return df

#### **Term**

In [None]:
# Variable: "term_int" - WoE table and plot
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "term_int", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

In [None]:
# One dummy per loan term; "60" is the intended reference category
for term_months in (36, 60):
    df_inputs[f"term:{term_months}"] = np.where(df_inputs["term_int"] == term_months, 1, 0)

#### **Employment Length**

In [None]:
# Variable: "emp_length" - WoE table and plot
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "emp_length", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

In [None]:
# Dummy columns for the employment-length groups; "0" is the intended reference category
emp_length_groups = {
    "0": [0],
    "1": [1],
    "2-4": range(2, 5),
    "5-6": range(5, 7),
    "7-9": range(7, 10),
    "10": [10],
}
for label, years in emp_length_groups.items():
    df_inputs["emp_length:" + label] = np.where(df_inputs["emp_length"].isin(years), 1, 0)

#### **Issue Months**

In [None]:
# Variable: "mths_issue_d"
# Fine-classing: "cut" splits the variable into 50 equal-width intervals
df_inputs["mths_issue_d_factor"] = pd.cut(x = df_inputs["mths_issue_d"], bins = 50)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "mths_issue_d_factor", df_good_bad = df_targets)
# The first three intervals are left out of the plot
plot_by_woe(df_WoE = df_temp.iloc[3:], rotation_of_x_axis_labels = 90)

In [None]:
# Create the following categories:
df_inputs["mths_issue_d:<38"] = np.where(df_inputs["mths_issue_d"].isin(range(38)), 1, 0)
df_inputs["mths_issue_d:38-39"] = np.where(df_inputs["mths_issue_d"].isin(range(38, 40)), 1, 0)
df_inputs["mths_issue_d:40-41"] = np.where(df_inputs["mths_issue_d"].isin(range(40, 42)), 1, 0)
df_inputs["mths_issue_d:42-48"] = np.where(df_inputs["mths_issue_d"].isin(range(42, 49)), 1, 0)
df_inputs["mths_issue_d:49-52"] = np.where(df_inputs["mths_issue_d"].isin(range(49, 53)), 1, 0)
df_inputs["mths_issue_d:53-64"] = np.where(df_inputs["mths_issue_d"].isin(range(53, 65)), 1, 0)
df_inputs["mths_issue_d:65-84"] = np.where(df_inputs["mths_issue_d"].isin(range(65, 85)), 1, 0)
# Open-ended top category: a comparison avoids the previous hard-coded upper limit of 124 months,
# which left any larger value without a category
df_inputs["mths_issue_d:>84"] = np.where((df_inputs["mths_issue_d"] > 84), 1, 0)

#### **Interest Rate**

In [None]:
# Variable: "int_rate"
# Fine-classing: "cut" splits the variable into 50 equal-width intervals
df_inputs["int_rate_factor"] = pd.cut(x = df_inputs["int_rate"], bins = 50)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "int_rate_factor", df_good_bad = df_targets)
# The first three intervals are left out of the plot
plot_by_woe(df_WoE = df_temp.iloc[3:], rotation_of_x_axis_labels = 90)

In [None]:
# Dummy columns for the interest-rate bands; every band includes its upper edge
int_rate_edges = [9.548, 12.025, 15.74, 20.281]
int_rate = df_inputs["int_rate"]
df_inputs[f"int_rate:<{int_rate_edges[0]}"] = np.where(int_rate.le(int_rate_edges[0]), 1, 0)
for lower, upper in zip(int_rate_edges[:-1], int_rate_edges[1:]):
    df_inputs[f"int_rate:{lower}-{upper}"] = np.where(int_rate.gt(lower) & int_rate.le(upper), 1, 0)
df_inputs[f"int_rate:>{int_rate_edges[-1]}"] = np.where(int_rate.gt(int_rate_edges[-1]), 1, 0)

#### **Funded Amount**

In [None]:
# Variable: "funded_amnt"
# Fine-classing: "cut" splits the variable into 50 equal-width intervals
df_inputs["funded_amnt_factor"] = pd.cut(x = df_inputs["funded_amnt"], bins = 50)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "funded_amnt_factor", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp, rotation_of_x_axis_labels = 90)

#### **Credit Line**

In [None]:
# Variable: "months_cr_line"
# Fine-classing: "cut" splits the variable into 50 equal-width intervals
df_inputs["months_cr_line_factor"] = pd.cut(x = df_inputs["months_cr_line"], bins = 50)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "months_cr_line_factor", df_good_bad = df_targets)
# The first six intervals are left out of the plot
plot_by_woe(df_WoE = df_temp.iloc[6:], rotation_of_x_axis_labels = 90)

In [None]:
# Create the following categories:
df_inputs["months_cr_line:<140"] = np.where(df_inputs["months_cr_line"].isin(range(140)), 1, 0)
# NOTE: this column is labelled "141-164" but its range starts at 140; the name is kept unchanged
df_inputs["months_cr_line:141-164"] = np.where(df_inputs["months_cr_line"].isin(range(140, 165)), 1, 0)
df_inputs["months_cr_line:165-247"] = np.where(df_inputs["months_cr_line"].isin(range(165, 248)), 1, 0)
df_inputs["months_cr_line:248-270"] = np.where(df_inputs["months_cr_line"].isin(range(248, 271)), 1, 0)
df_inputs["months_cr_line:271-352"] = np.where(df_inputs["months_cr_line"].isin(range(271, 353)), 1, 0)
# Open-ended top category: "range(353, max)" stops one short of the maximum, so the rows holding
# the largest value were assigned to no category at all; a plain comparison includes them
df_inputs["months_cr_line:>352"] = np.where((df_inputs["months_cr_line"] > 352), 1, 0)

#### **Delinquency (2 Years)**

In [None]:
# Variable: "delinq_2yrs" - WoE table and plot
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "delinq_2yrs", df_good_bad = df_targets)
# NOTE(review): the plot skips the first six rows of the table - confirm that this is intended
plot_by_woe(df_WoE = df_temp.iloc[6:], rotation_of_x_axis_labels = 90)

In [None]:
# Create the following categories:
df_inputs["delinq_2yrs:0"] = np.where((df_inputs["delinq_2yrs"] == 0), 1, 0)
df_inputs["delinq_2yrs:1-3"] = np.where((df_inputs["delinq_2yrs"] >= 1) & (df_inputs["delinq_2yrs"] <= 3), 1, 0)
# The threshold must match the column label: with the former ">= 9" the values 4 to 8 fell into no category
df_inputs["delinq_2yrs:>=4"] = np.where((df_inputs["delinq_2yrs"] >= 4), 1, 0)

#### **Inquiries Number**

In [None]:
# Variable: "inq_last_6mths" - WoE table and plot
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "inq_last_6mths", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

In [None]:
# Dummy columns for the number of inquiries in the last six months
inquiries = df_inputs["inq_last_6mths"]
inquiry_masks = [
    ("0", inquiries.eq(0)),
    ("1-2", inquiries.between(1, 2)),
    ("3-6", inquiries.between(3, 6)),
    (">6", inquiries.gt(6)),
]
for label, mask in inquiry_masks:
    df_inputs["inq_last_6mths:" + label] = np.where(mask, 1, 0)

#### **Open Accounts**

In [None]:
# Variable: "open_acc" - WoE table and a plot of its first 40 rows
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "open_acc", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp.iloc[:40], rotation_of_x_axis_labels = 90)

In [None]:
# Dummy columns for the number of open accounts: 0, six closed ranges, and 31 or more
open_acc = df_inputs["open_acc"]
df_inputs["open_acc:0"] = np.where(open_acc.eq(0), 1, 0)
for lower, upper in [(1, 3), (4, 12), (13, 17), (18, 22), (23, 25), (26, 30)]:
    df_inputs[f"open_acc:{lower}-{upper}"] = np.where(open_acc.between(lower, upper), 1, 0)
df_inputs["open_acc:>=31"] = np.where(open_acc.ge(31), 1, 0)

#### **Public Records**

In [None]:
# Variable: "pub_rec" - WoE table and plot
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "pub_rec", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp, rotation_of_x_axis_labels = 90)

In [None]:
# Dummy columns for the number of public records
pub_rec = df_inputs["pub_rec"]
pub_rec_masks = [
    ("0-2", pub_rec.between(0, 2)),
    ("3-4", pub_rec.between(3, 4)),
    (">=5", pub_rec.ge(5)),
]
for label, mask in pub_rec_masks:
    df_inputs["pub_rec:" + label] = np.where(mask, 1, 0)

#### **Total Accounts**

In [None]:
# Variable: "total_acc"
# Fine-classing: "cut" splits the variable into 50 equal-width intervals
df_inputs["total_acc_factor"] = pd.cut(x = df_inputs["total_acc"], bins = 50)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "total_acc_factor", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp, rotation_of_x_axis_labels = 90)

In [None]:
# Dummy columns for the total number of accounts
total_acc = df_inputs["total_acc"]
total_acc_masks = [
    ("<=27", total_acc.le(27)),
    ("28-51", total_acc.between(28, 51)),
    (">=52", total_acc.ge(52)),
]
for label, mask in total_acc_masks:
    df_inputs["total_acc:" + label] = np.where(mask, 1, 0)

#### **Delinquent Account**

In [None]:
# Variable: "acc_now_delinq" - WoE table and plot
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "acc_now_delinq", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp)

In [None]:
# Dummy columns: no account currently delinquent versus at least one
now_delinq = df_inputs["acc_now_delinq"]
df_inputs["acc_now_delinq:0"] = np.where(now_delinq.eq(0), 1, 0)
df_inputs["acc_now_delinq:>=1"] = np.where(now_delinq.ge(1), 1, 0)

#### **Total Revolving Credit Limit**

In [None]:
# Variable: "total_rev_hi_lim"
# Fine-classing: "cut" splits the variable into 2000 equal-width intervals
df_inputs["total_rev_hi_lim_factor"] = pd.cut(x = df_inputs["total_rev_hi_lim"], bins = 2000)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "total_rev_hi_lim_factor", df_good_bad = df_targets)
# Only the first 50 intervals are plotted
plot_by_woe(df_WoE = df_temp.iloc[:50], rotation_of_x_axis_labels = 90)

In [None]:
# Dummy columns for the revolving credit limit; the edges are given in thousands
# and every band includes its upper edge
rev_limit_edges_k = [5, 10, 20, 30, 40, 55, 95]
rev_limit = df_inputs["total_rev_hi_lim"]
df_inputs[f"total_rev_hi_lim:<={rev_limit_edges_k[0]}K"] = np.where(rev_limit.le(rev_limit_edges_k[0] * 1000), 1, 0)
for lower_k, upper_k in zip(rev_limit_edges_k[:-1], rev_limit_edges_k[1:]):
    in_band = rev_limit.gt(lower_k * 1000) & rev_limit.le(upper_k * 1000)
    df_inputs[f"total_rev_hi_lim:{lower_k}K-{upper_k}K"] = np.where(in_band, 1, 0)
df_inputs[f"total_rev_hi_lim:>{rev_limit_edges_k[-1]}K"] = np.where(rev_limit.gt(rev_limit_edges_k[-1] * 1000), 1, 0)

#### **Installment**

In [None]:
# Variable: "installment"
# Fine-classing: "cut" splits the variable into 50 equal-width intervals
df_inputs["installment_factor"] = pd.cut(x = df_inputs["installment"], bins = 50)
df_temp = woe_ordered_continuous(df = df_inputs, variable_name = "installment_factor", df_good_bad = df_targets)
plot_by_woe(df_WoE = df_temp, rotation_of_x_axis_labels = 90)

#### **Annual Income**

In [None]:
# Variable: "annual_inc"
# Fine-class only the incomes up to 140K, and do it in a temporary frame: filtering "df_inputs"
# itself would permanently drop the higher-income rows from the data that is saved later
# and leave it with fewer rows than the targets
df_temp = df_inputs.loc[df_inputs["annual_inc"] <= 140000, :].copy()
# Use the "cut" method to do fine-classing and split the variable into 50 categories by its values
df_temp["annual_inc_factor"] = pd.cut(df_temp["annual_inc"], 50)
df_temp = woe_ordered_continuous(df_temp, "annual_inc_factor", df_targets.loc[df_temp.index])
plot_by_woe(df_temp, 90)

In [None]:
# Dummy columns for annual income; the edges are given in thousands
# and every band includes its upper edge
income_edges_k = [20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140]
annual_inc = df_inputs["annual_inc"]
df_inputs[f"annual_inc:<{income_edges_k[0]}K"] = np.where(annual_inc.le(income_edges_k[0] * 1000), 1, 0)
for lower_k, upper_k in zip(income_edges_k[:-1], income_edges_k[1:]):
    in_band = annual_inc.gt(lower_k * 1000) & annual_inc.le(upper_k * 1000)
    df_inputs[f"annual_inc:{lower_k}K-{upper_k}K"] = np.where(in_band, 1, 0)
df_inputs[f"annual_inc:>{income_edges_k[-1]}K"] = np.where(annual_inc.gt(income_edges_k[-1] * 1000), 1, 0)

#### **Delinquency Months**

In [None]:
# Variable: "mths_since_last_delinq"
# Missing values get their own category later; fine-class only the rows that have a value.
# The explicit copy lets the factor column be added without writing into a slice of "df_inputs"
df_temp = df_inputs[pd.notnull(df_inputs["mths_since_last_delinq"])].copy()
# Use the "cut" method to do fine-classing and split the variable into 50 categories by its values
df_temp["mths_since_last_delinq_factor"] = pd.cut(df_temp["mths_since_last_delinq"], 50)
# Select the matching targets by index label
df_temp = woe_ordered_continuous(df_temp, "mths_since_last_delinq_factor", df_targets.loc[df_temp.index])
plot_by_woe(df_temp, 90)

In [None]:
# Dummy columns for months since the last delinquency, with a separate one for missing values
last_delinq = df_inputs["mths_since_last_delinq"]
last_delinq_masks = [
    ("Missing", last_delinq.isnull()),
    ("0-3", last_delinq.between(0, 3)),
    ("4-30", last_delinq.between(4, 30)),
    ("31-56", last_delinq.between(31, 56)),
    (">=57", last_delinq.ge(57)),
]
for label, mask in last_delinq_masks:
    df_inputs["mths_since_last_delinq:" + label] = np.where(mask, 1, 0)

#### **Debt-to-Income Ratio**

In [None]:
# Variable: "dti"
# Fine-class only the observations with dti <= 35. The filtered frame used to be built and then
# ignored, so the bins were cut over the full range of "df_inputs" instead
df_temp = df_inputs.loc[df_inputs["dti"] <= 35, : ].copy()
# Use the "cut" method to do fine-classing and split the variable into 50 categories by its values
df_temp["dti_factor"] = pd.cut(df_temp["dti"], 50)
df_temp = woe_ordered_continuous(df_temp, "dti_factor", df_targets.loc[df_temp.index])
plot_by_woe(df_temp, 90)

In [None]:
# Dummy columns for the debt-to-income ratio; every band includes its upper edge
dti_edges = [1.4, 3.5, 7.7, 10.5, 16.1, 20.3, 21.7, 22.4, 35]
dti = df_inputs["dti"]
df_inputs[f"dti:<={dti_edges[0]}"] = np.where(dti.le(dti_edges[0]), 1, 0)
for lower, upper in zip(dti_edges[:-1], dti_edges[1:]):
    df_inputs[f"dti:{lower}-{upper}"] = np.where(dti.gt(lower) & dti.le(upper), 1, 0)
df_inputs[f"dti:>{dti_edges[-1]}"] = np.where(dti.gt(dti_edges[-1]), 1, 0)

#### **Last Records**

In [None]:
# Variable: "mths_since_last_record"
# Missing values get their own category later; fine-class only the rows that have a value.
# The explicit copy lets the factor column be added without writing into a slice of "df_inputs"
df_temp = df_inputs[pd.notnull(df_inputs["mths_since_last_record"])].copy()
# Use the "cut" method to do fine-classing and split the variable into 50 categories by its values
df_temp["mths_since_last_record_factor"] = pd.cut(df_temp["mths_since_last_record"], 50)
# Select the matching targets by index label
df_temp = woe_ordered_continuous(df_temp, "mths_since_last_record_factor", df_targets.loc[df_temp.index])
plot_by_woe(df_temp, 90)

In [None]:
# Dummy columns for months since the last public record, with a separate one for missing values
last_record = df_inputs["mths_since_last_record"]
last_record_masks = [
    ("Missing", last_record.isnull()),
    ("0-2", last_record.between(0, 2)),
    ("3-20", last_record.between(3, 20)),
    ("21-31", last_record.between(21, 31)),
    ("32-80", last_record.between(32, 80)),
    ("81-86", last_record.between(81, 86)),
    (">86", last_record.gt(86)),
]
for label, mask in last_record_masks:
    df_inputs["mths_since_last_record:" + label] = np.where(mask, 1, 0)

### **Saving**

In [None]:
# Store the preprocessed training inputs and their targets under "data/processed/train"
inputs_train = df_inputs
path = os.path.join("..", "data", "processed", "train")
os.makedirs(path, exist_ok = True)  # create the folder when it does not exist yet
for frame, file_name in ((inputs_train, "inputs_train.csv"), (targets_train, "targets_train.csv")):
    frame.to_csv(os.path.join(path, file_name))

In [None]:
# Save testing data set
# "df_inputs" holds whichever split was assigned to it in the "Create the necessary arguments" cell.
# Refuse to save when its rows do not belong to the test split; otherwise the training
# features would be written out as "inputs_test.csv" next to unrelated test targets
if not df_inputs.index.isin(targets_test.index).all():
    raise ValueError(
        "df_inputs does not hold the test split: set df_inputs = inputs_test and "
        "df_targets = targets_test, re-run the preprocessing cells, then save again."
    )
inputs_test = df_inputs
path = os.path.join("..", "data", "processed", "test")
os.makedirs(path, exist_ok = True)
inputs_test.to_csv(os.path.join(path, "inputs_test.csv"))
targets_test.to_csv(os.path.join(path, "targets_test.csv"))