# Previous application table

## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# sns.set_style('whitegrid')
from collections import (
    Counter,
)
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
    OneHotEncoder, 
    KBinsDiscretizer, 
    FunctionTransformer,
    MinMaxScaler,
)
from sklearn.compose import (
    ColumnTransformer,
)

In [2]:
# Read the data into pandas dataframe.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv('data/previous_application.csv')

### Columns of the data 

**Variable Description:**

|Variable|Definition |Key|  Type |
|---|---|---|---|
| **SK_ID_PREV** | ID of previous credit in Home credit related to loan in our sample | |int| 
|**NAME_CONTRACT_TYPE**|Contract product type (Cash loan, consumer loan [POS] ,...) of the previous application | |object|
|**AMT_ANNUITY**|Annuity of previous application| | float|
|**AMT_APPLICATION**|For how much credit did client ask on the previous application| | float|
|**AMT_CREDIT**|Final credit amount on the previous application. | | float|
|**AMT_GOODS_PRICE**|Goods price of good that client asked for (if applicable) on the previous application | | float|
|**WEEKDAY_APPR_PROCESS_START**|On which day of the week did the client apply for previous application| |object|
|**HOUR_APPR_PROCESS_START**|Approximately at what hour of the day did the client apply for the previous application| |int|
|**FLAG_LAST_APPL_PER_CONTRACT**|Flag if it was last application for the previous contract.| |object|
|**NFLAG_LAST_APPL_IN_DAY**|Flag if the application was the last application per day of the client | |int| 
|**NAME_CASH_LOAN_PURPOSE** | Purpose of the cash loan| |object| 
|**NAME_CONTRACT_STATUS**|Contract status (approved, cancelled, ...) of previous application | |object|
|**DAYS_DECISION**|Relative to current application when was the decision about previous application made| |int|
|**NAME_PAYMENT_TYPE**|Payment method that client chose to pay for the previous application| |object|
|**CODE_REJECT_REASON**|Why was the previous application rejected | |object|
|**NAME_CLIENT_TYPE**|Was the client old or new client when applying for the previous application| |object|
|**NAME_GOODS_CATEGORY**|What kind of goods did the client apply for in the previous application| |object|
|**NAME_PORTFOLIO**|Was the previous application for CASH, POS, CAR | |object|
|**NAME_PRODUCT_TYPE**| Was the previous application x-sell or walk-in| |object|
|**CHANNEL_TYPE**|Through which channel we acquired the client on the previous application| |object|
|**SELLERPLACE_AREA**|Selling area of seller place of the previous application| |int|
|**NAME_SELLER_INDUSTRY**|The industry of the seller| |object|
|**CNT_PAYMENT**|Term of previous credit at application of the previous application| |float|
|**NAME_YIELD_GROUP**|Grouped interest rate into small medium and high of the previous application | |object|
|**PRODUCT_COMBINATION**|Detailed product combination of the previous application| |object|
|**DAYS_LAST_DUE**|Relative to application date of current application when was the last due date of the previous application| |float|


## Understanding data

In [3]:
# (number of rows, number of columns) of the raw table
df.shape

(1670214, 37)

## Explore and clean data


In [14]:
# Share of missing values per column, expressed in percent
def percent_missing(df):
    """Return the percentage of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        A single ``percent`` column indexed by the column names of ``df``,
        ordered from the most to the least incomplete column. Columns
        without any missing value are left out.
    """
    # Sort the raw counts first, then scale them to a percentage of all rows.
    missing_counts = df.isnull().sum().sort_values(ascending=False)
    missing_share = missing_counts / len(df) * 100
    report = pd.DataFrame(missing_share, columns=['percent'])
    has_missing = report['percent'] > 0
    return report[has_missing]

In [15]:
# The five columns with the highest share of missing values
percent_missing(df).head()

Unnamed: 0,percent
RATE_INTEREST_PRIMARY,99.643698
RATE_INTEREST_PRIVILEGED,99.643698
AMT_DOWN_PAYMENT_last,53.63648
AMT_DOWN_PAYMENT,53.63648
RATE_DOWN_PAYMENT,53.63648


Some variables are missing a significant share of their values. We do not deal with that at this point; we look at missing values again after aggregation.

## Really read through what comes below and spend time understanding what is happening.

### Feature prep

In [6]:
# Some preparation needed to use variables in calculations or aggregation.
# Vectorized column operations are used instead of per-row Python loops.

# 1 where the application is flagged "Y" (last application for its contract), else 0
df["FLAG_LAST_APPL_PER_CONTRACT_dummy"] = (df["FLAG_LAST_APPL_PER_CONTRACT"] == "Y").astype("int64")
# stored as text so the values can be string-joined in the aggregation step
df["SELLERPLACE_AREA"] = df["SELLERPLACE_AREA"].astype(str)
# missing product combinations become empty strings (str.join cannot handle NaN)
df["PRODUCT_COMBINATION"] = df["PRODUCT_COMBINATION"].fillna("")

In [8]:
# Derived variables that could add value.

# Ratios of several amounts to the final credit amount of the previous application.
ratio_numerators = {
    "application_to_credit_ratio": "AMT_APPLICATION",
    "credit_collateral": "AMT_GOODS_PRICE",
    "amortization_rate": "AMT_ANNUITY",
}
for ratio_name, numerator_column in ratio_numerators.items():
    df[ratio_name] = df[numerator_column] / df["AMT_CREDIT"]

# "<amount>_last" keeps the amount where the last-application flag is 1 and is 0 where
# the flag is 0 (a missing amount stays missing).
for amount_column in (
    "AMT_ANNUITY",
    "AMT_APPLICATION",
    "AMT_CREDIT",
    "AMT_DOWN_PAYMENT",
    "AMT_GOODS_PRICE",
):
    df[amount_column + "_last"] = df["FLAG_LAST_APPL_PER_CONTRACT_dummy"] * df[amount_column]

### Aggregation step

In [9]:
# Aggregate all previous applications to one row per SK_ID_CURR, the key we merge on later.
#   * text columns    -> every value of the group concatenated into one ", "-separated string
#   * "*_last" amounts -> summed over the group
#   * ratios, DAYS_DECISION, CNT_PAYMENT -> averaged over the group
# The string aliases "sum"/"mean" are used instead of np.sum/np.mean: passing the NumPy
# callables to agg() is deprecated in recent pandas, and the aliases keep the same
# (NaN-skipping) group-wise behavior.
df_aggregated = df.groupby("SK_ID_CURR").agg({
    "NAME_CONTRACT_TYPE" : ', '.join,
    "AMT_ANNUITY_last" : "sum",
    "AMT_APPLICATION_last" : "sum",
    "AMT_CREDIT_last" : "sum",
    "AMT_DOWN_PAYMENT_last" : "sum",
    "AMT_GOODS_PRICE_last" : "sum",
    "application_to_credit_ratio" : "mean",
    "credit_collateral" : "mean",
    "amortization_rate" : "mean",
    "NAME_CASH_LOAN_PURPOSE" : ', '.join,
    "NAME_CONTRACT_STATUS" : ', '.join,
    "DAYS_DECISION" : "mean",
    "NAME_PAYMENT_TYPE" : ', '.join,
    "CODE_REJECT_REASON" : ', '.join,
    "NAME_CLIENT_TYPE" : ', '.join,
    "NAME_GOODS_CATEGORY" : ', '.join,
    "NAME_PORTFOLIO" : ', '.join,
    "NAME_PRODUCT_TYPE" : ', '.join,
    "CHANNEL_TYPE" : ', '.join,
    "SELLERPLACE_AREA" : ', '.join,
    "NAME_SELLER_INDUSTRY" : ', '.join,
    "CNT_PAYMENT" : "mean",
    "NAME_YIELD_GROUP" : ', '.join,
    "PRODUCT_COMBINATION" : ', '.join,
})

print("Done aggregating.")

Done aggregating.


In [10]:
# Preview the first rows of the aggregated frame (indexed by SK_ID_CURR)
df_aggregated.head()

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,AMT_ANNUITY_last,AMT_APPLICATION_last,AMT_CREDIT_last,AMT_DOWN_PAYMENT_last,AMT_GOODS_PRICE_last,application_to_credit_ratio,credit_collateral,amortization_rate,NAME_CASH_LOAN_PURPOSE,...,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,Consumer loans,3951.0,24835.5,23787.0,2520.0,24835.5,1.044079,1.044079,0.166099,XAP,...,Refreshed,Mobile,POS,XNA,Country-wide,23,Connectivity,8.0,high,POS mobile with interest
100002,Consumer loans,9251.775,179055.0,179055.0,0.0,179055.0,1.0,1.0,0.05167,XAP,...,New,Vehicles,POS,XNA,Stone,500,Auto technology,24.0,low_normal,POS other with interest
100003,"Cash loans, Consumer loans, Consumer loans",169661.97,1306309.5,1452573.0,6885.0,1306309.5,0.949329,0.949329,0.126383,"XNA, XAP, XAP",...,"Repeater, Refreshed, Refreshed","XNA, Furniture, Consumer Electronics","Cash, POS, POS","x-sell, XNA, XNA","Credit and cash offices, Stone, Country-wide","-1, 1400, 200","XNA, Furniture, Consumer electronics",10.0,"low_normal, middle, middle","Cash X-Sell: low, POS industry with interest, ..."
100004,Consumer loans,5357.25,24282.0,20106.0,4860.0,24282.0,1.207699,1.207699,0.26645,XAP,...,New,Mobile,POS,XNA,Regional / Local,30,Connectivity,4.0,middle,POS mobile without interest
100005,"Cash loans, Consumer loans",4813.2,44617.5,40153.5,4464.0,44617.5,1.111173,1.111173,0.11987,"XNA, XAP",...,"Repeater, New","XNA, Mobile","XNA, POS","XNA, XNA","Credit and cash offices, Country-wide","-1, 37","XNA, Connectivity",12.0,"XNA, high","Cash, POS mobile with interest"


Have a look at the df above! 
* How are previous credits' information being aggregated? 
* Is the result as expected?
* What do these variables represent now? 

In [11]:
# Creating some flag variables for some of the string variables.
# Each of these aggregated columns holds the ", "-joined values of all previous
# applications of a client, so a literal substring match tells whether the value
# occurred at least once.

# NOTE(review): the original matched "Canceled" here, which does not fit the flag's name.
# This assumes refused applications carry the status "Refused" -- confirm with
# df["NAME_CONTRACT_STATUS"].unique() before relying on the flag.
df_aggregated["has_been_rejected_before"] = (
    df_aggregated["NAME_CONTRACT_STATUS"].str.contains("Refused", regex=False).astype("int64")
)
df_aggregated["repeated_customer"] = (
    df_aggregated["NAME_CLIENT_TYPE"].str.contains("Repeater", regex=False).astype("int64")
)
df_aggregated["high_yield"] = (
    df_aggregated["NAME_YIELD_GROUP"].str.contains("high", regex=False).astype("int64")
)
df_aggregated["cash_product"] = (
    df_aggregated["PRODUCT_COMBINATION"].str.contains("Cash", regex=False).astype("int64")
)

#### Descriptive Statistics


In [12]:
# Let's have a look at our numerical variables and their descriptive statistics.
# .T transposes the summary so that each variable is a row and each statistic a column.
df_aggregated.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AMT_ANNUITY_last,338857.0,60807.78688,73381.8,0.0,16069.59,36772.335,77710.59,2051103.0
AMT_APPLICATION_last,338857.0,860090.545121,1348405.0,0.0,150255.0,389335.5,1022998.0,38274750.0
AMT_CREDIT_last,338857.0,958586.771767,1485327.0,0.0,159250.5,434295.0,1165486.0,43599840.0
AMT_DOWN_PAYMENT_last,338857.0,15305.179749,41187.69,-0.45,0.0,6142.5,18000.0,6441543.0
AMT_GOODS_PRICE_last,338857.0,860090.545121,1348405.0,0.0,150255.0,389335.5,1022998.0,38274750.0
application_to_credit_ratio,338604.0,0.969534,0.1703467,0.0,0.908348,0.971373,1.026467,10.0
credit_collateral,337783.0,1.002663,0.1375688,0.0,0.933809,0.985378,1.037046,10.0
amortization_rate,338376.0,0.107919,0.04127246,0.023252,0.079068,0.10252,0.1270999,0.3626002
DAYS_DECISION,338857.0,-919.288946,574.659,-2922.0,-1240.666667,-788.25,-471.5,-2.0
CNT_PAYMENT,338379.0,14.533387,8.409062,0.0,9.0,12.0,18.0,72.0


#### Missing values overview

In [16]:
# Percentage of missing values per column of the aggregated frame (only columns with gaps are listed)
percent_missing(df_aggregated)

Unnamed: 0,percent
credit_collateral,0.316948
amortization_rate,0.141948
CNT_PAYMENT,0.141062
application_to_credit_ratio,0.074663


It's not even half a percent of observations for the worst variable, so we leave the missing values as they are; they will simply be ignored by the model.

## Prepare data for usage and have a peek at correlations

In [None]:
# Get rid of string variables.
# The column names are collected first and dropped in one call, so the frame is not
# modified while its columns are being iterated.
string_columns = [
    column
    for column in df_aggregated.columns
    if pd.api.types.is_string_dtype(df_aggregated[column])
]
df_aggregated = df_aggregated.drop(columns=string_columns)

In [None]:
def make_heatmap(df):
    """Draw an annotated heatmap of the pairwise correlations between columns of ``df``.

    Only numeric and boolean columns enter the correlation matrix, so the function
    also works on frames that still contain text columns. The upper triangle and
    the diagonal are masked, leaving each pair of variables shown once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the heatmap was drawn on.
    """
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(15, 12))
    # True marks cells to hide: everything on and above the main diagonal.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    # Fixed colour limits so the scale always spans the full correlation range [-1, 1].
    sns.heatmap(corr, annot=True, cmap="YlGnBu_r", mask=mask, vmax=1, vmin=-1, ax=ax)
    ax.set_title("Correlation matrix")
    return ax

# Trailing semicolon suppresses the Axes repr in the cell output.
make_heatmap(df_aggregated);