In [1]:
import pandas as pd
import numpy as np

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings.
# NOTE: filterwarnings('ignore') with no category/module filter silences
# every warning raised in this kernel, not only the ones coming from pandas.
import warnings
warnings.filterwarnings('ignore')

# Apply the 'fivethirtyeight' matplotlib style sheet to all subsequent figures.
plt.style.use('fivethirtyeight')

In [126]:
# Load the POS/cash balance table from its CSV export into a DataFrame.
pos_cash_path = 'C:\\home-credit-default-risk\\POS_CASH_balance.csv'
pos_cash = pd.read_csv(pos_cash_path)

In [127]:
# Names of the columns whose dtype is 'object'; these are treated as
# categorical features further down.
cat_features = [
    name
    for name, dtype in pos_cash.dtypes.items()
    if dtype == 'object'
]
cat_features

['NAME_CONTRACT_STATUS']

In [129]:
# Every column that was not classified as categorical, in original column
# order. The SK_ID_* identifier columns are still part of this list.
categorical_names = set(cat_features)
continuous_var = [
    name for name in pos_cash.columns if name not in categorical_names
]
continuous_var

['SK_ID_PREV',
 'SK_ID_CURR',
 'MONTHS_BALANCE',
 'CNT_INSTALMENT',
 'CNT_INSTALMENT_FUTURE',
 'SK_DPD',
 'SK_DPD_DEF']

In [130]:
# Restrict the table to the non-categorical columns and preview the first rows.
df = pos_cash.loc[:, continuous_var]
df.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,0,0
1,1715348,367990,-33,36.0,35.0,0,0
2,1784872,397406,-32,12.0,9.0,0,0
3,1903291,269225,-35,48.0,42.0,0,0
4,2341044,334279,-35,36.0,35.0,0,0


In [131]:
def agg_numeric(df, group_var):
    """Average the numeric columns of ``df`` within each ``group_var`` group.

    Columns whose name contains ``'SK_ID'`` (other than ``group_var`` itself)
    are treated as identifiers and left out of the aggregation, as are
    columns that are neither numeric nor boolean. The input frame is never
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping column and the value columns.
    group_var : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_var`` value, with ``group_var`` as the
        first column followed by the per-group mean of each value column, in
        the column order of ``df``.

    Raises
    ------
    KeyError
        If ``group_var`` is not a column of ``df``.
    """
    # Restrict to columns for which a mean is defined; text columns would
    # make the mean raise a TypeError on recent pandas versions.
    candidate_cols = df.select_dtypes(include=['number', 'bool']).columns

    # Leave out the grouping key and every other id column. str() keeps the
    # substring test valid for non-string column labels.
    value_cols = [
        col for col in candidate_cols
        if col != group_var and 'SK_ID' not in str(col)
    ]

    # Selecting the value columns on the groupby avoids copying or mutating
    # the caller's frame.
    agg = df.groupby(group_var)[value_cols].mean().reset_index()

    return agg

In [133]:
# Per-client mean of every numeric column. agg_numeric already discards
# SK_ID columns other than the grouping key, so SK_ID_PREV does not need to
# be dropped beforehand (doing so only copies the whole frame).
num_group_df = agg_numeric(df, group_var='SK_ID_CURR')
num_group_df.head()

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF
0,100001,-72.555556,4.0,1.444444,0.777778,0.777778
1,100002,-10.0,24.0,15.0,0.0,0.0
2,100003,-43.785714,10.107143,5.785714,0.0,0.0
3,100004,-25.5,3.75,2.25,0.0,0.0
4,100005,-20.0,11.7,7.2,0.0,0.0


In [136]:
# One-hot encode the categorical columns, then attach the client id so the
# indicator columns can be grouped per client.
categorical = (
    pd.get_dummies(pos_cash[cat_features])
    .assign(SK_ID_CURR=pos_cash['SK_ID_CURR'])
)

categorical.head()

Unnamed: 0,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA,SK_ID_CURR
0,1,0,0,0,0,0,0,0,0,182943
1,1,0,0,0,0,0,0,0,0,367990
2,1,0,0,0,0,0,0,0,0,397406
3,1,0,0,0,0,0,0,0,0,269225
4,1,0,0,0,0,0,0,0,0,334279


In [137]:
# Sum the indicator columns within each client: the result counts how many
# rows of each category the client has.
per_client = categorical.groupby('SK_ID_CURR')
cat_group_df = per_client.sum().reset_index()
cat_group_df.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA
0,100001,7.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
1,100002,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100003,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,100004,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,100005,9.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [139]:
# Join the numeric means and the category counts on the client id
# (inner join, which is the pandas default).
cat_num_merge = pd.merge(
    num_group_df,
    cat_group_df,
    how='inner',
    on='SK_ID_CURR',
)
cat_num_merge.head()

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA
0,100001,-72.555556,4.0,1.444444,0.777778,0.777778,7.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
1,100002,-10.0,24.0,15.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100003,-43.785714,10.107143,5.785714,0.0,0.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,100004,-25.5,3.75,2.25,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,100005,-20.0,11.7,7.2,0.0,0.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [141]:
# Write the merged per-client aggregate to disk without the index column.
# A raw string keeps the Windows path intact: the previous literal contained
# the invalid escape sequence "\h", which newer Python versions warn about.
# to_csv returns None when given a path, so its result is not stored.
cat_num_merge.to_csv(
    r"C:\home-credit-default-risk\result\posh_cash_aggregate.csv",
    index=False,
)