In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from functions import *

# bureau

In [3]:
bureau = pd.read_csv('raw-data/dseb63_bureau.csv')
bureau.head()

Unnamed: 0,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,SK_ID_CURR
0,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,254629
1,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,254629
2,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,,254629
3,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,,254629
4,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,,254629


In [4]:
bureau.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465325 entries, 0 to 1465324
Data columns (total 17 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   SK_ID_BUREAU            1465325 non-null  int64  
 1   CREDIT_ACTIVE           1465325 non-null  object 
 2   CREDIT_CURRENCY         1465325 non-null  object 
 3   DAYS_CREDIT             1465325 non-null  int64  
 4   CREDIT_DAY_OVERDUE      1465325 non-null  int64  
 5   DAYS_CREDIT_ENDDATE     1376227 non-null  float64
 6   DAYS_ENDDATE_FACT       920652 non-null   float64
 7   AMT_CREDIT_MAX_OVERDUE  516780 non-null   float64
 8   CNT_CREDIT_PROLONG      1465325 non-null  int64  
 9   AMT_CREDIT_SUM          1465322 non-null  float64
 10  AMT_CREDIT_SUM_DEBT     1242231 non-null  float64
 11  AMT_CREDIT_SUM_LIMIT    975655 non-null   float64
 12  AMT_CREDIT_SUM_OVERDUE  1465325 non-null  float64
 13  CREDIT_TYPE             1465325 non-null  object 
 14  DA

In [5]:
# check missing values
bureau.isnull().sum() / len(bureau)

SK_ID_BUREAU              0.000000
CREDIT_ACTIVE             0.000000
CREDIT_CURRENCY           0.000000
DAYS_CREDIT               0.000000
CREDIT_DAY_OVERDUE        0.000000
DAYS_CREDIT_ENDDATE       0.060804
DAYS_ENDDATE_FACT         0.371708
AMT_CREDIT_MAX_OVERDUE    0.647327
CNT_CREDIT_PROLONG        0.000000
AMT_CREDIT_SUM            0.000002
AMT_CREDIT_SUM_DEBT       0.152249
AMT_CREDIT_SUM_LIMIT      0.334172
AMT_CREDIT_SUM_OVERDUE    0.000000
CREDIT_TYPE               0.000000
DAYS_CREDIT_UPDATE        0.000000
AMT_ANNUITY               0.771169
SK_ID_CURR                0.000000
dtype: float64

In [4]:
# 1. Aggregated Features
agg_functions = ['mean', 'median', 'sum', 'max', 'min']
agg_bureau = bureau.groupby('SK_ID_CURR').agg({
    'DAYS_CREDIT': agg_functions,
    'CREDIT_DAY_OVERDUE': agg_functions,
    'DAYS_CREDIT_ENDDATE': agg_functions,
    'DAYS_ENDDATE_FACT': agg_functions,
    'AMT_CREDIT_MAX_OVERDUE': agg_functions,
    'CNT_CREDIT_PROLONG': agg_functions,
    'AMT_CREDIT_SUM': agg_functions,
    'AMT_CREDIT_SUM_DEBT': agg_functions,
    'AMT_CREDIT_SUM_LIMIT': agg_functions,
    'AMT_CREDIT_SUM_OVERDUE': agg_functions,
    'DAYS_CREDIT_UPDATE': agg_functions,
    'AMT_ANNUITY': agg_functions,
})
agg_bureau.columns = ['_'.join(col).upper() for col in agg_bureau.columns]

# 2. Credit Utilization Ratios
bureau['UTILIZATION_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / bureau['AMT_CREDIT_SUM']

# 3. Overdue Ratios
bureau['OVERDUE_RATIO'] = bureau['AMT_CREDIT_SUM_OVERDUE'] / bureau['AMT_CREDIT_SUM']

# 4. Credit Age
bureau['CREDIT_AGE'] = -bureau['DAYS_CREDIT']

# 5. Credit Type One-Hot Encoding
credit_type_dummies = pd.get_dummies(bureau['CREDIT_TYPE'], prefix='CREDIT_TYPE')
bureau = pd.concat([bureau, credit_type_dummies], axis=1)

# 6. Credit Active Binary Encoding
bureau['CREDIT_ACTIVE_BINARY'] = bureau['CREDIT_ACTIVE'].apply(lambda x: 1 if x == 'Active' else 0)

# 7. Currency Binary Encoding
currency_dummies = pd.get_dummies(bureau['CREDIT_CURRENCY'], prefix='CURRENCY')
bureau = pd.concat([bureau, currency_dummies], axis=1)

# 8. Debt to Annuity Ratio
bureau['DEBT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / bureau['AMT_ANNUITY']

# 9. Days Since Credit Update
bureau['DAYS_SINCE_UPDATE'] = -bureau['DAYS_CREDIT_UPDATE']

# 10. Credit Prolongation Frequency
bureau['PROLONGATION_FREQUENCY'] = bureau['CNT_CREDIT_PROLONG'] / bureau['CREDIT_AGE']

# 11. Time Windows Aggregation (example: aggregating over the last year)
time_window = 365  # days
recent_bureau = bureau[bureau['DAYS_CREDIT'] >= -time_window]
agg_recent_bureau = recent_bureau.groupby('SK_ID_CURR').agg({
    'AMT_CREDIT_SUM': ['sum', 'mean'],
    'CREDIT_DAY_OVERDUE': ['sum', 'mean'],
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean'],
    'AMT_CREDIT_SUM_OVERDUE': ['sum', 'mean'],
    'DAYS_CREDIT_ENDDATE': ['sum', 'mean'],
    'DAYS_CREDIT_UPDATE': ['sum', 'mean'],
    'CREDIT_ACTIVE_BINARY': ['sum', 'mean'],
    'UTILIZATION_RATIO': ['sum', 'mean'],
    'OVERDUE_RATIO': ['sum', 'mean'],
    'DEBT_TO_ANNUITY_RATIO': ['sum', 'mean'],
    'DAYS_SINCE_UPDATE': ['sum', 'mean'],
    'PROLONGATION_FREQUENCY': ['sum', 'mean'],
})
agg_recent_bureau.columns = ['RECENT_' + '_'.join(col).upper() for col in agg_recent_bureau.columns]


In [5]:
agg_recent_bureau.head()

Unnamed: 0_level_0,RECENT_AMT_CREDIT_SUM_SUM,RECENT_AMT_CREDIT_SUM_MEAN,RECENT_CREDIT_DAY_OVERDUE_SUM,RECENT_CREDIT_DAY_OVERDUE_MEAN,RECENT_AMT_CREDIT_SUM_DEBT_SUM,RECENT_AMT_CREDIT_SUM_DEBT_MEAN,RECENT_AMT_CREDIT_SUM_OVERDUE_SUM,RECENT_AMT_CREDIT_SUM_OVERDUE_MEAN,RECENT_DAYS_CREDIT_ENDDATE_SUM,RECENT_DAYS_CREDIT_ENDDATE_MEAN,...,RECENT_UTILIZATION_RATIO_SUM,RECENT_UTILIZATION_RATIO_MEAN,RECENT_OVERDUE_RATIO_SUM,RECENT_OVERDUE_RATIO_MEAN,RECENT_DEBT_TO_ANNUITY_RATIO_SUM,RECENT_DEBT_TO_ANNUITY_RATIO_MEAN,RECENT_DAYS_SINCE_UPDATE_SUM,RECENT_DAYS_SINCE_UPDATE_MEAN,RECENT_PROLONGATION_FREQUENCY_SUM,RECENT_PROLONGATION_FREQUENCY_MEAN
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,85513.5,85513.5,0,0.0,77566.5,77566.5,0.0,0.0,237.0,237.0,...,0.907067,0.907067,0.0,0.0,0.0,,28,28.0,0.0,0.0
2,123466.5,41155.5,0,0.0,96417.0,32139.0,0.0,0.0,384.0,128.0,...,1.93133,0.643777,0.0,0.0,0.0,,168,56.0,0.0,0.0
3,63765.0,31882.5,0,0.0,41071.5,20535.75,0.0,0.0,333.0,166.5,...,1.086269,0.543134,0.0,0.0,0.0,,58,29.0,0.0,0.0
4,752544.0,752544.0,0,0.0,655510.5,655510.5,0.0,0.0,1148.0,1148.0,...,0.871059,0.871059,0.0,0.0,0.0,,31,31.0,0.0,0.0
5,29836179.0,5967235.8,0,0.0,17272386.0,3454477.2,0.0,0.0,5584.0,1116.8,...,3.685906,0.737181,0.0,0.0,37.201007,18.600504,272,54.4,0.0,0.0


In [6]:
bureau

Unnamed: 0,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,...,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan,CREDIT_ACTIVE_BINARY,CURRENCY_currency 1,CURRENCY_currency 2,CURRENCY_currency 3,CURRENCY_currency 4,DEBT_TO_ANNUITY_RATIO,DAYS_SINCE_UPDATE,PROLONGATION_FREQUENCY
0,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,...,False,False,0,True,False,False,False,,131,0.0
1,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,...,False,False,1,True,False,False,False,,20,0.0
2,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,...,False,False,1,True,False,False,False,,16,0.0
3,5714465,Active,currency 1,-203,0,,,,0,90000.0,...,False,False,1,True,False,False,False,,16,0.0
4,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,...,False,False,1,True,False,False,False,,21,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465320,5052629,Closed,currency 1,-532,0,45.0,-227.0,,0,450000.0,...,False,False,0,True,False,False,False,,31,0.0
1465321,5054382,Closed,currency 1,-381,0,-199.0,-199.0,0.0,0,19800.0,...,False,False,0,True,False,False,False,,199,0.0
1465322,5043236,Active,currency 1,-1441,0,2197.0,,,0,1800000.0,...,False,False,1,True,False,False,False,,2,0.0
1465323,5053303,Closed,currency 1,-375,0,-41.0,-19.0,,0,175054.5,...,False,False,0,True,False,False,False,,4,0.0


In [7]:
# 12. Frequency of Credit Inquiries
bureau['CREDIT_INQUIRY_FREQ'] = bureau.groupby('SK_ID_CURR')['SK_ID_BUREAU'].transform('count')

# 13. Credit Variety Score
bureau['CREDIT_VARIETY_SCORE'] = bureau.groupby('SK_ID_CURR')['CREDIT_TYPE'].transform('nunique')

# Combine all the features
bureau_full = bureau.merge(agg_bureau, on='SK_ID_CURR', how='left')
bureau_full = bureau_full.merge(agg_recent_bureau, on='SK_ID_CURR', how='left')

# Now, bureau_full contains all the original and engineered features.
# You can use it for training your model.
bureau_full.head()

Unnamed: 0,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,...,RECENT_UTILIZATION_RATIO_SUM,RECENT_UTILIZATION_RATIO_MEAN,RECENT_OVERDUE_RATIO_SUM,RECENT_OVERDUE_RATIO_MEAN,RECENT_DEBT_TO_ANNUITY_RATIO_SUM,RECENT_DEBT_TO_ANNUITY_RATIO_MEAN,RECENT_DAYS_SINCE_UPDATE_SUM,RECENT_DAYS_SINCE_UPDATE_MEAN,RECENT_PROLONGATION_FREQUENCY_SUM,RECENT_PROLONGATION_FREQUENCY_MEAN
0,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,...,2.156061,0.718687,0.0,0.0,0.0,,105.0,21.0,0.0,0.0
1,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,...,2.156061,0.718687,0.0,0.0,0.0,,105.0,21.0,0.0,0.0
2,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,...,2.156061,0.718687,0.0,0.0,0.0,,105.0,21.0,0.0,0.0
3,5714465,Active,currency 1,-203,0,,,,0,90000.0,...,2.156061,0.718687,0.0,0.0,0.0,,105.0,21.0,0.0,0.0
4,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,...,2.156061,0.718687,0.0,0.0,0.0,,105.0,21.0,0.0,0.0


In [8]:
# encode categorical features
bureau_full = pd.get_dummies(bureau_full)
bureau_full.head()

Unnamed: 0,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
0,5714462,-497,0,-153.0,-153.0,,0,91323.0,0.0,,...,False,False,False,False,False,False,False,False,False,False
1,5714463,-208,0,1075.0,,,0,225000.0,171342.0,,...,False,False,False,False,False,False,False,False,False,False
2,5714464,-203,0,528.0,,,0,464323.5,,,...,False,False,False,False,False,False,False,False,False,False
3,5714465,-203,0,,,,0,90000.0,,,...,False,False,False,False,False,False,False,False,False,False
4,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# group by SK_ID_CURR and aggregate
bureau_full = bureau_full.groupby('SK_ID_CURR').agg(['mean', 'max', 'min', 'sum'])
bureau_full.head()

Unnamed: 0_level_0,SK_ID_BUREAU,SK_ID_BUREAU,SK_ID_BUREAU,SK_ID_BUREAU,DAYS_CREDIT,DAYS_CREDIT,DAYS_CREDIT,DAYS_CREDIT,CREDIT_DAY_OVERDUE,CREDIT_DAY_OVERDUE,...,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan,CREDIT_TYPE_Unknown type of loan,CREDIT_TYPE_Unknown type of loan,CREDIT_TYPE_Unknown type of loan
Unnamed: 0_level_1,mean,max,min,sum,mean,max,min,sum,mean,max,...,min,sum,mean,max,min,sum,mean,max,min,sum
SK_ID_CURR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,6055319.0,6055319,6055319,6055319,-63.0,-63,-63,-63,0.0,0,...,False,0,0.0,False,False,0,0.0,False,False,0
1,6061091.0,6061091,6061091,6061091,-2348.0,-2348,-2348,-2348,0.0,0,...,False,0,0.0,False,False,0,0.0,False,False,0
2,5612556.5,5612559,5612554,33675339,-810.333333,-30,-2901,-4862,0.0,0,...,False,0,0.0,False,False,0,0.0,False,False,0
3,5527739.0,5527742,5527736,38694173,-1131.428571,-116,-2865,-7920,0.0,0,...,False,0,0.0,False,False,0,0.0,False,False,0
4,5982731.5,5982734,5982729,35896389,-773.333333,-313,-1056,-4640,0.0,0,...,False,0,0.0,False,False,0,0.0,False,False,0


In [10]:
# flatten the column names
bureau_full.columns = ['_'.join(col).upper() for col in bureau_full.columns]

# drop SK_ID_BUREAU
bureau_full = bureau_full.drop(columns=['SK_ID_BUREAU_MEAN', 'SK_ID_BUREAU_MAX', 'SK_ID_BUREAU_MIN', 'SK_ID_BUREAU_SUM'])

# fill missing values with mean
bureau_full = bureau_full.fillna(bureau_full.mean())

bureau_full

Unnamed: 0_level_0,DAYS_CREDIT_MEAN,DAYS_CREDIT_MAX,DAYS_CREDIT_MIN,DAYS_CREDIT_SUM,CREDIT_DAY_OVERDUE_MEAN,CREDIT_DAY_OVERDUE_MAX,CREDIT_DAY_OVERDUE_MIN,CREDIT_DAY_OVERDUE_SUM,DAYS_CREDIT_ENDDATE_MEAN,DAYS_CREDIT_ENDDATE_MAX,...,CREDIT_TYPE_MORTGAGE_MIN,CREDIT_TYPE_MORTGAGE_SUM,CREDIT_TYPE_REAL ESTATE LOAN_MEAN,CREDIT_TYPE_REAL ESTATE LOAN_MAX,CREDIT_TYPE_REAL ESTATE LOAN_MIN,CREDIT_TYPE_REAL ESTATE LOAN_SUM,CREDIT_TYPE_UNKNOWN TYPE OF LOAN_MEAN,CREDIT_TYPE_UNKNOWN TYPE OF LOAN_MAX,CREDIT_TYPE_UNKNOWN TYPE OF LOAN_MIN,CREDIT_TYPE_UNKNOWN TYPE OF LOAN_SUM
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-63.000000,-63,-63,-63,0.0,0,0,0,237.000000,237.0,...,False,0,0.0,False,False,0,0.0,False,False,0
1,-2348.000000,-2348,-2348,-2348,0.0,0,0,0,-2044.000000,-2044.0,...,False,0,0.0,False,False,0,0.0,False,False,0
2,-810.333333,-30,-2901,-4862,0.0,0,0,0,-572.166667,274.0,...,False,0,0.0,False,False,0,0.0,False,False,0
3,-1131.428571,-116,-2865,-7920,0.0,0,0,0,-658.714286,502.0,...,False,0,0.0,False,False,0,0.0,False,False,0
4,-773.333333,-313,-1056,-4640,0.0,0,0,0,474.500000,1148.0,...,False,0,0.0,False,False,0,0.0,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307504,-1149.750000,-728,-1701,-4599,0.0,0,0,0,-557.750000,734.0,...,False,0,0.0,False,False,0,0.0,False,False,0
307505,-461.500000,-151,-772,-923,0.0,0,0,0,5931.500000,10188.0,...,False,0,0.0,False,False,0,0.0,False,False,0
307506,-1390.500000,-1222,-1559,-5562,0.0,0,0,0,-1115.500000,-856.0,...,False,0,0.0,False,False,0,0.0,False,False,0
307508,-765.428571,-95,-1957,-5358,0.0,0,0,0,587.666667,3061.0,...,False,1,0.0,False,False,0,0.0,False,False,0


In [11]:
bureau_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263491 entries, 0 to 307509
Columns: 588 entries, DAYS_CREDIT_MEAN to CREDIT_TYPE_UNKNOWN TYPE OF LOAN_SUM
dtypes: bool(84), float64(399), int64(105)
memory usage: 1.0 GB


In [12]:
target = pd.read_csv('processed-data/app_train.csv', index_col=0)
target.head()

Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Other,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Other,OCCUPATION_TYPE_Sales staff,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Self-employed
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1197000.0,44487.0,1197000.0,0.026392,-11945,-376,-574.0,-580,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,0,900000.0,26316.0,900000.0,0.003122,-19158,-9203,-12984.0,-2568,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,1,265851.0,11263.5,229500.0,0.031329,-14434,-3759,-4976.0,-3989,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,2,545040.0,20547.0,450000.0,0.004849,-15957,-6018,-10110.0,-5219,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0,0,512064.0,25033.5,360000.0,0.018801,-17851,-495,-43.0,-181,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [13]:
# merge bureau_full and target
bureau_full = bureau_full.merge(target, left_index=True, right_index=True)
bureau_full.head()

Unnamed: 0_level_0,DAYS_CREDIT_MEAN,DAYS_CREDIT_MAX,DAYS_CREDIT_MIN,DAYS_CREDIT_SUM,CREDIT_DAY_OVERDUE_MEAN,CREDIT_DAY_OVERDUE_MAX,CREDIT_DAY_OVERDUE_MIN,CREDIT_DAY_OVERDUE_SUM,DAYS_CREDIT_ENDDATE_MEAN,DAYS_CREDIT_ENDDATE_MAX,...,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Other,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Other,OCCUPATION_TYPE_Sales staff,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Self-employed
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-63.0,-63,-63,-63,0.0,0,0,0,237.0,237.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-2348.0,-2348,-2348,-2348,0.0,0,0,0,-2044.0,-2044.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-810.333333,-30,-2901,-4862,0.0,0,0,0,-572.166667,274.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-1131.428571,-116,-2865,-7920,0.0,0,0,0,-658.714286,502.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-773.333333,-313,-1056,-4640,0.0,0,0,0,474.5,1148.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [14]:
# check inf values
bureau_full.replace([np.inf, -np.inf], np.nan, inplace=True)
bureau_full.isnull().sum().sum()

# fill na with 0
for col in bureau_full.columns:
    bureau_full[col].fillna(0, inplace=True)
bureau_full.isnull().sum().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bureau_full[col].fillna(0, inplace=True)


0

In [15]:
X, y = bureau_full.drop(columns=['TARGET']), bureau_full['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg = LogisticRegression(class_weight='balanced', solver='newton-cholesky')
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict_proba(X_test_scaled)[:, 1]

gini(y_test, y_pred)

0.5069535455136944

In [3]:
y_test = pd.read_csv('processed-data/target.csv', index_col=0)
y_test.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
0,0
1,1
2,0
3,0
4,0
