In [1]:
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import confusion_matrix,roc_curve,auc,precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from collections import Counter


import numpy as np

import warnings

# Silence every warning for the remainder of the session. Note that this also
# hides deprecation notices emitted by the libraries used below.
warnings.simplefilter('ignore')



# Name of the binary label column used throughout the notebook.
TARGET = 'target'

# All flag-style columns of the application table.
BINARY_FEATURES = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL']
# Flags that are mapped through ENCODE_BINARY ('Y'/'N' -> 1/0) during preprocessing.
BINARY_FEATURES_ECOD = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
# Flags that are NOT passed through ENCODE_BINARY. Derived from the two lists
# above so it can never drift out of sync (it used to duplicate the encoded list).
BINARY_FEATURES_NO_ECOD = [col for col in BINARY_FEATURES if col not in BINARY_FEATURES_ECOD]

# Columns that are one-hot encoded during preprocessing.
CATEGORICAL_FEATURES = ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
# Numeric columns that are kept as they are.
CONTINUOUS_FEATURES = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'ACCOUNT_LENGTH']


# Lookup tables used to turn textual labels / flags into integers.
ENCODE_TARGET = {"Yes": 1, "No": 0}
ENCODE_BINARY = {'Y': 1, 'N': 0}

# Column removed during preprocessing.
COLS_TO_DROP = 'FLAG_MOBIL'

# Directory that holds the raw CSV files.
BASE_INPUT = './data/'

# Load the two raw tables: the per-month credit history and the application data.
credit_record_path = os.path.join(BASE_INPUT, 'credit_record.csv')
application_record_path = os.path.join(BASE_INPUT, 'application_record.csv')

records = pd.read_csv(credit_record_path)
data = pd.read_csv(application_record_path)


In [2]:

def encode_target(df_credit):
    """Replace the textual labels in the TARGET column by their integer codes.

    Every value of ``df_credit[TARGET]`` is looked up in ``ENCODE_TARGET``; a
    value that is not a key of that mapping raises ``KeyError``. The frame is
    modified in place and also returned.
    """

    def _label_to_code(label):
        return ENCODE_TARGET[label]

    encoded_labels = df_credit[TARGET].apply(_label_to_code)
    df_credit[TARGET] = encoded_labels
    return df_credit
    

def prepare_target(records):
    """Build one binary label per customer from the monthly credit records.

    A customer gets ``target == 1`` when at least one of their rows has a
    ``STATUS`` of '2', '3', '4' or '5', and ``target == 0`` otherwise.
    Customers whose rows all fall in ``MONTHS_BALANCE`` 0 or -1 (i.e. they
    have no older history) are removed from the result.

    Parameters
    ----------
    records : pandas.DataFrame
        Needs the columns ``ID``, ``STATUS`` and ``MONTHS_BALANCE``. A helper
        column ``status`` (1 for the statuses listed above, else 0) is added
        to this frame in place.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` and ``target``, one row per retained customer, with a
        fresh 0..n-1 index.
    """
    records['status'] = np.where(records['STATUS'].isin(['2', '3', '4', '5']), 1, 0)

    # Number of flagged months per customer, collapsed to a 0/1 label.
    records_grp = records[['ID', 'status']].groupby('ID').sum()
    records_grp['target'] = np.where(records_grp['status'] > 0, 1, 0)
    records_grp = records_grp.reset_index()

    # Customers that appear in months 0/-1 but in no earlier month.
    is_recent = records['MONTHS_BALANCE'].isin([0, -1])
    recent_ids = set(records.loc[is_recent, 'ID'])
    older_ids = set(records.loc[~is_recent, 'ID'])
    to_drop_ids = list(recent_ids - older_ids)

    # The filtered frame has to be assigned back; the bare expression used
    # previously computed the selection and then threw it away.
    records_grp = records_grp[~records_grp['ID'].isin(to_drop_ids)]
    records_grp = records_grp.reset_index(drop=True)
    return records_grp[['ID', 'target']]
    
    
def nan_occupation(data):
    """Flag missing occupations and replace them by the label "NotDefined".

    Adds a boolean column ``is_occupation_nan`` that is True where
    ``OCCUPATION_TYPE`` was null, then fills those nulls with "NotDefined".
    The frame is modified in place and also returned.
    """
    data['is_occupation_nan'] = data['OCCUPATION_TYPE'].isnull()
    # Assign the filled column back explicitly: an in-place fillna on a column
    # accessor may act on a temporary and leave the frame untouched.
    data['OCCUPATION_TYPE'] = data['OCCUPATION_TYPE'].fillna("NotDefined")
    return data
 
    

# def extra_features(data,record):

#     start_df=pd.DataFrame(record.groupby(['ID'])['MONTHS_BALANCE'].agg(min)).reset_index()
#     end_df=pd.DataFrame(record.groupby(['ID'])['MONTHS_BALANCE'].agg(max)).reset_index()
    
#     start_df.rename(columns={'MONTHS_BALANCE':'AGE_START'}, inplace=True)
#     end_df.rename(columns={'MONTHS_BALANCE':'AGE_MAX'}, inplace=True)
    
#     data=pd.merge(data, start_df, how='inner', on=['ID'])
#     data=pd.merge(data, end_df, how='inner', on=['ID'])
    
#     return data

    
def over_sample_minority(data):
    """Balance the classes by duplicating randomly chosen TARGET == 1 rows.

    As many extra rows as the difference between the number of
    ``TARGET == 0`` and ``TARGET == 1`` rows are drawn (with replacement, by
    ``ID``) from the ``TARGET == 1`` customers, appended to ``data`` and the
    result is shuffled. Original index labels are kept, so the returned frame
    contains repeated labels.

    When there is nothing to add (no ``TARGET == 1`` rows, or they are not
    outnumbered) the input is returned shuffled, without extra rows.

    Uses the global NumPy random state; seed it beforehand for reproducible
    output.
    """
    bad_customers = data.loc[data[TARGET] == 1, 'ID']
    good_customers = data.loc[data[TARGET] == 0, 'ID']

    total_samples = good_customers.shape[0] - bad_customers.shape[0]
    if total_samples <= 0 or bad_customers.empty:
        # np.random.choice cannot draw from an empty pool or a negative size.
        return data.sample(frac=1)

    choices = np.random.choice(bad_customers, total_samples)
    customers_counter = Counter(choices)

    # How often each row has to be repeated (0 for IDs that were not drawn).
    # Selecting by position keeps the column dtypes, which iterating row by
    # row with iterrows does not guarantee, and avoids the Python-level loop.
    repeats = np.array([customers_counter[customer_id] for customer_id in data['ID']], dtype=int)
    positions = np.repeat(np.arange(len(data)), repeats)
    extra_rows = data.iloc[positions]

    return pd.concat((extra_rows, data)).sample(frac=1)

 
    
def _dense_one_hot_encoder():
    """Return a OneHotEncoder that produces dense arrays on old and new scikit-learn."""
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        return OneHotEncoder(sparse=False)


def preprocessing(data, records, random_state=None):
    """Turn the raw application and credit tables into train/val/test frames.

    Steps:
      1. derive the label with ``prepare_target`` and inner-join it on ``ID``;
      2. drop rows that are identical in every column except ``ID`` and the
         label, keeping a ``TARGET == 1`` row when one exists;
      3. flag / fill missing occupations, drop ``COLS_TO_DROP``;
      4. map the 'Y'/'N' flags in ``BINARY_FEATURES_ECOD`` to 1/0;
      5. one-hot encode ``CATEGORICAL_FEATURES`` (one column per category,
         named after the category);
      6. split 80/20 into train+val / test, then 80/20 into train / val.

    Parameters
    ----------
    data : pandas.DataFrame
        Application records; not modified.
    records : pandas.DataFrame
        Monthly credit records, passed on to ``prepare_target``.
    random_state : int, RandomState or None, optional
        Forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the splits unseeded; pass an integer for reproducible splits.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, each with a fresh 0..n-1 index. The
        ``ID`` column is still present in all three.
    """
    target_df = prepare_target(records)
    data = pd.merge(data, target_df, on='ID', how='inner')

    # Sort so that, among otherwise identical rows, a TARGET == 1 row comes
    # first and is the one kept by drop_duplicates.
    data = data.sort_values(TARGET, ascending=False)
    dedup_cols = [col for col in data.columns if col not in ('ID', TARGET)]
    data = data.drop_duplicates(subset=dedup_cols, keep='first').reset_index(drop=True)

    data = nan_occupation(data)
    # DROP MOBILE FLAG COL AS ALL CUSTOMERS HAVE THE SAME VALUE ( NO VARIANCE FOR A MODEL TO LEARN FROM)
    data = data.drop(COLS_TO_DROP, axis=1)

    for col in BINARY_FEATURES_ECOD:
        data[col] = data[col].apply(lambda x: ENCODE_BINARY[x])

    for col in CATEGORICAL_FEATURES:
        encoder = _dense_one_hot_encoder()
        encoded = encoder.fit_transform(data[[col]].to_numpy())
        one_hot = pd.DataFrame(encoded, columns=encoder.categories_[0], index=data.index)
        data = data.drop(col, axis=1).join(one_hot)

    # Column names are stringified and stripped of parentheses, commas and
    # single quotes.
    strip_chars = str.maketrans('', '', "(),'")
    data.columns = [str(name).translate(strip_chars) for name in data.columns]

    train_df, test_df = train_test_split(data, test_size=0.2, random_state=random_state)
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=random_state)

    # Minority oversampling is currently not applied to the training split.

    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, val_df, test_df
    
    
    

# Run the full pipeline on the raw tables. The splits are random, so the exact
# rows in each frame differ between runs.
train,val,test = preprocessing(data,records)

    

In [3]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,ID,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,...,Low-skill Laborers,Managers,Medicine staff,NotDefined,Private service staff,Realty agents,Sales staff,Secretaries,Security staff,Waiters/barmen staff
0,5100334,0,1,0,247500.0,-23057,-4540,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5048473,1,0,0,225000.0,-9187,-308,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5009136,0,1,0,382500.0,-21673,365243,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5061095,0,1,1,180000.0,-14977,-6772,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5053987,1,0,0,76500.0,-16099,-181,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Persist the three splits as CSV files (without the index) in the data folder.
for csv_name, split_df in (('train.csv', train), ('val.csv', val), ('test.csv', test)):
    split_df.to_csv(os.path.join('data', csv_name), index=None)
