In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [None]:
# Load every CSV table from the current working directory (paths are relative).
# Main application tables: these are the two frames the visible cells below operate on.
train_df = pd.read_csv('application_train.csv')
test_df = pd.read_csv('application_test.csv')
# Auxiliary tables.
# NOTE(review): none of the frames below is referenced again in the visible cells —
# confirm they are needed before paying the load time and memory cost.
bureau_balance = pd.read_csv('bureau_balance.csv')
bureau = pd.read_csv('bureau.csv')
credit_card_balance = pd.read_csv('credit_card_balance.csv')
HomeCredit_columns_description = pd.read_csv('HomeCredit_columns_description.csv')
installments_payments =  pd.read_csv('installments_payments.csv')
POS_CASH_balance = pd.read_csv('POS_CASH_balance.csv')
previous_application = pd.read_csv('previous_application.csv')

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isnull().sum()

In [None]:
def missing_value_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, indexed by column name,
        with the columns ``'missing values'`` (null count) and
        ``'% of total values'`` (share of rows that are null, rounded to one
        decimal). Rows are ordered by that percentage, largest first.

    Also prints a two-line summary: the total number of columns in ``df`` and
    how many of them have missing values.
    """
    # Null count per column, and the same figure as a percentage of all rows.
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Side-by-side table; the unnamed Series become columns 0 and 1.
    summary = pd.concat([null_counts, null_share], axis=1)
    summary = summary.rename(columns={0: 'missing values', 1: '% of total values'})

    # Keep only the columns that actually have nulls, worst offenders first.
    has_missing = summary.iloc[:, 1] != 0
    summary = (
        summary[has_missing]
        .sort_values('% of total values', ascending=False)
        .round(1)
    )

    print('Your selected dataframe has ' + str(df.shape[1]) + ' columns.\n'
          'There are ' + str(summary.shape[0]) +
          ' columns that have missing values'
    )

    return summary


In [None]:
# Table of the training columns that contain nulls, with counts and percentages.
missing_value_table(train_df)

In [None]:
# Number of distinct non-null values in every object-dtype column.
train_df.select_dtypes(include='object').nunique()

In [None]:
# Label-encode object columns that have at most two distinct values
# (NaN counts as a value here); wider categoricals are left for one-hot encoding.
le = LabelEncoder()
le_count = 0

for col in train_df.columns:
    is_object = train_df[col].dtype == 'object'
    if not (is_object and train_df[col].nunique(dropna=False) <= 2):
        continue

    # Fit on the training column only, then apply the same mapping to both frames.
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    le_count += 1

print('%d columns were label encoded' %le_count)

In [None]:
# One-hot encode the remaining categorical columns of both frames.
train_df, test_df = pd.get_dummies(train_df), pd.get_dummies(test_df)

print('Training Feature shape: ', train_df.shape)
print('Test Feature shape: ', test_df.shape)

In [None]:
# Set the labels aside: the inner column join below drops every column that is
# not present in both frames, and TARGET exists only in the training frame.
train_labels = train_df['TARGET']

# Keep only the columns the two frames share.
train_df, test_df = train_df.align(test_df, join='inner', axis=1)

# Put the labels back as the last training column.
train_df = train_df.assign(TARGET=train_labels)

print('Training Feature Shape: ', train_df.shape)
print('Test Feature Shape: ', test_df.shape)

In [None]:
# Summary statistics of DAYS_BIRTH divided by -365 (days expressed as years).
train_df['DAYS_BIRTH'].div(-365).describe()

In [None]:
# Summary statistics of the raw DAYS_EMPLOYED column.
train_df['DAYS_EMPLOYED'].describe()

In [None]:
# Histogram of the raw DAYS_EMPLOYED values in the training frame.
employed_ax = train_df['DAYS_EMPLOYED'].plot.hist(title='Days Employed Histogram')
employed_ax.set_xlabel('Days Employment')

In [None]:
# Flag the rows whose DAYS_EMPLOYED equals 365243, the value this notebook
# treats as anomalous, so the information survives the replacement below.
train_df['DAYS_EMPLOYED_ANOM'] = train_df['DAYS_EMPLOYED'] == 365243

# Replace the anomalous value with NaN. Assign the result back instead of
# calling replace(..., inplace=True) on the column selection: the chained
# in-place form warns on pandas 2.x and leaves train_df unchanged under
# Copy-on-Write.
train_df['DAYS_EMPLOYED'] = train_df['DAYS_EMPLOYED'].replace({365243: np.nan})

train_df['DAYS_EMPLOYED'].plot.hist(title='Days Employment Histogram')
plt.xlabel('Days Employment')

In [None]:
# Flag rows of the test frame whose DAYS_EMPLOYED equals 365243, then replace
# that value with NaN.
test_df['DAYS_EMPLOYED_ANOM'] = test_df['DAYS_EMPLOYED'] == 365243

# Assign back rather than using a chained inplace=True call, which warns on
# pandas 2.x and leaves test_df unchanged under Copy-on-Write.
test_df['DAYS_EMPLOYED'] = test_df['DAYS_EMPLOYED'].replace({365243: np.nan})

print('There are %d anomalies in the test data out of %d entries' % (test_df['DAYS_EMPLOYED_ANOM'].sum(),len(test_df)))

In [None]:
# Correlation of every column with TARGET, ordered from most negative to most positive.
correlations = train_df.corr().loc[:, 'TARGET'].sort_values(ascending=True)

# The last 15 entries are the strongest positive correlations, the first 15 the strongest negative.
print('Most Positive Correlations:\n', correlations.iloc[-15:])
print('\nMost Negative Correlaitons:\n', correlations.iloc[:15])


In [None]:
# Make DAYS_BIRTH positive and find its correlation with the target.
train_df['DAYS_BIRTH'] = abs(train_df['DAYS_BIRTH'])

# Apply the same transformation to the test frame. Later cells build features
# from test_df['DAYS_BIRTH'] and feed them to a model trained on the training
# values, so the two frames must use the same sign convention.
test_df['DAYS_BIRTH'] = abs(test_df['DAYS_BIRTH'])

train_df['DAYS_BIRTH'].corr(train_df['TARGET'])


In [None]:
# Switch matplotlib to the 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')

# Histogram of client age, with DAYS_BIRTH converted from days to years.
age_in_years = train_df['DAYS_BIRTH'] / 365
plt.hist(age_in_years, bins=25, edgecolor='k')
plt.title('Age of Client')
plt.xlabel('Age(years)')
plt.ylabel('Count');


In [None]:
plt.figure(figsize=(12,8))

# KDE plot of age (in years) for loans that were repaid on time
sns.kdeplot(train_df.loc[train_df['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label='target = 0')

# KDE plot of age (in years) for loans that were not repaid on time
sns.kdeplot(train_df.loc[train_df['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label='target = 1')

# Draw the legend explicitly: recent seaborn releases no longer add one just
# because `label` was passed, which would leave the two curves unidentified.
plt.legend()

# Labeling of plot
plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');


In [None]:
# Age information in a separate dataframe. Take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of
# train_df (which triggers SettingWithCopyWarning).
age_data = train_df[['TARGET','DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365

# Bin the age data into ten equal-width intervals between 20 and 70 years
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'],bins=np.linspace(20,70,num=11))
age_data.head(10)

In [None]:
# Group by the age bin and average every column.
# YEARS_BINNED is categorical; observed=False spells out the behaviour the
# original call relied on (empty bins are kept) instead of depending on a
# pandas default that is deprecated and scheduled to change.
age_groups = age_data.groupby('YEARS_BINNED', observed=False).mean()
age_groups

In [None]:
plt.figure(figsize=(8,8))

# One bar per age bin; the height is the mean of TARGET in that bin as a percentage.
bin_labels = age_groups.index.astype(str)
target_rate_pct = 100 * age_groups['TARGET']
plt.bar(bin_labels, target_rate_pct)

# Axis decoration
plt.xticks(rotation=75)
plt.xlabel('Age Group(years)')
plt.ylabel('Failure to repay(%)')
plt.title('Failure to repay by age group')

In [None]:
# Pull out TARGET, the three EXT_SOURCE columns and DAYS_BIRTH, then compute
# their pairwise correlation matrix.
ext_data = train_df.loc[
    :, ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
# Annotated heatmap of the correlation matrix, colour scale clipped to [-0.25, 0.6].
heatmap_ax = sns.heatmap(
    ext_data_corrs,
    cmap=plt.cm.RdYlBu_r,
    vmin=-0.25,
    vmax=0.6,
    annot=True,
)
heatmap_ax.set_title('Correlation Heatmap');


In [None]:
plt.figure(figsize=(10,12))

# One subplot per EXT_SOURCE column, each showing one KDE curve per TARGET value
for i, source in enumerate(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']):
    # subplot indices are 1-based
    plt.subplot(3,1,i+1)
    # plot repaid loans
    sns.kdeplot(train_df.loc[train_df['TARGET'] == 0, source], label = 'target=0')
    # plot loans that were not repaid
    sns.kdeplot(train_df.loc[train_df['TARGET'] == 1, source], label='target = 1')

    # Draw the legend explicitly: recent seaborn releases no longer add one
    # just because `label` was passed.
    plt.legend()

    # label the plots
    plt.title('Distribution of %s by Target Value'%source)
    plt.xlabel('%s'%source)
    plt.ylabel('Density')


plt.tight_layout(h_pad = 2.5)


In [None]:
# Copy the EXT_SOURCE columns (plus TARGET) so the plotting frame can be modified freely
plot_data = ext_data.drop(columns=['DAYS_BIRTH']).copy()

# Add in the age of client in years
plot_data['YEARS_BIRTH'] = age_data['YEARS_BIRTH']

# Drop rows with any NaN, then keep the rows whose index LABEL is <= 100000.
# .loc slices by label, not position, so after dropna() this can return fewer
# than 100000 rows.
plot_data = plot_data.dropna().loc[:100000, :]

def corr_func(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient of x and y.

    The (x, y, **kwargs) signature has the shape of a PairGrid mapping
    callback; extra keyword arguments are accepted and ignored.
    NOTE(review): this function is defined but not mapped onto the grid below.
    """
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size = 20)

# Create the pairgrid object, coloured by TARGET, over every column except TARGET
grid = sns.PairGrid(data = plot_data, height = 3, diag_sharey=False,
                    hue = 'TARGET',
                    vars = [x for x in list(plot_data.columns) if x != 'TARGET'])

# Upper triangle is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Lower triangle is a density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

# `size` sets the title font size. The original passed `height = 32`, which is
# not a text property and makes matplotlib raise.
plt.suptitle('Ext Source and Age Features Pairs Plot', size = 32, y = 1.05);


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

# Columns that feed the polynomial expansion
poly_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

# Labels are kept separately so they are not imputed or expanded
poly_target = train_df['TARGET']

# Median-impute missing values; the medians are learned from the training
# rows only and then reused for the test rows
imputer = SimpleImputer(strategy='median')
poly_features = imputer.fit_transform(train_df[poly_columns])
poly_features_test = imputer.transform(test_df[poly_columns])

# Polynomial expansion up to degree 3 (fitted in a later cell)
poly_transformer = PolynomialFeatures(degree=3)


In [None]:
# Fit the expansion on the training matrix and transform it in one step,
# then apply the fitted transformer to the test matrix.
poly_features = poly_transformer.fit_transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial Features Shape: ', poly_features.shape)


In [None]:
# First 15 generated feature names.
# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever one this installation provides.
poly_name_getter = (getattr(poly_transformer, 'get_feature_names_out', None)
                    or poly_transformer.get_feature_names)
poly_name_getter(input_features=['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])[:15]


In [None]:
# Create a dataframe of the expanded features, with readable column names.
# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever one this installation provides.
poly_name_getter = (getattr(poly_transformer, 'get_feature_names_out', None)
                    or poly_transformer.get_feature_names)
poly_features = pd.DataFrame(poly_features,
                             columns = poly_name_getter([
                                 'EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'
                             ])
)

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlation with the target, sorted ascending
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display the most negative and the most positive correlations
print(poly_corrs.head(10))
print(poly_corrs.tail(5))


In [None]:
# Put test features into a dataframe with readable column names.
# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever one this installation provides.
poly_name_getter = (getattr(poly_transformer, 'get_feature_names_out', None)
                    or poly_transformer.get_feature_names)
poly_features_test = pd.DataFrame(poly_features_test,
                                  columns = poly_name_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2',
                                                              'EXT_SOURCE_3', 'DAYS_BIRTH']
))

# Merge polynomial features into the training dataframe, keyed on SK_ID_CURR.
# Columns present on both sides get pandas' default _x/_y suffixes.
poly_features['SK_ID_CURR'] = train_df['SK_ID_CURR']
train_df_poly = train_df.merge(poly_features, on = 'SK_ID_CURR', how = 'left')

# Merge polynomial features into the testing dataframe
poly_features_test['SK_ID_CURR'] = test_df['SK_ID_CURR']
test_df_poly = test_df.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')

# Align the dataframes: keep only the columns that exist in both
train_df_poly,test_df_poly = train_df_poly.align(test_df_poly,join='inner',axis=1)

# Print out the new shapes
print('Training data with polynomial features shape: ', train_df_poly.shape)
print('Testing data with polynomial features shape:  ', test_df_poly.shape)


In [None]:
# Independent copy of the test frame; its ratio columns are added in the next cell.
test_df_domain = test_df.copy()

# New training frame with four ratio features appended. assign() returns a
# copy, so train_df itself is left untouched.
train_df_domain = train_df.assign(
    CREDIT_INCOME_PERCENT=train_df['AMT_CREDIT'] / train_df['AMT_INCOME_TOTAL'],
    ANNUITY_INCOME_PERCENT=train_df['AMT_ANNUITY'] / train_df['AMT_INCOME_TOTAL'],
    CREDIT_TERM=train_df['AMT_ANNUITY'] / train_df['AMT_CREDIT'],
    DAYS_EMPLOYED_PERCENT=train_df['DAYS_EMPLOYED'] / train_df['DAYS_BIRTH'],
)

In [None]:
# Append the same four ratio features to the test-side frame.
test_df_domain = test_df_domain.assign(
    CREDIT_INCOME_PERCENT=test_df_domain['AMT_CREDIT'] / test_df_domain['AMT_INCOME_TOTAL'],
    ANNUITY_INCOME_PERCENT=test_df_domain['AMT_ANNUITY'] / test_df_domain['AMT_INCOME_TOTAL'],
    CREDIT_TERM=test_df_domain['AMT_ANNUITY'] / test_df_domain['AMT_CREDIT'],
    DAYS_EMPLOYED_PERCENT=test_df_domain['DAYS_EMPLOYED'] / test_df_domain['DAYS_BIRTH'],
)

In [None]:
# Display the label column of the training frame.
train_df['TARGET']

In [None]:
# Features are every aligned column except the row identifier; labels come
# from the original training frame.
# NOTE(review): a later cell rebuilds X, y and the split from the renamed frame.
X = train_df_poly.drop(columns=['SK_ID_CURR'])
y = train_df['TARGET']

# Fix the seed so the hold-out split, and every score computed from it, is
# the same on each Restart & Run All.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [None]:
import re

# Matches every run of characters other than ASCII letters, digits and underscore.
non_identifier_chars = re.compile('[^A-Za-z0-9_]+')

# Strip those characters from the column names of both frames.
data = train_df_poly.rename(columns=lambda name: non_identifier_chars.sub('', name))
test_df_poly = test_df_poly.rename(columns=lambda name: non_identifier_chars.sub('', name))

In [None]:
# Remove the row identifier so the test columns match the training features.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
test_df_poly = test_df_poly.drop(columns=['SK_ID_CURR'], errors='ignore')

In [None]:
# Features are every column of the renamed frame except the row identifier;
# labels come from the original training frame.
X = data.drop(columns=['SK_ID_CURR'])
y = train_df['TARGET']

# Fix the seed so the hold-out split, and every score computed from it, is
# the same on each Restart & Run All.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [None]:
# Wrap the two splits as LightGBM datasets; the hold-out set references the
# training set.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)


In [None]:
# LightGBM parameter settings.
# NOTE(review): the next cell rebinds `params`, so this dict is not what the
# model is trained with unless that cell is changed.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # was misspelled 'vervose', which is not a LightGBM parameter name
    'verbose': 0
}

In [None]:

# Store the best parameters and the intermediate progress.
# This dict replaces the `params` dict built in the previous cell.
params = {
    'objective': 'binary',
    'metric': 'auc',
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# NOTE(review): these two containers are initialised here but are not passed
# to lgb.train below; best_params is rebound after training and history is
# not used again in the visible cells.
best_params, history = {}, []

# LightGBM training.
# `lgb` is bound to optuna.integration.lightgbm by the import cell, so this is
# presumably Optuna's tuning wrapper rather than plain lightgbm.train — confirm.
# NOTE(review): whether early_stopping_rounds is accepted here depends on the
# installed LightGBM/Optuna versions — verify.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                early_stopping_rounds=10
               )

# Parameters attached to the returned booster; displayed as the cell output.
best_params = gbm.params
best_params

In [None]:
# Predictions of the trained booster for the test rows (identifier already dropped).
predict1 = gbm.predict(test_df_poly)


In [None]:
# Summary statistics of the test-set predictions.
# The original cell called describe() on `p`, a name that is never defined in
# this notebook (NameError on a fresh kernel); the predictions computed just
# above are the only candidate it could have referred to.
pd.Series(predict1).describe()

In [None]:
# Predictions for the hold-out split.
# The original cell used `lgbcls`, which is never defined in this notebook;
# `gbm` is the only trained model available.
pred1 = gbm.predict(X_test)


In [None]:
# Turn the predicted scores into hard labels: 0 where the score is below 0.08, 1 otherwise.
below_cutoff = predict1 < 0.08
prediction = np.where(below_cutoff, 0, 1)
prediction

In [None]:
# Predictions used for the first submission file.
# The original cell used `lgbcls`, which is never defined in this notebook,
# and then called predict() with no arguments, which can only raise. Use the
# trained booster instead and drop the stray call.
predict = gbm.predict(test_df_poly)

In [None]:
# Build the submission frame (identifier plus prediction) and write it to CSV.
# assign() returns a new frame, so TARGET is not written onto a slice of
# test_df (which triggers SettingWithCopyWarning).
submit = test_df[['SK_ID_CURR']].assign(TARGET=predict)
submit.to_csv('submit1.csv',index=False)

In [None]:
# Build the submission frame from the thresholded 0/1 labels and write it to CSV.
# assign() returns a new frame, so TARGET is not written onto a slice of
# test_df (which triggers SettingWithCopyWarning).
submit = test_df[['SK_ID_CURR']].assign(TARGET=prediction)
submit.to_csv('submit3.csv',index=False)