In [1]:
#numpy and pandas for data manipulation
import numpy as np
import pandas as pd

#sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
samples = pd.read_csv('data/application_train.csv')
# # Create an anomalous flag column
samples['DAYS_EMPLOYED_ANOM'] = samples["DAYS_EMPLOYED"] == 365243

# # # Replace the anomalous values with nan
samples['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# Treating as outlier and removing
samples = samples[samples['CODE_GENDER'] != 'XNA']

train_label = samples['TARGET']
train_data = samples.drop(columns=['TARGET'])

In [3]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in train_data:
    if train_data[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(train_data[col].unique())) <= 2:
            # Train on the training data
            le.fit(train_data[col])
            # Transform 
            train_data[col] = le.transform(train_data[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


In [4]:
# one-hot encoding of categorical variables
train_data = pd.get_dummies(train_data)

In [5]:
train_data['CREDIT_INCOME_PERCENT'] = train_data['AMT_CREDIT'] / train_data['AMT_INCOME_TOTAL']
train_data['ANNUITY_INCOME_PERCENT'] = train_data['AMT_ANNUITY'] / train_data['AMT_INCOME_TOTAL']
train_data['CREDIT_TERM'] = train_data['AMT_ANNUITY'] / train_data['AMT_CREDIT']
train_data['DAYS_EMPLOYED_PERCENT'] = train_data['DAYS_EMPLOYED'] / train_data['DAYS_BIRTH']

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer


# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(train_data)

# Transform both training and testing data
train_data = imputer.transform(train_data)

# Repeat with the scaler
scaler.fit(train_data)
train_data = scaler.transform(train_data)

print('Training data shape: ', train_data.shape)

Training data shape:  (307507, 245)


In [7]:
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict

# Make the model with the specified regularization parameter
# log_reg =

# 80-20 train-test case
kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
log_reg = LogisticRegression(C = 0.01, max_iter=500, solver='sag', fit_intercept=True)
y_pred = cross_val_predict(estimator=log_reg , X=train_data, y=train_label, cv=kf)
from sklearn import metrics

print(metrics.confusion_matrix(train_label, y_pred))
print(metrics.classification_report(train_label ,y_pred))
print("roc - {}".format(metrics.roc_auc_score(train_label, y_pred)))

[[282573    109]
 [ 24718    107]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    282682
           1       0.50      0.00      0.01     24825

    accuracy                           0.92    307507
   macro avg       0.71      0.50      0.48    307507
weighted avg       0.89      0.92      0.88    307507

roc - 0.5019622894537021


In [18]:
len(samples.columns)

123

In [23]:
from sklearn.model_selection import StratifiedKFold

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_predict

# 80-20 train-test case
kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

#mean of number of hidden nodes(1 layer is enough for) between input nodes and output nodes
total_hidden_nodes = int((2 + (len(samples.columns)-1))/2)
log_reg = MLPClassifier(hidden_layer_sizes=(total_hidden_nodes,),activation='logistic', solver='sgd', batch_size=200,
                       learning_rate_init=0.001, max_iter=200, alpha=0.0001)
y_pred = cross_val_predict(estimator=log_reg , X=train_data, y=train_label, cv=kf)
from sklearn import metrics

print(metrics.confusion_matrix(train_label, y_pred))
print(metrics.classification_report(train_label ,y_pred))
print("roc - {}".format(metrics.roc_auc_score(train_label, y_pred)))

[[282572    110]
 [ 24734     91]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    282682
           1       0.45      0.00      0.01     24825

    accuracy                           0.92    307507
   macro avg       0.69      0.50      0.48    307507
weighted avg       0.88      0.92      0.88    307507

roc - 0.5016382648911916
