In [3]:
#numpy and pandas for data manipulation
import numpy as np
import pandas as pd

#sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict

In [4]:
file_data = pd.read_csv('data/application_train.csv')
max_size = file_data['TARGET'].value_counts().max()
print("0 - " + str(max_size))
print("1 - " + str(file_data['TARGET'].value_counts().min()))
lst = [file_data]
for class_index, group in file_data.groupby('TARGET'):
    lst.append(group.sample(max_size-len(group), replace=True))
samples = pd.concat(lst)

0 - 282686
1 - 24825


In [5]:
print("1 - " + str(samples['TARGET'].value_counts().min()))

1 - 282686


In [6]:
# # Create an anomalous flag column
samples['DAYS_EMPLOYED_ANOM'] = samples["DAYS_EMPLOYED"] == 365243

# # # Replace the anomalous values with nan
samples['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# Treating as outlier and removing
samples = samples[samples['CODE_GENDER'] != 'XNA']

train_label = samples['TARGET']
train_data = samples.drop(columns=['TARGET'])

correlations = samples.corr()

# Select upper triangle of correlation matrix
upper = correlations.where(np.triu(np.ones(correlations.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.99)]

# Drop highly correlated features
train_data = train_data.drop(train_data[to_drop], axis=1)

In [7]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in train_data:
    if train_data[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(train_data[col].unique())) <= 2:
            # Train on the training data
            le.fit(train_data[col])
            # Transform 
            train_data[col] = le.transform(train_data[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


In [8]:
# one-hot encoding of categorical variables
train_data = pd.get_dummies(train_data)

In [9]:
train_data['CREDIT_INCOME_PERCENT'] = train_data['AMT_CREDIT'] / train_data['AMT_INCOME_TOTAL']
train_data['ANNUITY_INCOME_PERCENT'] = train_data['AMT_ANNUITY'] / train_data['AMT_INCOME_TOTAL']
train_data['CREDIT_TERM'] = train_data['AMT_ANNUITY'] / train_data['AMT_CREDIT']
train_data['DAYS_EMPLOYED_PERCENT'] = train_data['DAYS_EMPLOYED'] / train_data['DAYS_BIRTH']

In [10]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer


# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(train_data)

# Transform both training and testing data
train_data = imputer.transform(train_data)

# Repeat with the scaler
scaler.fit(train_data)
train_data = scaler.transform(train_data)

print('Training data shape: ', train_data.shape)

Training data shape:  (565368, 231)


In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(train_data, train_label, stratify=train_label, test_size=0.2)

In [12]:
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict

# Make the model with the specified regularization parameter

# 80-20 train-test case
kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
log_reg = LogisticRegression(C = 0.001, max_iter=100, solver='sag', fit_intercept=True)
y_pred = cross_val_predict(estimator=log_reg , X=x_train, y=y_train, cv=kf)

from sklearn import metrics

# print(metrics.confusion_matrix(y_train, y_pred))
# print(metrics.classification_report(train_label ,y_pred))
print("roc - {}".format(metrics.roc_auc_score(y_train, y_pred)))

roc - 0.6802983564386093


In [14]:
from sklearn.model_selection import StratifiedKFold

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_predict

# 80-20 train-test case
mlp_kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

#mean of number of hidden nodes(1 layer is enough for) between input nodes and output nodes
total_hidden_nodes = int((2 + (len(samples.columns)-1))/2)
mlp = MLPClassifier(hidden_layer_sizes=(total_hidden_nodes-10,),activation='relu', solver='adam', 
                    batch_size=400, learning_rate_init=0.01, max_iter=1000, alpha=0.0001)
y_pred = cross_val_predict(estimator=mlp , X=x_train, y=y_train, cv=mlp_kf)
from sklearn import metrics

print(metrics.confusion_matrix(y_train, y_pred))
# print(metrics.classification_report(train_label ,y_pred))
print("roc - {}".format(metrics.roc_auc_score(y_train, y_pred)))

[[153587  72264]
 [ 56106 170337]]
roc - 0.7161329381088406


In [15]:
len(x_train)

452294

In [16]:
452294**0.5

672.5280663288336