In [1]:
#numpy and pandas for data manipulation
import numpy as np
import pandas as pd

#sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict

In [12]:
samples = pd.read_csv('data/application_train.csv')
# Create an anomalous flag column
samples['DAYS_EMPLOYED_ANOM'] = samples["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan
samples['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# Treating as outlier and removing
samples = samples[samples['CODE_GENDER'] != 'XNA']

train_label = samples['TARGET']
train_data = samples.drop(columns=['TARGET'])

correlations = samples.corr()

# Select upper triangle of correlation matrix
upper = correlations.where(np.triu(np.ones(correlations.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.99)]

# Drop highly correlated features
train_data = train_data.drop(train_data[to_drop], axis=1)

In [17]:
train_data.shape

(307507, 226)

In [14]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in train_data:
    if train_data[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(train_data[col].unique())) <= 2:
            # Train on the training data
            le.fit(train_data[col])
            # Transform 
            train_data[col] = le.transform(train_data[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


In [16]:
# one-hot encoding of categorical variables
train_data = pd.get_dummies(train_data)

In [24]:
train_data['CREDIT_INCOME_PERCENT'] = train_data['AMT_CREDIT'] / train_data['AMT_INCOME_TOTAL']
train_data['ANNUITY_INCOME_PERCENT'] = train_data['AMT_ANNUITY'] / train_data['AMT_INCOME_TOTAL']
train_data['CREDIT_TERM'] = train_data['AMT_ANNUITY'] / train_data['AMT_CREDIT']
train_data['DAYS_EMPLOYED_PERCENT'] = train_data['DAYS_EMPLOYED'] / train_data['DAYS_BIRTH']

In [27]:
train_data = train_data.drop(columns =['SK_ID_CURR'])

In [29]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer


# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(train_data)

# Transform both training and testing data
train_data = imputer.transform(train_data)

# Repeat with the scaler
scaler.fit(train_data)
train_data = scaler.transform(train_data)

print('Training data shape: ', train_data.shape)

Training data shape:  (307507, 229)


In [30]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.2)

In [28]:
train_data

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,CREDIT_INCOME_PERCENT,ANNUITY_INCOME_PERCENT,CREDIT_TERM,DAYS_EMPLOYED_PERCENT
0,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,1,0,1,0,2.007889,0.121978,0.060749,0.067329
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0,0,0,0,1,0,4.790750,0.132217,0.027598,0.070862
2,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,2.000000,0.100000,0.050000,0.011814
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,2.316167,0.219900,0.094941,0.159905
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,4.222222,0.179963,0.042623,0.152418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,1,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,0,0,1,0,1,0,1.617143,0.174971,0.108198,0.025303
307507,0,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,0,0,1,0,1,0,3.743750,0.166687,0.044524,
307508,0,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,0,1,0,0,1,0,4.429176,0.195941,0.044239,0.529266
307509,0,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,0,0,1,0,1,0,2.164368,0.118158,0.054592,0.400134


In [31]:
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict

# Make the model with the specified regularization parameter

# 80-20 train-test case
kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
log_reg = LogisticRegression(C = 0.001, max_iter=100, solver='sag', fit_intercept=True,
                             class_weight='balanced')
y_pred = cross_val_predict(estimator=log_reg , X=x_train, y=y_train, cv=kf)
from sklearn import metrics

print(metrics.confusion_matrix(y_train, y_pred))
# print(metrics.classification_report(train_label ,y_pred))
print("roc - {}".format(metrics.roc_auc_score(y_train, y_pred)))

[[154387  71687]
 [  6593  13338]]
roc - 0.6760567414368054


In [32]:
final_log = LogisticRegression(C = 0.001, max_iter=100, solver='sag', fit_intercept=True,
                             class_weight='balanced')
final_log.fit(x_train, y_train)
test_pred = final_log.predict(x_test)
print(metrics.confusion_matrix(y_test, test_pred))
print("roc - {}".format(metrics.roc_auc_score(y_test, test_pred)))

[[38928 17680]
 [ 1631  3263]]
roc - 0.67720571537742


In [17]:
from sklearn.model_selection import StratifiedKFold

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_predict

# 80-20 train-test case
mlp_kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

#mean of number of hidden nodes(1 layer is enough for) between input nodes and output nodes
# total_hidden_nodes = int((2 + (len(samples.columns)-1))/2)
# root of total training samples
total_hidden_nodes = 490
mlp = MLPClassifier(hidden_layer_sizes=(total_hidden_nodes,),activation='relu', solver='adam', 
                    batch_size=400, learning_rate_init=0.01, max_iter=100, alpha=0.0001)
y_pred = cross_val_predict(estimator=mlp , X=x_train, y=y_train, cv=mlp_kf)
from sklearn import metrics

print(metrics.confusion_matrix(y_train, y_pred))
# print(metrics.classification_report(train_label ,y_pred))
print("roc - {}".format(metrics.roc_auc_score(y_train, y_pred)))

[[225792    336]
 [ 19784     93]]
roc - 0.5015964451809535


In [None]:
final_mlp = MLPClassifier(hidden_layer_sizes=(total_hidden_nodes-10,),activation='logistic', solver='sgd', 
                    batch_size=400, learning_rate_init=0.01, max_iter=200, alpha=0.0001)
final_mlp.fit(x_train, y_train)
test_pred = final_mlp.predict(x_test)
print(metrics.confusion_matrix(y_test, test_pred))
print("roc - {}".format(metrics.roc_auc_score(y_test, test_pred)))

AttributeError: 'MLPClassifier' object has no attribute 'output_activation_'