In [21]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import pickle

In [22]:
# for dirname, _, filenames in os.walk('/mnt'):
#     for filename in filenames:
#         print(dirname,filename)

In [23]:
# Load the raw Kaggle test split into a DataFrame (path is hard-coded to the /mnt mount).
test_data_input = pd.read_csv(filepath_or_buffer='/mnt/cs-test.csv')

In [24]:
# Preview the first five rows of the freshly loaded test data.
test_data_input.head(n=5)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,5,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


In [25]:
# Keep every column except the exported row counter ('Unnamed: 0') and the target
# column ('SeriousDlqin2yrs'). Column names that are absent are simply ignored.
# The explicit .copy() makes the result an independent frame, so the in-place
# writes performed by the later preprocessing cells (.loc assignments,
# inplace=True operations) do not raise pandas' SettingWithCopyWarning.
test_data_input = test_data_input.loc[:,~test_data_input.columns.isin(['Unnamed: 0','SeriousDlqin2yrs'])].copy()

## Preprocessing Test Data based on approach for Train Data

### RevolvingUtilizationOfUnsecuredLines

In [26]:
# Cap 'RevolvingUtilizationOfUnsecuredLines' at hard-coded outlier fences:
# values above upper_limit_outlier become valid_upper_val, values below
# lower_limit_outlier become valid_lower_val. NaN entries are left untouched
# because both comparisons evaluate to False for them.
# NOTE(review): the four constants are presumably taken from the training-data
# analysis - confirm they match the values used when the model was fitted.
upper_limit_outlier = 1.35
lower_limit_outlier = -0.24
valid_upper_val = 1.349255721
valid_lower_val = 0.0

revolving_col = 'RevolvingUtilizationOfUnsecuredLines'

above_fence = test_data_input[revolving_col] > upper_limit_outlier
test_data_input.loc[above_fence, revolving_col] = valid_upper_val

# Evaluated after the upper cap has been written.
below_fence = test_data_input[revolving_col] < lower_limit_outlier
test_data_input.loc[below_fence, revolving_col] = valid_lower_val


### age

In [27]:
# Strategy switch for preprocessing the 'age' feature.
# options are: outlier, bin or adaptive_bin
#   'bin'          -> fixed-width bins (handled by the `if ... == 'bin'` cell below)
#   'adaptive_bin' -> quantile bins (handled by the `if ... == 'adaptive_bin'` cell below)
#   'outlier'      -> no handler is visible in this notebook; with this value
#                     'age' passes through unchanged. NOTE(review): confirm intended.
age_bin_or_outlier = 'adaptive_bin'

In [28]:
if age_bin_or_outlier == 'bin':
    # Fixed-width age bins, encoded as ordinal codes:
    #   age <= 25 -> 1, (25, 35] -> 2, ..., (85, 95] -> 8, anything else -> 9.
    # Ages that fail every comparison (e.g. NaN) therefore land in code 9.
    age_upper_bounds = (25, 35, 45, 55, 65, 75, 85, 95)

    # Start from the catch-all code and walk the bounds from largest to smallest,
    # so the smallest bound that still contains the age is written last and wins.
    age_bin_codes = len(age_upper_bounds) + 1
    for bin_code in range(len(age_upper_bounds), 0, -1):
        age_bin_codes = np.where(test_data_input['age'] <= age_upper_bounds[bin_code - 1],
                                 bin_code,
                                 age_bin_codes)

    # Overwrite 'age' with its bin code and show which codes actually occur.
    test_data_input['age'] = age_bin_codes

    print(set(test_data_input['age']))

In [29]:
if age_bin_or_outlier == 'adaptive_bin':
    # Quantile ("adaptive") binning: split 'age' into `quantiles` equally populated
    # bins and label them 0 .. quantiles-1. With quantiles = 4 the bins would be the
    # 0-25th, 25-50th, 50-75th and 75-100th percentiles.
    # NOTE(review): the bin edges are computed from the test ages themselves, not
    # from stored training edges - confirm this matches how the model was trained.
    quantiles = 10
    quantile_label = list(range(quantiles))

    age_quantile_codes = pd.qcut(test_data_input['age'],
                                 q=quantiles,
                                 labels=quantile_label)

    # Overwrite 'age' with its (categorical) quantile label and show the labels in use.
    test_data_input['age'] = age_quantile_codes

    print(set(test_data_input['age']))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


### Exploring three seemingly related PastDue features together

#### NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate


In [30]:
## Approach 2: bin the dataset
## The three PastDue features are very strongly positively correlated, so the same
## binning concept and the same thresholds are applied to all three of them.

## To choose the bins, first inspect the value distribution:
# print(sorted(Counter(test_data_input['NumberOfTime30-59DaysPastDueNotWorse']).items()))
## The other two features are related to this one, and their Counter() output shows
## broadly the same trend: the large majority of rows have the value 0, i.e. no
## PastDue event at all.
## The thresholds follow a "1 week, 2 weeks, 1 month" line of thinking - e.g. a
## customer who gets a paycheck next week may be able to clear the due amount then.
## This is only the thought process used to define the bins and does not
## necessarily hold true, but it at least gives the thresholds a rationale.

one_day = 1
one_week = one_day * 7
half_month = one_week * 2
one_month = one_week * 4

## Threshold -> bin code lookup. The next cell applies it as follows:
##   value of 0         -> 0
##   value in (0, 7]    -> 1
##   value in (7, 14]   -> 2
##   value in (14, 28]  -> 3
##   value above 28     -> 4
custom_bins = dict(zip((0, one_week, half_month, one_month), range(4)))

print(custom_bins)

{0: 0, 7: 1, 14: 2, 28: 3}


In [31]:
# Replace each PastDue feature by an ordinal bin code derived from custom_bins:
#   value <= 0 or NaN -> 0, (0, 7] -> 1, (7, 14] -> 2, (14, 28] -> 3, > 28 -> 4.
# The column is overwritten in place, so re-running this cell would re-bin the
# already coded values.
past_due_columns = ['NumberOfTime60-89DaysPastDueNotWorse',
                    'NumberOfTimes90DaysLate',
                    'NumberOfTime30-59DaysPastDueNotWorse']

for past_due_feature in past_due_columns:
    raw_values = test_data_input[past_due_feature]

    # Walk the thresholds in ascending order; the highest threshold a value
    # exceeds is applied last and therefore determines its final code.
    binned_codes = 0
    for lower_edge in (0, one_week, half_month, one_month):
        binned_codes = np.where(raw_values > lower_edge,
                                custom_bins[lower_edge] + 1,
                                binned_codes)

    test_data_input[past_due_feature] = binned_codes

    # Frequency of each bin code, sorted by code.
    print(sorted(Counter(test_data_input[past_due_feature]).items()))


[(0, 96375), (1, 4910), (2, 4), (4, 214)]
[(0, 95785), (1, 5464), (2, 37), (3, 3), (4, 214)]
[(0, 85190), (1, 16051), (2, 47), (3, 1), (4, 214)]


### DebtRatio

In [32]:
# Cap 'DebtRatio' at hard-coded outlier fences: values above outlier_upper_range
# become non_outlier_highest_value, values below outlier_lower_range become
# non_outlier_lowest_value. NaN entries fail both comparisons and stay as they are.
# NOTE(review): constants presumably come from the training-data analysis - confirm.
outlier_upper_range = 1.91
outlier_lower_range = -0.86
non_outlier_highest_value = 1.91
non_outlier_lowest_value = 0.0

debt_ratio_col = 'DebtRatio'

exceeds_upper = test_data_input[debt_ratio_col] > outlier_upper_range
test_data_input.loc[exceeds_upper, debt_ratio_col] = non_outlier_highest_value

# Evaluated after the upper cap has been written.
under_lower = test_data_input[debt_ratio_col] < outlier_lower_range
test_data_input.loc[under_lower, debt_ratio_col] = non_outlier_lowest_value

### Replacing NaNs with 0


In [33]:
# Substitute 0 for every remaining NaN, across all columns, modifying the frame in place.
test_data_input.replace(to_replace=np.nan, value=0, inplace=True)

### Load a saved ML Model

In [34]:
# # file_name = 'RandomForestMLModel.sav'
# # file_name = 'XGBoostMLModel.sav'
# file_name = 'SVMMLModel.sav'
# if 'XGBoost' in file_name:
#     test_data_input['age'] = pd.to_numeric(test_data_input['age'])

# model = pickle.load(open(file_name,'rb'))

In [35]:
# For Neural Network Model
# Rebuilds the Keras model from '<file_name>.json' (architecture) and
# '<file_name>.h5' (weights); both files are resolved relative to the current
# working directory.

from keras.models import model_from_json
file_name = 'NeuralNetworkMLModel'

# load json and create model
# The context manager closes the file even if reading raises, which the manual
# open()/close() pair did not guarantee.
with open(file_name+'.json', 'r') as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
# load weights into new model
model.load_weights(file_name+".h5")
print("Loaded model from disk")

Loaded model from disk


### Predict on Test Dataset

In [37]:
# Score the preprocessed test set with whichever model `file_name` refers to.
if file_name == 'SVMMLModel.sav':
    # The SVM path uses hard class predictions rather than probabilities.
    predicted_output = model.predict(test_data_input)
elif file_name == 'NeuralNetworkMLModel':
    # Keras: predict() returns the network outputs. predict_proba() was only a
    # deprecated alias on Sequential models and no longer exists in current
    # Keras/TensorFlow releases, so predict() is used instead.
    # NOTE(review): assumes the network has a single output unit giving P(class 1) - confirm.
    predicted_output = model.predict(test_data_input)
else:
    # scikit-learn style classifiers return one column per class, ordered as in
    # `classes_`. Keep only the positive-class column so the submission holds
    # P(SeriousDlqin2yrs == 1) instead of P(class 0).
    # NOTE(review): assumes a binary classifier trained on labels {0, 1}.
    predicted_output = model.predict_proba(test_data_input)[:, 1]

### Create DataFrame for Kaggle entry

In [33]:
# Assemble the two-column Kaggle submission: a 1-based row 'Id' and, as
# 'Probability', the first column of the prediction output.
row_count = len(predicted_output)
first_prediction_column = pd.DataFrame(predicted_output)[0]

data = {
    'Id': list(range(1, row_count + 1)),
    'Probability': list(first_prediction_column),
}
entry_df = pd.DataFrame(data)

In [34]:
# Preview the first five rows of the submission frame.
entry_df.head(n=5)

Unnamed: 0,Id,Probability
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


### Save DataFrame to CSV

In [35]:
# Name the submission after the model file, minus its extension.
# os.path.splitext only strips a real extension ('SVMMLModel.sav' -> 'SVMMLModel')
# and leaves extension-less names such as 'NeuralNetworkMLModel' intact; blindly
# cutting the last four characters would truncate those to 'NeuralNetworkMLM'.
entry_name = 'entry_' + os.path.splitext(file_name)[0] + '.csv'
# Write the submission without the DataFrame index column.
entry_df.to_csv(entry_name,index=False)