# Credit Analysis Project

In [40]:
# Mount Google Drive inside the Colab filesystem so files stored there can be read.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [41]:
#!pip install -U scikit-learn

## Imports and dataset loading

In [42]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

The process we follow to partition the data is described in the chart below.

![image.png](attachment:image.png)

In [43]:
# Read the dataset with read_table (delimited text, despite the .xls extension)
# and discard every row that contains a missing value.
df = pd.read_table('gdrive/MyDrive/TRNcod.xls').dropna()

# Preview the first rows
df.head()

Unnamed: 0,INDEX,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
0,0,1,1,1,0,0,0,0,0.135098,1,...,0,0,1,1,0,1,1,1,0,1
1,1,1,0,1,0,0,1,0,0.273504,1,...,0,1,0,1,1,0,0,0,1,0
2,2,1,0,1,0,0,1,0,0.28191,0,...,1,1,0,0,0,0,1,0,1,0
3,3,1,1,1,0,0,0,0,0.225741,0,...,1,1,0,1,1,0,1,0,1,0
4,4,1,1,0,0,0,1,0,0.480403,0,...,1,1,1,0,0,1,0,1,1,0


In [44]:
# Report the dataset dimensions: rows are instances, columns are attributes.
n_instances, n_attributes = df.shape
print('Quantidade de instâncias: {}\nQuantidade de atributos: {}\n'.format(n_instances, n_attributes))

Quantidade de instâncias: 389196
Quantidade de atributos: 246



### First Step: 
#### Splitting the Dataset into Two Classes

In [45]:
# Number of rows per value of the target column
class_counts = df['IND_BOM_1_2'].value_counts()
class_counts

0    255098
1    134098
Name: IND_BOM_1_2, dtype: int64

In [46]:
# Keep only the rows whose target column equals 0
is_class_0 = df['IND_BOM_1_2'].eq(0)
df0 = df[is_class_0]
print('Quantidade de instâncias na classe 0 :' + str(df0.shape[0]))

Quantidade de instâncias na classe 0 :255098


In [47]:
# Keep only the rows whose target column equals 1
is_class_1 = df['IND_BOM_1_2'].eq(1)
df1 = df[is_class_1]
print('Quantidade de instâncias na classe 1 :' + str(df1.shape[0]))

Quantidade de instâncias na classe 1 :134098


### Second Step: 
#### Splitting into train, validation and test sets and augmenting to compensate for class imbalance

In [48]:
# Shuffle the class-0 rows with a fixed seed so the split is repeatable
df0 = df0.sample(frac=1, random_state=33)

# Cut the shuffled rows at the 50% and 75% marks:
# first half -> train, next quarter -> validation, remainder -> test
len_df0 = len(df0)
half_0 = int(len_df0*.5)
three_quarters_0 = int(len_df0*.75)
train_0 = df0.iloc[:half_0]
validate_0 = df0.iloc[half_0:three_quarters_0]
test_0 = df0.iloc[three_quarters_0:]

print("Size of train_0: ", len(train_0), "\nSize of validate_0: ", len(validate_0), "\nSize of test_0:", len(test_0))

Size of train_0:  127549 
Size of validate_0:  63774 
Size of test_0: 63775


In [49]:
# Shuffle the class-1 rows with a fixed seed so the split is repeatable
df1 = df1.sample(frac=1, random_state=33)

# Cut the shuffled rows at the 50% and 75% marks:
# first half -> train, next quarter -> validation, remainder -> test
len_df1 = len(df1)
half_1 = int(len_df1*.5)
three_quarters_1 = int(len_df1*.75)
train_1 = df1.iloc[:half_1]
validate_1 = df1.iloc[half_1:three_quarters_1]
test_1 = df1.iloc[three_quarters_1:]

print("Size of train_1: ", len(train_1), "\nSize of validate_1: ", len(validate_1), "\nSize of test_1:", len(test_1))

Size of train_1:  67049 
Size of validate_1:  33524 
Size of test_1: 33525


#### We can see that class 1 contains fewer samples than class 0, so we must augment our data to avoid biased training

In [50]:
def _oversample_rows(minority, target_len, seed=33):
    """Return the extra rows needed to pad `minority` up to `target_len` rows.

    The rows are taken from a seeded shuffle of `minority`. If more rows are
    needed than `minority` contains, the remainder is drawn with replacement.
    An empty frame is returned when `minority` already has `target_len` rows
    or more.
    """
    # Clamp at zero: a negative slice bound would silently drop rows instead of adding none.
    deficit = max(target_len - len(minority), 0)
    extra = minority.sample(frac=1, random_state=seed)[:deficit]
    shortfall = deficit - len(extra)
    if shortfall > 0:
        # One pass over the minority class is not enough; top up with replacement.
        extra = pd.concat([extra, minority.sample(n=shortfall, replace=True, random_state=seed)])
    return extra


# Class-1 / class-0 size ratios (kept for reporting)
train_data_ratio = len(train_1)/len(train_0)
validate_data_ratio = len(validate_1)/len(validate_0)

# Target size for class 1 is the size of class 0. Using the integer length directly
# avoids the float round trip int(len_1 / (len_1 / len_0)), which can truncate to len_0 - 1.
new_train_1_len = max(len(train_0), len(train_1))
new_validate_1_len = max(len(validate_0), len(validate_1))

# Rows that will be duplicated to reach the target sizes
train_1_aug = _oversample_rows(train_1, new_train_1_len)
validate_1_aug = _oversample_rows(validate_1, new_validate_1_len)

# Finally, we concatenate our data with the augmentation data
train_1 = pd.concat([train_1, train_1_aug])
validate_1 = pd.concat([validate_1, validate_1_aug])

print("New train_1 len: " ,len(train_1), "\nNew validate_1 len: ", len(validate_1))

New train_1 len:  127549 
New validate_1 len:  63774


### Third Step:
#### Rejoining and shuffling the datasets

In [51]:
# Stack both classes back together for each partition
train = pd.concat([train_0, train_1])
validate = pd.concat([validate_0, validate_1])
test = pd.concat([test_0, test_1])

# Shuffle train and validation with a fixed seed so the classes are interleaved.
# The test set does not need to be shuffled.
train = train.sample(frac=1, random_state=33)
validate = validate.sample(frac=1, random_state=33)

# Making Models for prediction

## Separating inputs from predictions

In [52]:
import torch
from torch.utils.data import Dataset, DataLoader
class RiskAnalysisDataset(Dataset):
    """Torch dataset pairing a feature collection with its labels, both stored as float tensors."""

    def __init__(self, data_features, data_labels):
        # Build the tensors from copies of the inputs, then cast them to float32.
        features_copy = data_features.copy()
        labels_copy = data_labels.copy()
        self.data_features = torch.tensor(features_copy).to(torch.float32)
        self.data_labels = torch.tensor(labels_copy).to(torch.float32)

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        n_samples = len(self.data_features)
        return n_samples

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        sample_features = self.data_features[index]
        sample_label = self.data_labels[index]
        return sample_features, sample_label
        

# Row-identifier columns seen in the dataset preview; they carry no information
# about the client and must not be used as model inputs.
ID_COLUMNS = ['Unnamed: 0', 'INDEX']
# The last two columns hold the target indicators; IND_BOM_1_2 is the label used
# throughout this notebook.
TARGET_COLUMNS = ['IND_BOM_1_1', 'IND_BOM_1_2']
LABEL_COLUMN = 'IND_BOM_1_2'


def _split_features_labels(frame):
    """Split `frame` into (features, labels).

    Features are every column except the row identifiers and the target
    indicators, selected by name rather than by position so that identifier
    columns are never fed to a model. Labels are the LABEL_COLUMN series.
    """
    # errors='ignore' keeps this usable if an identifier column is absent.
    features = frame.drop(columns=ID_COLUMNS + TARGET_COLUMNS, errors='ignore')
    return features, frame[LABEL_COLUMN]


train_x, train_Y = _split_features_labels(train)
train_data = RiskAnalysisDataset(train_x.values, train_Y.values)
# TODO: vary the batch size: train_loader = DataLoader(train_data, batch_size)

validation_x, validation_Y = _split_features_labels(validate)
validation_data = RiskAnalysisDataset(validation_x.values, validation_Y.values)

test_x, test_Y = _split_features_labels(test)
test_data = RiskAnalysisDataset(test_x.values, test_Y.values)

## Metrics used for evaluation

In [53]:
#!pip install scikit-plot
from sklearn.metrics import roc_curve, auc, mean_squared_error, confusion_matrix, classification_report
from scipy.stats import ks_2samp
import scikitplot as skplt
import matplotlib
import matplotlib.pyplot as plt

def ks_stat(y, yhat):
    """Kolmogorov-Smirnov statistic between the scores of the two classes.

    Parameters
    ----------
    y : array-like
        True labels; entries equal to 1 form the first sample, all others the second.
    yhat : array-like
        Score or prediction for each entry of `y`, in the same order.

    Returns
    -------
    The `statistic` field of `scipy.stats.ks_2samp` applied to the two groups.
    """
    # Coerce to arrays so masking is element-wise and positional. With plain lists,
    # `y == 1` is a single bool, and pandas inputs could otherwise align on index labels.
    y = np.asarray(y)
    yhat = np.asarray(yhat)
    is_positive = y == 1
    return ks_2samp(yhat[is_positive], yhat[~is_positive]).statistic

def get_metrics(y, yhat, y_proba=None):
  """Compute, print and return evaluation metrics for a binary classifier.

  Parameters
  ----------
  y : true labels.
  yhat : predicted class labels.
  y_proba : optional class probabilities, passed to
      `skplt.metrics.plot_ks_statistic`. The grid search in this notebook passes
      the output of `predict_proba`. When given, the KS plot is drawn, and the
      AUROC and KS values are computed from the last probability column.

  Returns
  -------
  dict with keys 'auroc', 'KS', 'MSE', 'confusion_matrix' and
  'classification_report'.
  """
  results = dict()

  # Ranking metrics (ROC / KS) are only informative on continuous scores. On hard
  # labels the ROC curve collapses to a single threshold. Fall back to `yhat`
  # only when no probabilities were supplied.
  scores = yhat
  if (y_proba is not None):
    skplt.metrics.plot_ks_statistic(y, y_proba)
    plt.show()
    proba = np.asarray(y_proba)
    # Last column is the probability of the highest class label (class 1 here).
    scores = proba[:, -1] if proba.ndim == 2 else proba
  false_positive_rate, true_positive_rate, thresholds = roc_curve(y, scores)
  results['auroc'] = auc(false_positive_rate, true_positive_rate)
  results['KS'] = ks_stat(y, scores)
  # The remaining metrics are defined on the predicted labels.
  results['MSE'] = mean_squared_error(y,yhat)
  results['confusion_matrix'] = confusion_matrix(y,yhat)
  results['classification_report'] = classification_report(y,yhat)
  for key in results:
    print(key, ' ' , str(results[key]))
  return results

#RF_model = RandomForestClassifier(max_features = 7, max_depth = 10)
#RF_model.fit(train_x, train_Y)
#y_pred = RF_model.predict(validation_x)
#y_proba = RF_model.predict_proba(validation_x)
#results = get_metrics(validation_Y, y_pred, y_proba)

## MLP

## Random Forest

In [55]:
# Hyper-parameter grid explored for the random forest.
# Every combination of the three lists below is tried by the grid search.

max_samples = [
    None,
    0.5,
]
max_features = [
    160,
    36,
    7,
]
max_depths = [
    None,
    20,
    5,
]

# Accumulates one (configuration label, report) tuple per fitted model
RF_test_results = list()

In [None]:
# Grid search: fit one random forest per (max_samples, max_features, max_depth)
# combination and store its classification report.
# NOTE(review): configurations are scored on the test partition; if these results are
# used to pick hyper-parameters, the validation partition should be used instead.
for max_sample in max_samples:
  for max_feature in max_features:
    for max_depth in max_depths:
      print("Max_sample " + str(max_sample) + " Max_feature " + str(max_feature) + " Max_depht " + str(max_depth))
      # Fixed seed (same value used for the data shuffles) so runs are repeatable.
      RF_model = RandomForestClassifier(max_depth = max_depth, max_samples = max_sample,
                                        max_features = max_feature, random_state = 33)
      RF_model.fit(train_x, train_Y)
      yhat = RF_model.predict(test_x)
      y_proba = RF_model.predict_proba(test_x)
      result = get_metrics(test_Y, yhat, y_proba)
      # get_metrics stores the report under 'classification_report';
      # the previous key 'c_report' does not exist and raised KeyError.
      RF_test_results.append(("Max_sample " + str(max_sample) +
                        "\nMax_feature " + str(max_feature) +
                        "\nMax_depht " + str(max_depth)
                        , result['classification_report']))

Max_sample None Max_feature 160 Max_depht None


In [None]:
# Persist the grid-search results as plain text: one stringified
# (configuration label, report) tuple per line.
# The context manager closes the file even if a write fails.
with open("RF_test_results.txt","w") as f:
  for r in RF_test_results:
    f.write(str(r)+"\n")