In [52]:
# standard libraries
import pandas as pd
import numpy as np
#import re
import os
from IPython.display import Image
#import sklearn
#import time

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate
from IPython.display import clear_output

# data pre-processing
from scipy.io import arff
#from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.impute._base import _BaseImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection._split import BaseShuffleSplit
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# prediction models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# import warnings filter
'''import warnings
warnings.filterwarnings('ignore')
from warnings import simplefilter 
simplefilter(action='ignore', category=FutureWarning)'''



In [53]:
from os import listdir
from os.path import isfile, join

class FilePathManager:
    def __init__(self, local_dir: str):
        self.local_dir = local_dir
    
    def retrieve_full_path(self):
        return os.getcwd()+'/'+self.local_dir

In [54]:
class ARFFLoader:
    df = pd.DataFrame()
    
    def __init__(self, file_path_manager: FilePathManager):
        self.file_path_manager = file_path_manager
    
    def load_data(self):
        files = self.retrieve_files()
        for file in files:
            self.df = pd.concat([self.df, self.load_file(file)])
        self.df=self.df.reset_index(drop=True)
        
    def load_file(self, file_name):
        return pd.DataFrame(arff.loadarff(self.file_path_manager.retrieve_full_path()+'/'+file_name)[0])
 
    def retrieve_files(self):
        full_path = self.file_path_manager.retrieve_full_path()
        return [f for f in os.listdir(full_path) if os.path.isfile(join(full_path, f))]
    
    def get_df(self):
        return self.df;
    
    def size(self):
        return len(self.df)

In [7]:
def clean_df(df: pd.DataFrame):
    df['bankrupt'] = ( df['class'] == df['class'][df.shape[0]-1] ).astype(int)
    df = df.drop('class', axis=1)
    return df;

In [8]:
path_manager = FilePathManager('data')
loader = ARFFLoader(path_manager)
loader.load_data()

In [9]:
loader.get_df()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class
0,0.202350,0.46500,0.240380,1.51710,-14.5470,0.510690,0.253660,0.918160,1.15190,0.426950,...,0.131840,0.473950,0.86816,0.00024,8.5487,5.16550,107.740,3.38790,5.34400,b'0'
1,0.030073,0.59563,0.186680,1.33820,-37.8590,-0.000319,0.041670,0.678900,0.32356,0.404370,...,0.121460,0.074369,0.87235,0.00000,1.5264,0.63305,622.660,0.58619,1.23810,b'0'
2,0.257860,0.29949,0.665190,3.22110,71.7990,0.000000,0.318770,2.332000,1.67620,0.698410,...,0.164990,0.369210,0.81614,0.00000,4.3325,3.19850,65.215,5.59690,47.46600,b'0'
3,0.227160,0.67850,0.042784,1.08280,-88.2120,0.000000,0.285050,0.473840,1.32410,0.321500,...,0.293580,0.706570,0.78617,0.48456,5.2309,5.06750,142.460,2.56210,3.00660,b'0'
4,0.085443,0.38039,0.359230,1.94440,21.7310,0.187900,0.108230,1.371400,1.11260,0.521670,...,0.101240,0.163790,0.89876,0.00000,5.7035,4.00200,89.058,4.09840,5.98740,b'0'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43400,0.018371,0.47410,-0.136190,0.60839,-18.4490,0.018371,0.018371,0.972030,1.01210,0.460840,...,0.011909,0.039866,0.98809,0.27414,73.5050,79.23700,31.268,11.67300,5.14890,b'1'
43401,-0.013359,0.58354,-0.022650,0.92896,-42.2320,-0.013359,-0.015036,0.562890,0.98904,0.328470,...,-0.011082,-0.040671,1.01110,0.80592,10.5990,7.17400,94.092,3.87920,1.75720,b'1'
43402,0.006338,0.50276,0.439230,1.87360,9.7417,0.006338,0.012022,0.983560,1.00830,0.494490,...,0.008258,0.012817,0.99174,0.00000,10.4700,6.07590,51.019,7.15420,62.00100,b'1'
43403,-0.041643,0.84810,-0.128520,0.57485,-121.9200,0.000000,-0.036795,0.179010,0.42138,0.151820,...,-0.232720,-0.274290,0.98788,3.59310,39.7030,3.14200,261.850,1.39390,0.51005,b'1'


### Percentage of missing data

In [350]:
loader.get_df()['class'].value_counts()

b'0'    41314
b'1'     2091
Name: class, dtype: int64

In [351]:
2091/(41314+2091)

0.04817417348231771

In [352]:
2091/(2091+0)

1.0

In [353]:
(2*(1*(.04817417348231771*1.0)))/(.04817417348231771+1)

0.09192016880604888

In [10]:
missing = loader.get_df().isnull().sum()
pct_missing = missing/loader.size()*100
pct_missing.sort_values(ascending=False).head(15)

Attr37    43.736897
Attr21    13.486925
Attr27     6.367930
Attr60     4.957954
Attr45     4.946435
Attr24     2.124179
Attr28     1.870752
Attr64     1.870752
Attr54     1.870752
Attr53     1.870752
Attr41     1.737127
Attr32     0.847829
Attr52     0.693468
Attr47     0.684253
Attr46     0.311024
dtype: float64

## Modeling

In [11]:
from abc import ABC, abstractmethod

class BaseImputer:
    @abstractmethod
    def fit(self, X, y=None):
        pass
    
    @abstractmethod
    def transform(self, X):
        pass

class BaseModel:
    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        pass
    
    @abstractmethod
    def predict(self, X):
        pass

In [12]:
class XGBModel:
    _model = None
    
    def __init__(self, params, num_round: int):
        self._params = params
        self._num_round = num_round
        
    def fit(self, X, y, sample_weight=None):
        dtrain = xgb.DMatrix(X, label = y)
        self._model = xgb.train(self._params, dtrain)
        
    def predict(self, X):
        dtest = xgb.DMatrix(X)
        return self._model.predict(dtest)
        

In [316]:
class Modeling:
    _accuracy = None
    _f1 = None
    
    _X_train_fitted = None
    _X_test_fitted = None
    _y_train = None
    _y_test = None
    
    def __init__(self, data: pd.DataFrame, 
                 target_name: str, 
                 shuffle_splitter: BaseShuffleSplit, 
                 imputer: BaseImputer, 
                 model: BaseModel, scaler = None):
        self._data = data
        self._target_name = target_name
        self._shuffle_splitter = shuffle_splitter
        self._imputer = imputer
        self._model = model
        self._X, self._y = self._split_data()
        self._scaler = scaler
        
    @property
    def X(self):
        return self._X
    
    @property
    def y(self):
        return self._y

    @property
    def model(self):
        return self._model
    
    @model.setter
    def model(self, model):
        self._model = model
     
    @property
    def X_train(self):
        return self._X_train_fitted
    
    @property
    def X_test(self):
        return self._X_test_fitted
    
    @property
    def y_train(self):
        return self._y_train
    
    @property
    def y_test(self):
        return self._y_test
    
    def _split_data(self):
        X = self._data.copy()
        return X.drop([self._target_name], axis=1) , X[self._target_name]
    
    def _shuffle_split(self):
        X = self.X
        y = self.y
        for train_index, test_index in self._shuffle_splitter.split(X,y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
        return X_train, X_test, y_train, y_test
    
    def _fit_imputer(self, train):
        self._imputer.fit(train)
    
    def _fit_scaler(self, train):
        if self._scaler is not None:
            self._scaler.fit(train)
    
    def _impute_data(self, X: pd.DataFrame):
        return pd.DataFrame(self._imputer.transform(X), columns = self.X.columns, index = X.index)
    
    def _scale_data(self, X: pd.DataFrame):
        if self._scaler is not None:
            X = pd.DataFrame(self._scaler.transform(X), columns = self._X.columns)
        return X
    
    def prepare_data(self):
        X_train, X_test, y_train, y_test = self._shuffle_split()   
        self._fit_imputer(X_train)
        X_train = self._impute_data(X_train)
        X_test = self._impute_data(X_test)
        self._fit_scaler(X_train)
        self._X_train_fitted = self._scale_data(X_train)
        self._X_test_fitted = self._scale_data(X_test)
        self._y_train = y_train
        self._y_test = y_test
        
    def prepare_and_train(self):
        self.prepare_data()
        self.train()
        
    def train(self):
        self._model.fit(self.X_train, self.y_train)
        preds =  self._model.predict(self.X_train)
        self._accuracy = accuracy_score(self.y_train, preds)
        self._f1 = f1_score(self.y_train, preds)
        return self._accuracy, self._f1
        
    def test(self):
        preds = self._model.predict(self.X_test)
        return (accuracy_score(self.y_test, preds), f1_score(self.y_test, preds))
        

### RandomForest

In [317]:
simple_impute_model = Modeling(clean_df(loader.get_df()),'bankrupt',
                               StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                               SimpleImputer(missing_values=np.nan, strategy='median'),
                               RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=12),
                               StandardScaler())

In [318]:
simple_impute_model.prepare_and_train()

In [319]:
simple_impute_model._accuracy

0.9759075798966528

In [320]:
simple_impute_model._f1

0.7934537246049662

In [321]:
simple_impute_model.test()

(0.9462448164644448, 0.44794952681388017)

### Iterative Model

In [322]:
iterative_impute_model = Modeling(clean_df(loader.get_df()),'bankrupt',
                               StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                               IterativeImputer(missing_values=np.nan, random_state=1234),
                               RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=14),
                               StandardScaler())

In [323]:
iterative_impute_model.prepare_and_train()



In [324]:
iterative_impute_model._accuracy

0.9789685021228977

In [325]:
iterative_impute_model._f1

0.8162208800690249

In [326]:
iterative_impute_model.test()

(0.9525418522500384, 0.5056)

### Knn Imputation

In [70]:
knn_impute_model = Modeling(clean_df(loader.get_df()),'bankrupt',
                            StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1234),
                            KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean'),
                            RandomForestClassifier(random_state=0, class_weight='balanced'),
                            StandardScaler())

In [71]:
knn_impute_model.train()

In [72]:
knn_impute_model._accuracy

1.0

In [73]:
knn_impute_model._f1

1.0

In [74]:
knn_impute_model.test()

(0.9524650591307019, 0.09635036496350363)

### XGB

### XGB Simple Imputer

In [176]:
num_round = 1000
params = {
    'max_depth': 12,
    'objective': 'multi:softmax',
    'num_class': 2,
    'eta': 0.01
}

In [177]:
xgb_simple_imputer = Modeling(clean_df(loader.get_df()),'bankrupt',
                 StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                 SimpleImputer(missing_values=np.nan, strategy='median'),
                 XGBModel(params, num_round),
                 StandardScaler())

In [178]:
xgb_simple_imputer.prepare_and_train();



In [179]:
xgb_simple_imputer._accuracy

0.9821610769180134

In [180]:
xgb_simple_imputer._f1

0.7732217573221758

In [181]:
xgb_simple_imputer.test()

(0.968975579788051, 0.5826446280991736)

### XGB Iterative

In [333]:
params2 = {
    'max_depth': 15,
    'objective': 'multi:softmax',
    'num_class': 2,
    'eta': 0.3
}

In [334]:
xgb_iterative_imputer = Modeling(clean_df(loader.get_df()),'bankrupt',
                 StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                 IterativeImputer(missing_values=np.nan, random_state=1234),
                 XGBModel(params2, num_round),
                 StandardScaler())

In [335]:
xgb_iterative_imputer.prepare_and_train()





In [336]:
xgb_iterative_imputer._accuracy

0.9976960800447618

In [337]:
xgb_iterative_imputer._f1

0.9755073477956613

In [338]:
xgb_iterative_imputer.test()

(0.9728152357548764, 0.6297071129707112)

### Examining Depth vs Metrics

In [327]:
forest_modeling = Modeling(clean_df(loader.get_df()),'bankrupt',
                           StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                           IterativeImputer(missing_values=np.nan, random_state=1234),
                           None,
                           StandardScaler())
forest_modeling.prepare_data()



In [328]:
results = pd.DataFrame()
for i in range(4,21):
    forest_modeling.model = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth = i)
    accuracy_train, f1_train = forest_modeling.train()
    accuracy_test, f1_test = forest_modeling.test()
    results = results.append({'max_depth': forest_modeling.model.max_depth,
                    'accuracy_train': accuracy_train,
                    'f1_train': f1_train,
                    'accuracy_test': accuracy_test,
                    'f1_test': f1_test }, ignore_index = True)

In [329]:
results

Unnamed: 0,accuracy_test,accuracy_train,f1_test,f1_train,max_depth
0,0.767701,0.777079,0.235145,0.255141,4.0
1,0.802795,0.816641,0.267541,0.301267,5.0
2,0.839426,0.853405,0.3133,0.358028,6.0
3,0.867685,0.882303,0.350057,0.415686,7.0
4,0.899017,0.917322,0.39317,0.509758,8.0
5,0.914299,0.936149,0.416928,0.580631,9.0
6,0.934956,0.958102,0.462904,0.68167,10.0
7,0.945554,0.971431,0.485113,0.762322,11.0
8,0.952542,0.978969,0.5056,0.816221,12.0
9,0.955537,0.984103,0.507234,0.856464,13.0


In [339]:
results2 = pd.DataFrame()
for i in range(4,21):
    params = {
    'max_depth': i,
    'objective': 'multi:softmax',
    'num_class': 2,
    'eta': 0.3
    }
    forest_modeling.model = XGBModel(params, num_round)
    accuracy_train, f1_train = forest_modeling.train()
    accuracy_test, f1_test = forest_modeling.test()
    results2 = results2.append({'max_depth': i,
                    'accuracy_train': accuracy_train,
                    'f1_train': f1_train,
                    'accuracy_test': accuracy_test,
                    'f1_test': f1_test }, ignore_index = True)



In [340]:
results2

Unnamed: 0,accuracy_test,accuracy_train,f1_test,f1_train,max_depth
0,0.967824,0.969917,0.506478,0.54931,4.0
1,0.970358,0.973275,0.563348,0.618062,5.0
2,0.971049,0.975611,0.580645,0.662107,6.0
3,0.971971,0.979462,0.603692,0.729402,7.0
4,0.973046,0.983543,0.623794,0.794069,8.0
5,0.973583,0.986769,0.632479,0.840855,9.0
6,0.973814,0.99006,0.642932,0.884996,10.0
7,0.97389,0.99299,0.643606,0.921547,11.0
8,0.972815,0.995425,0.627368,0.950161,12.0
9,0.973737,0.997005,0.647423,0.967924,13.0


In [347]:
results3 = pd.DataFrame()
for i in range(0,10):
    eta = 0.1 + i*0.1
    params = {
    'max_depth': 13,
    'objective': 'multi:softmax',
    'num_class': 2,
    'eta': eta
    }
    forest_modeling.model = XGBModel(params, num_round)
    accuracy_train, f1_train = forest_modeling.train()
    accuracy_test, f1_test = forest_modeling.test()
    results3 = results3.append({'eta': eta,
                    'accuracy_train': accuracy_train,
                    'f1_train': f1_train,
                    'accuracy_test': accuracy_test,
                    'f1_test': f1_test }, ignore_index = True)



In [348]:
results3

Unnamed: 0,accuracy_test,accuracy_train,eta,f1_test,f1_train
0,0.971126,0.987263,0.1,0.6,0.847818
1,0.97343,0.992529,0.2,0.637317,0.915957
2,0.973737,0.997005,0.3,0.647423,0.967924
3,0.973046,0.998618,0.4,0.640737,0.985447
4,0.972508,0.999013,0.5,0.632444,0.989648
5,0.974044,0.999539,0.6,0.657895,0.995196
6,0.97343,0.999967,0.7,0.65121,0.999658
7,0.973353,0.999934,0.8,0.652653,0.999316
8,0.970895,1.0,0.9,0.619095,1.0
9,0.972431,1.0,1.0,0.645607,1.0
