In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from xgboost import XGBClassifier

from warnings import filterwarnings
filterwarnings(action="ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer

### Read The Data

In [None]:
# Load the five yearly CSV files; each frame keeps its own name because
# later cells refer to `one` ... `five` individually.
yearly_paths = [
    'Data/1year.csv',
    'Data/2year.csv',
    'Data/3year.csv',
    'Data/4year.csv',
    'Data/5year.csv',
]
one, two, three, four, five = [pd.read_csv(path) for path in yearly_paths]

In [None]:
# Show (rows, columns) of every yearly frame, in year order.
for yearly_df in (one, two, three, four, five):
    print(yearly_df.shape)

In [None]:
#  Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns contain missing values and returns a DataFrame
    indexed by column name with three columns: 'Missing Values' (count),
    '% of Total Values' (percentage of rows, rounded to one decimal) and
    'dtype'. Only columns whose missing percentage is non-zero are kept,
    sorted by percentage in descending order.
    """
    # Count nulls once and derive the percentage from the same counts
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Side-by-side table, labelled directly instead of renaming afterwards
    summary = pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Record each column's dtype next to its missing-value statistics
    summary['dtype'] = [df[name].dtype for name in summary.index]

    # Keep only the columns that actually have missing values, worst first
    has_missing = summary['% of Total Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    # Print some summary information
    print(f"Out of {df.shape[1]} columns, \n"
          f" there are {summary.shape[0]} columns that have missing values.")

    return summary
    

def get_outliers_Zscore(data_series, threshold = 3, boxplot = False):
    """Return the index labels of the values that are Z-score outliers.

    Parameters
    ----------
    data_series : pandas.Series
        Numeric series to inspect. NaN entries are never reported, because
        a NaN Z-score does not compare greater than the threshold.
    threshold : float, default 3
        A value is an outlier when the absolute value of its Z-score is
        strictly greater than this number.
    boxplot : bool, default False
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list
        Index labels of the outlying entries, in series order.
    """
    # Calculate mean and standard deviation
    m = data_series.mean()
    s = data_series.std()

    # Vectorised Z-scores; honour the caller's threshold instead of a
    # hard-coded 3 so the parameter actually has an effect.
    z_scores = ((data_series - m) / s).abs()
    outliers_index = data_series.index[z_scores > threshold].tolist()
    return outliers_index

# Missing Values
- Missing values are entered as '?', so these features are read as object dtype when in fact they are floats.
- The '?' entries are replaced with NaN and the features are cast to float.

In [None]:
# '?' marks a missing value in the raw files; turn it into a real NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for yearly_df in (one, two, three, four, five):
    yearly_df.replace('?', np.nan, inplace=True)

In [None]:
# Every column except the target is a numeric feature.
features = [col for col in one.columns if col != 'class']

# Assign through df[features] rather than df.loc[:, features]: in pandas 2.x
# a .loc assignment writes into the existing (object) columns in place, so
# the float64 cast would be silently lost. Column assignment replaces the
# columns and keeps the new dtype.
for yearly_df in (one, two, three, four, five):
    yearly_df[features] = yearly_df[features].astype('float64')

In [None]:
# Stack the five yearly frames into one frame with a fresh 0..n-1 index.
yearly_frames = [one, two, three, four, five]
data = pd.concat(yearly_frames, ignore_index=True)

In [None]:
# Build the missing-value summary and display its first rows
# (the columns with the highest share of missing values).
missing_summary = missing_values_table(data)
missing_summary.head()

# Outliers
- It is observed that every column has outliers.
- Ids of these outlier records are stored in outlier_id_list

In [None]:
# Collect the row labels flagged as Z-score outliers in any feature column.
# A set is used so that a row flagged by several columns is stored once.
non_feature_cols = ('class', 'bankrupt_after_years')
outlier_id_list = set()
for col in data.columns.values:
    if col not in non_feature_cols:
        outlier_id_list.update(get_outliers_Zscore(data[col]))

In [None]:
# One row per attribute: left panel is the raw distribution (red), right
# panel is the same attribute with the outlier rows removed (green).
kde_cols = ['Attr1', 'Attr2', 'Attr3', 'Attr4', 'Attr5', 'Attr6']

fig, ax = plt.subplots(len(kde_cols), 2, figsize = (15,15))
fig.suptitle('Comparision of distributions before and after removing outliers',fontsize=15)

for row, col in enumerate(kde_cols):
    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
    sns.kdeplot(data[col].dropna(), color="r", fill=True, ax=ax[row][0])
    sns.kdeplot(data[col].drop(outlier_id_list).dropna(), color="g", fill=True, ax=ax[row][1])

plt.show()

## Observations:
- The distribution of values in each attribute is smoother after removing the outliers.
- Applying transformations will further normalize the distributions.

In [None]:
# Work on an independent copy so the combined `data` frame stays untouched.
data1 = data.copy(deep=True)

In [None]:
# Attr37 has 43.7% missing values, so dropping the column seems to be the best approach.
data1.drop(columns=['Attr37'], inplace=True)

In [None]:
# Remove every row whose label was flagged as an outlier.
data1.drop(index=outlier_id_list, inplace=True)

In [None]:
# Renumber the remaining rows 0..n-1 and discard the old labels.
data1.reset_index(inplace=True, drop=True)

### Missing values
- Missing values are filled with median of the respective attributes.

In [None]:
# Median-impute every column.
# The filled column is assigned back explicitly: calling
# data1[col].fillna(..., inplace=True) is chained assignment on a temporary
# Series, which does not update data1 once pandas Copy-on-Write is active.
for col in data1.columns.values:
    data1[col] = data1[col].fillna(data1[col].median())

# Scaling
- 3 Different types of scaling methods are used.
    - MinMaxScaler - Uses minimum and maximum of the series.
    - StandardScaler - Uses the mean and standard deviation of the series (z-score).
    - PowerTransformer - Parametric and Monotonic transformations, useful when heteroscedasticity is present.

In [None]:
# Instantiate the three scaling/transform objects named in the section above.
ptransform = PowerTransformer(method='yeo-johnson')
sscaler = StandardScaler()
mms = MinMaxScaler()

### PowerTransformer

In [None]:
# Yeo-Johnson transform each feature on its own; the target is left as is.
# The transformer is re-fitted for every column.
feature_cols = [name for name in data1.columns.values if name != 'class']
for col in feature_cols:
    column_2d = data1[col].values.reshape(-1,1)  # sklearn expects a 2-D array
    data1[col] = ptransform.fit_transform(column_2d)

In [None]:
# One row per attribute: left panel is the median-imputed raw attribute
# from `data` (red), right panel is the processed attribute in `data1` (green).
kde_cols = ['Attr1', 'Attr2', 'Attr3', 'Attr4', 'Attr5', 'Attr6']

fig, ax = plt.subplots(len(kde_cols), 2, figsize = (12,10))
fig.suptitle('Impact of power transfomration on the data distribution',fontsize=15)

for row, col in enumerate(kde_cols):
    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
    sns.kdeplot(data[col].fillna(data[col].median()), color="r", fill=True, ax=ax[row][0])
    sns.kdeplot(data1[col], color="g", fill=True, ax=ax[row][1])

plt.show()

# Modeling
- model_df is used to collect results from all the experiments

In [None]:
# Empty results table; each experiment below appends one row to it.
result_columns = ['Model','Description','Train_F1','Test_F1','AUC']
model_df = pd.DataFrame(columns=result_columns)

In [None]:
# Separate the target from the features of the processed frame.
y = data1['class']
X = data1.drop(columns=['class'])

# 70/30 split with a fixed seed so the experiments are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=123
)

for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

### XGB Models

In [None]:
# XGBoost on the power-transformed features: 250 trees, depth 5.
xgb_model = XGBClassifier(n_estimators=250,max_depth=5)
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_Power_Transform',
       'Description':'n_estimators=250,max_depth=5, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)

In [None]:
# XGBoost on the power-transformed features: 300 trees, depth 5.
xgb_model = XGBClassifier(n_estimators=300,max_depth=5)
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_Power_Transform',
       'Description':'n_estimators=300,max_depth=5, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)

### Grid Search

In [None]:
# Hyper-parameter grid: 4 x 3 x 4 = 48 combinations, each fitted on 5 folds.
params = { 'n_estimators': [200,300,400,500],
          'max_depth': [5,6,7],
          'learning_rate' : [0.1, 0.01, 0.05, 0.3]
         }

# Select on F1, the metric reported throughout this notebook; the default
# scoring for a classifier is accuracy, which would pick the model on a
# different criterion than the one it is judged by.
grid_xgb = GridSearchCV(estimator= XGBClassifier(), param_grid = params, scoring='f1', cv=5)

grid_xgb.fit(X_train,y_train)

# Show the winning combination so later cells can be checked against it.
print('Best params:', grid_xgb.best_params_)

# predict() uses the best estimator refitted on the whole training set.
y_train_preds = grid_xgb.predict(X_train)
y_test_preds = grid_xgb.predict(X_test)

print('Train F1 Score:', f1_score(y_train,y_train_preds))
print('Test F1 Score:', f1_score(y_test,y_test_preds))

### XGB models with best params from Grid Search

In [None]:
# NOTE(review): n_estimators=250, max_depth=4 and gamma are hard-coded here
# and are not values of the grid searched in this notebook - confirm where
# these "best params" came from.
xgb_model = XGBClassifier(n_estimators=250,max_depth=4,learning_rate=0.3,gamma=0.1)
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_Power_Transform_Grid_serach',
        'Description':'n_estimators=250,max_depth=4,learning_rate=0.3,gamma=0.1, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)

In [None]:
# Same configuration as the gamma=0.1 run, with stronger split regularisation.
# NOTE(review): n_estimators=250, max_depth=4 and gamma are hard-coded here
# and are not values of the grid searched in this notebook - confirm where
# these "best params" came from.
xgb_model = XGBClassifier(n_estimators=250,max_depth=4,learning_rate=0.3,gamma=0.5)
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_Power_Transform_Grid_serach',
        'Description':'n_estimators=250,max_depth=4,learning_rate=0.3,gamma=0.5, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)

In [None]:
# Display the results collected so far (one row per experiment).
model_df

### Minmax Scaling

In [None]:
mms = MinMaxScaler()

# Median-impute every column of the combined frame.
# The filled column is assigned back explicitly: calling
# data[col].fillna(..., inplace=True) is chained assignment on a temporary
# Series, which does not update `data` once pandas Copy-on-Write is active.
for col in data.columns.values:
    data[col] = data[col].fillna(data[col].median())

# Rescale each feature to [0, 1]; the target column is left untouched.
# NOTE: this overwrites `data` in place, so the cell is not idempotent.
for col in data.columns.values:
    if col != 'class': 
        data[col] = mms.fit_transform(data[col].values.reshape(-1,1))

X = data.drop(['class'],axis=1)
y = data['class']

# 70/30 split with the same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.7,random_state=123)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape) 
print(y_test.shape)

In [None]:
# XGBoost on the min-max scaled features.
xgb_model = XGBClassifier(n_estimators=250,max_depth=5,learning_rate=0.3, gamma=0.5)
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_MinMax_Scaler',
        'Description':'n_estimators=250,max_depth=5,learning_rate=0.3,gamma=0.5, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)

## Variance Inflation Factor
- Dropping highly correlated features.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
def calculate_vif(X, thresh=5.0):
    """Iteratively drop the column with the largest variance inflation factor.

    On every pass the VIF of each remaining column is computed; while the
    largest one is greater than ``thresh`` that column is dropped (and
    reported with ``print``) and the VIFs are recomputed. The caller's
    frame is not modified; a reduced frame is returned.

    Taken from https://stats.stackexchange.com/a/253620/53565 and modified.
    """
    while True:
        column_names = X.columns
        vif_scores = [
            variance_inflation_factor(X[column_names].values, X.columns.get_loc(name))
            for name in X.columns
        ]

        worst_vif = max(vif_scores)
        # Written as "not >" so a non-comparable score (NaN) ends the loop.
        if not worst_vif > thresh:
            return X

        worst_position = vif_scores.index(worst_vif)
        worst_column = X.columns[worst_position]
        print(f'Dropping {worst_column} with vif={worst_vif}')
        X = X.drop(columns=[worst_column])

In [None]:
# Start again from the processed frame and prune collinear features.
y = data1['class']
X = data1.drop(columns=['class'])

# Keep dropping the worst feature until every VIF is at most 10.
X_vif = calculate_vif(X, thresh=10)

### Modelling after dropping correlated features

In [None]:
# 70/30 split of the VIF-reduced features, same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X_vif, y, train_size=0.7, random_state=123
)

for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Default-parameter XGBoost on the VIF-reduced features.
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_VIF_Power_Transform',
       'Description':'Base, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)

In [None]:
# XGBoost with max_depth=5 on the VIF-reduced features.
# The model is constructed once: a second `XGBClassifier()` assignment used
# to overwrite it, so the run recorded as "max_depth = 5" was actually the
# default model again.
xgb_model = XGBClassifier(max_depth=5)
xgb_model.fit(X_train,y_train)

# Hard class labels are what the F1 score needs.
y_train_preds = xgb_model.predict(X_train)
y_test_preds = xgb_model.predict(X_test)

# ROC/AUC needs continuous scores: hard 0/1 predictions collapse the curve
# to a single operating point. Column 1 of predict_proba is the probability
# of the second entry of xgb_model.classes_ (the positive class for 0/1 labels).
y_test_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_test_scores)
auc_score = auc(fpr,tpr)

# Append this experiment to the shared results table.
df1 = {'Model':'XGBoost_VIF_Power_Transform',
       'Description':'max_depth = 5, na imputed with median',
       'Train_F1':f1_score(y_train,y_train_preds),
       'Test_F1':f1_score(y_test,y_test_preds),
       'AUC':auc_score}
model_df = pd.concat([model_df, pd.DataFrame([df1])], ignore_index=True)