## Imports and notebook setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from IPython.display import display, Markdown

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm
from boruta import boruta_py

import matplotlib.pyplot as plt
from sklearn.externals import joblib

%matplotlib inline
%config IPCompleter.greedy=True
warnings.filterwarnings('ignore')



## Custom methods

In [2]:
def val_pd_df_nan(df):
    """Return the percentage of missing cells in a pandas DataFrame.

    A cell is treated as missing when pandas reports it as null (``None``,
    ``NaN`` or ``NaT``). An identity check against ``None`` is not enough:
    missing values parsed from CSV files are stored as float ``NaN``, which
    is not ``None`` and would therefore never be counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        Share of missing cells in percent, rounded to the nearest integer.
        An empty frame yields 0 instead of raising ``ZeroDivisionError``.
    """
    total_cells = df.size
    if total_cells == 0:
        # Nothing to inspect, so nothing can be missing.
        return 0
    missing_cells = int(df.isna().values.sum())
    return round(100 * missing_cells / total_cells)


## Load data

In [3]:
# Read the pre-processed training and test CSVs; the literal string 'na'
# is parsed as a missing value.
train_ds = pd.read_csv(
    'data/aps_failure_training_set_processed_8bit.csv',
    na_values='na',
)
test_ds = pd.read_csv(
    'data/aps_failure_test_set_processed_8bit.csv',
    na_values='na',
)

# Split each frame into the 'class' target column and the remaining features.
train_labels, test_labels = train_ds['class'], test_ds['class']
train_features = train_ds.drop(columns='class')
test_features = test_ds.drop(columns='class')

# Sanity check: labels first, then features, for train and test respectively.
print(train_labels.shape, test_labels.shape)
print(train_features.shape, test_features.shape)

(60000,) (16000,)
(60000, 170) (16000, 170)


In [4]:
# Snap every label to the nearest whole number, then recode the negative
# class from -1 to 0 so both label sets end up using 0/1.
train_labels = train_labels.map(round).replace(-1, 0)
test_labels = test_labels.map(round).replace(-1, 0)

## Sampling

In [5]:
# Number of negative (class 0) rows to keep in the down-sampled training set.
number_samples = 1000

# Keep every positive row and draw a fixed-seed random subset of the negatives.
idxs_pos = train_labels[train_labels==1].index
idxs_neg = train_labels[train_labels==0].sample(n=number_samples, replace=False, random_state=0).index
idxs_balanced = np.concatenate((idxs_pos,idxs_neg))

# Select the same rows from features and labels so the two stay in step.
train_features_balanced = train_features.loc[idxs_balanced]
train_labels_balanced = train_labels.loc[idxs_balanced]

# Report negatives per positive using the actual positive count instead of a
# hard-coded 1000, so the message stays correct if the data or number_samples
# changes. Falls back to 0 when there are no positives to avoid dividing by zero.
print(f'Proportion balanced: {int(number_samples/len(idxs_pos)) if len(idxs_pos) else 0}/1')
print(train_labels_balanced.value_counts())

Proportion balanced: 1/1
1    1000
0    1000
Name: class, dtype: int64


In [6]:
# number_samples = 375

# idxs_pos = test_labels[test_labels==1].index
# idxs_neg = test_labels[test_labels==0].sample(n=number_samples, replace=False, random_state=0).index
# idxs_balanced = np.concatenate((idxs_pos,idxs_neg))
# test_features_balanced = test_features.loc[idxs_balanced]
# test_labels_balanced = test_labels.loc[idxs_balanced]
# print(f'Proportion balanced: {int(number_samples/1000)}/1')
# print(test_labels_balanced.value_counts())

# No re-sampling is applied to the test data: despite the "_balanced" suffix
# these names simply alias the full test features and labels.
test_features_balanced, test_labels_balanced = test_features, test_labels

In [7]:
# Fit the min-max scaler on the down-sampled training features only; the test
# features are transformed with the training statistics.
scaler = MinMaxScaler()
scaler.fit(train_features_balanced)

# Persist the fitted scaler and read it back, so the transforms below run
# with exactly the artifact that was written to disk.
joblib.dump(scaler, "models/MinMaxScaler.save")
scaler = joblib.load("models/MinMaxScaler.save")

# Rebuild DataFrames from the transformed values, keeping the original columns
# AND the original row index. Without `index=` the frames would get a fresh
# 0..n-1 index while the label Series keep the sampled row labels, so any
# pandas operation that aligns features with labels would mismatch rows.
train_features_balanced = pd.DataFrame(
    scaler.transform(train_features_balanced),
    columns=train_features_balanced.columns,
    index=train_features_balanced.index,
)
test_features_balanced = pd.DataFrame(
    scaler.transform(test_features_balanced),
    columns=test_features_balanced.columns,
    index=test_features_balanced.index,
)

## Setup pipelines

In [8]:
# Univariate selection: keep the 88 features with the highest chi2 score.
# `k` is passed by keyword because newer scikit-learn releases make it
# keyword-only and reject the positional form.
selectKBest = SelectKBest(chi2, k=88)

# A float n_components asks PCA to keep as many components as needed to
# explain that fraction (95%) of the variance.
pca = PCA(0.95)

# Boruta feature selection wrapped around a shallow, class-weighted random forest.
borutaSelector = boruta_py.BorutaPy(
    RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5),
    n_estimators='auto',
    verbose=0,
    random_state=123
)

# Final classifier used by the pipelines. The seed makes repeated runs
# comparable; without it every fit gives slightly different scores.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=123)

In [9]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline

# estimators pipeline
# A Pipeline keeps a reference to the estimator objects it is given and fits
# them in place. If all four pipelines shared the single `rf` instance, fitting
# one pipeline would overwrite the classifier of the others, so only the most
# recently fitted pipeline could predict correctly. The selector pipelines
# therefore get their own unfitted copy via clone(); `rf_pipeline` keeps the
# original `rf` object so `rf` itself is still the model fitted on all features.
rf_pipeline = Pipeline([('rf', rf)])
kbest_pipeline = Pipeline([('selectKBest', selectKBest), ('rf', clone(rf))])
pca_pipeline = Pipeline([('pca', pca), ('rf', clone(rf))])
boruta_pipeline = Pipeline([('borutaSelector', borutaSelector), ('rf', clone(rf))])


## KBest + RF

In [10]:
# Fit the SelectKBest -> random forest pipeline on the down-sampled training data.
kbest_pipeline.fit(train_features_balanced, train_labels_balanced)

# Print a classification report for the training data first and the test data
# second; after the loop `y_pred` and `report` hold the test-set results.
for eval_features, eval_labels in (
    (train_features_balanced, train_labels_balanced),
    (test_features_balanced, test_labels_balanced),
):
    y_pred = kbest_pipeline.predict(eval_features)
    report = classification_report(eval_labels, y_pred)
    print(report)

# One-row table of the test confusion matrix; for 0/1 labels the flattened
# 2x2 matrix reads tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels_balanced, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)
print(cm)

# Weighted misclassification cost: 10 per false positive, 500 per false negative.
total_cost = 10 * cm['fp'] + 500 * cm['fn']
print(f'Total cost is:{float(total_cost.iloc[0])}')

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1000
           1       0.96      0.97      0.97      1000

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

              precision    recall  f1-score   support

           0       1.00      0.94      0.97     15625
           1       0.28      0.98      0.44       375

    accuracy                           0.94     16000
   macro avg       0.64      0.96      0.70     16000
weighted avg       0.98      0.94      0.96     16000

      tn   fp  fn   tp
0  14679  946   6  369
Total cost is:12460.0


In [11]:
# Names of the columns retained by the fitted SelectKBest step.
kbest_columns = train_features_balanced.columns[
    kbest_pipeline.named_steps['selectKBest'].get_support()
]
print("columns count:", train_features_balanced.shape[1])
print("columns count after kbest:", len(kbest_columns))
print("columns selected:", kbest_columns.values)

columns count: 170
columns count after kbest: 88
columns selected: ['aa_000' 'ag_001' 'ag_002' 'ag_003' 'ag_004' 'ag_005' 'ag_006' 'ah_000'
 'ai_000' 'al_000' 'am_0' 'an_000' 'ao_000' 'ap_000' 'aq_000' 'ay_007'
 'ay_008' 'az_000' 'az_001' 'az_002' 'az_004' 'az_005' 'ba_000' 'ba_001'
 'ba_002' 'ba_003' 'ba_004' 'ba_005' 'ba_006' 'ba_007' 'ba_008' 'ba_009'
 'bb_000' 'bc_000' 'bd_000' 'be_000' 'bf_000' 'bg_000' 'bh_000' 'bi_000'
 'bj_000' 'bt_000' 'bu_000' 'bv_000' 'bx_000' 'by_000' 'cc_000' 'ci_000'
 'cj_000' 'ck_000' 'cl_000' 'cm_000' 'cn_000' 'cn_001' 'cn_002' 'cn_003'
 'cn_004' 'cn_005' 'cn_006' 'cn_007' 'cn_008' 'cq_000' 'cs_000' 'cs_001'
 'cs_002' 'cs_003' 'cs_004' 'cs_005' 'cu_000' 'cv_000' 'cx_000' 'dc_000'
 'dd_000' 'de_000' 'dn_000' 'ds_000' 'dt_000' 'eb_000' 'ec_00' 'ed_000'
 'ee_000' 'ee_001' 'ee_002' 'ee_003' 'ee_004' 'ee_005' 'ee_006' 'ee_007']


## PCA + RF

In [12]:
# Fit the PCA -> random forest pipeline on the down-sampled training data.
pca_pipeline.fit(train_features_balanced, train_labels_balanced)

# Print a classification report for the training data first and the test data
# second; after the loop `y_pred` and `report` hold the test-set results.
for eval_features, eval_labels in (
    (train_features_balanced, train_labels_balanced),
    (test_features_balanced, test_labels_balanced),
):
    y_pred = pca_pipeline.predict(eval_features)
    report = classification_report(eval_labels, y_pred)
    print(report)

# One-row table of the test confusion matrix; for 0/1 labels the flattened
# 2x2 matrix reads tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels_balanced, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)
print(cm)

# Weighted misclassification cost: 10 per false positive, 500 per false negative.
total_cost = 10 * cm['fp'] + 500 * cm['fn']
print(f'Total cost is:{float(total_cost.iloc[0])}')

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      1000
           1       0.94      0.98      0.96      1000

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000

              precision    recall  f1-score   support

           0       1.00      0.91      0.95     15625
           1       0.21      0.99      0.35       375

    accuracy                           0.91     16000
   macro avg       0.61      0.95      0.65     16000
weighted avg       0.98      0.91      0.94     16000

      tn    fp  fn   tp
0  14243  1382   5  370
Total cost is:16320.0


In [13]:
# Compare the input dimensionality with the number of components the fitted
# PCA step kept.
print("columns count:", train_features_balanced.shape[1])
print("pca components:", pca_pipeline.named_steps['pca'].n_components_)

columns count: 170
pca components: 50


## Boruta + RF

In [14]:
# Fit the Boruta -> random forest pipeline. Raw arrays (.values) are passed
# instead of DataFrames -- presumably because BorutaPy expects NumPy input;
# confirm against the boruta documentation.
boruta_pipeline.fit(train_features_balanced.values, train_labels_balanced.values)

# Persist the fitted pipeline and continue with the copy read back from disk.
joblib.dump(boruta_pipeline, "models/boruta_pipeline.save")
boruta_pipeline = joblib.load("models/boruta_pipeline.save")

# Print a classification report for the training data first and the test data
# second; after the loop `y_pred` and `report` hold the test-set results.
for eval_features, eval_labels in (
    (train_features_balanced, train_labels_balanced),
    (test_features_balanced, test_labels_balanced),
):
    y_pred = boruta_pipeline.predict(eval_features.values)
    report = classification_report(eval_labels, y_pred)
    print(report)

# One-row table of the test confusion matrix; for 0/1 labels the flattened
# 2x2 matrix reads tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels_balanced, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)
print(cm)

# Weighted misclassification cost: 10 per false positive, 500 per false negative.
total_cost = 10 * cm['fp'] + 500 * cm['fn']
print(f'Total cost is:{float(total_cost.iloc[0])}')

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1000
           1       0.96      0.98      0.97      1000

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

              precision    recall  f1-score   support

           0       1.00      0.94      0.97     15625
           1       0.28      0.98      0.44       375

    accuracy                           0.94     16000
   macro avg       0.64      0.96      0.70     16000
weighted avg       0.98      0.94      0.96     16000

      tn   fp  fn   tp
0  14680  945   7  368
Total cost is:12950.0


In [15]:
# Names of the columns flagged by the fitted Boruta step's support_ mask.
boruta_columns = train_features_balanced.columns[
    boruta_pipeline.named_steps['borutaSelector'].support_
]
print("columns count:", train_features_balanced.shape[1])
print("columns count after boruta:", len(boruta_columns))
print("columns selected:", boruta_columns.values)

columns count: 170
columns count after boruta: 91
columns selected: ['aa_000' 'ag_001' 'ag_002' 'ag_003' 'ag_004' 'ag_005' 'ag_006' 'ah_000'
 'ai_000' 'al_000' 'am_0' 'an_000' 'ao_000' 'ap_000' 'aq_000' 'ay_006'
 'ay_007' 'ay_008' 'az_000' 'az_001' 'az_002' 'az_005' 'ba_000' 'ba_001'
 'ba_002' 'ba_003' 'ba_004' 'ba_005' 'ba_006' 'ba_008' 'ba_009' 'bb_000'
 'bc_000' 'bd_000' 'be_000' 'bg_000' 'bh_000' 'bi_000' 'bj_000' 'bk_000'
 'bl_000' 'bm_000' 'bn_000' 'bo_000' 'bp_000' 'bq_000' 'br_000' 'bs_000'
 'bt_000' 'bu_000' 'bv_000' 'bx_000' 'by_000' 'cc_000' 'cg_000' 'ci_000'
 'cj_000' 'ck_000' 'cm_000' 'cn_000' 'cn_001' 'cn_002' 'cn_003' 'cn_004'
 'cn_005' 'cn_007' 'cn_008' 'cn_009' 'cq_000' 'cs_000' 'cs_001' 'cs_002'
 'cs_003' 'cs_004' 'cs_005' 'cx_000' 'dd_000' 'de_000' 'dn_000' 'ds_000'
 'dt_000' 'ec_00' 'ed_000' 'ee_000' 'ee_001' 'ee_002' 'ee_003' 'ee_004'
 'ee_005' 'ee_006' 'ee_007']


## RF only

In [16]:
# Baseline: fit the random forest on all features, with no selection step.
rf_pipeline.fit(train_features_balanced, train_labels_balanced)

# Print a classification report for the training data first and the test data
# second; after the loop `y_pred` and `report` hold the test-set results.
for eval_features, eval_labels in (
    (train_features_balanced, train_labels_balanced),
    (test_features_balanced, test_labels_balanced),
):
    y_pred = rf_pipeline.predict(eval_features)
    report = classification_report(eval_labels, y_pred)
    print(report)

# One-row table of the test confusion matrix; for 0/1 labels the flattened
# 2x2 matrix reads tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels_balanced, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)
print(cm)

# Weighted misclassification cost: 10 per false positive, 500 per false negative.
total_cost = 10 * cm['fp'] + 500 * cm['fn']
print(f'Total cost is:{float(total_cost.iloc[0])}')

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1000
           1       0.96      0.97      0.97      1000

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

              precision    recall  f1-score   support

           0       1.00      0.94      0.97     15625
           1       0.28      0.98      0.43       375

    accuracy                           0.94     16000
   macro avg       0.64      0.96      0.70     16000
weighted avg       0.98      0.94      0.96     16000

      tn   fp  fn   tp
0  14656  969   7  368
Total cost is:13190.0
