In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings

import mlflow
import mlflow.sklearn
import os
import shutil
import sys
import random
# import tempfile
from IPython.display import display, Markdown

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import *
from sklearn.externals import joblib
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn import *

from boruta import boruta_py
import matplotlib.pyplot as plt

%matplotlib inline
%config IPCompleter.greedy=True
warnings.filterwarnings('ignore')



## Custom methods

In [None]:
# get the percentage of nulls on pandas dataframe
def val_pd_df_nan(df):
    """Return the percentage of missing cells in ``df``, rounded to an integer.

    A cell counts as missing when pandas considers it null (``NaN``, ``None``,
    ``NaT``). The previous implementation compared each value with ``None``
    only, so NaN-filled numeric frames were always reported as 0 % missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        ``round(100 * missing_cells / total_cells)``; ``0`` for an empty frame.
    """
    total_cells = df.size
    if total_cells == 0:
        # nothing to inspect; avoid dividing by zero
        return 0
    # int() keeps the result a plain Python int rather than a numpy scalar
    missing_cells = int(df.isna().values.sum())
    return round(100 * missing_cells / total_cells)


## Load data

In [None]:
# read both splits; the literal string 'na' is parsed as a missing value
train_ds = pd.read_csv('data/aps_failure_training_set_processed_8bit.csv', na_values='na')
test_ds = pd.read_csv('data/aps_failure_test_set_processed_8bit.csv', na_values='na')

# separate the target column ('class') from the predictors
train_labels, test_labels = train_ds['class'], test_ds['class']
train_features = train_ds.drop(columns='class')
test_features = test_ds.drop(columns='class')

# sanity check: row counts of labels and features must agree per split
print(train_labels.shape, test_labels.shape)
print(train_features.shape, test_features.shape)

In [None]:
# round the stored class values to integers, then recode -1 as 0
train_labels = train_labels.apply(round).replace({-1:0})
test_labels = test_labels.apply(round).replace({-1:0})

## Sampling

In [None]:
# how many negative (class 0) rows to keep next to all positive rows
number_samples = 1000

# keep every positive row and a reproducible random subset of the negatives
idxs_pos = train_labels[train_labels==1].index
idxs_neg = train_labels[train_labels==0].sample(n=number_samples, replace=False, random_state=0).index
idxs_balanced = np.concatenate((idxs_pos,idxs_neg))
train_features_balanced = train_features.loc[idxs_balanced]
train_labels_balanced = train_labels.loc[idxs_balanced]

# report the negative:positive ratio from the actual number of positives
# instead of assuming there are exactly 1000 of them
print(f'Proportion balanced: {number_samples/len(idxs_pos):g}/1')
print(train_labels_balanced.value_counts())

In [None]:
# Disabled: down-sampling of the test set (kept for reference).
# number_samples = 375

# idxs_pos = test_labels[test_labels==1].index
# idxs_neg = test_labels[test_labels==0].sample(n=number_samples, replace=False, random_state=0).index
# idxs_balanced = np.concatenate((idxs_pos,idxs_neg))
# test_features_balanced = test_features.loc[idxs_balanced]
# test_labels_balanced = test_labels.loc[idxs_balanced]
# print(f'Proportion balanced: {int(number_samples/1000)}/1')
# print(test_labels_balanced.value_counts())

# The test split is used as-is; the "_balanced" names are plain aliases so the
# cells below can treat train and test uniformly.
test_features_balanced, test_labels_balanced = test_features, test_labels

In [None]:
# fit the scaler on the (balanced) training features only
scaler = MinMaxScaler()
scaler.fit(train_features_balanced)

# # model export
# joblib.dump(scaler, "models/MinMaxScaler.save") 
# scaler = joblib.load("models/MinMaxScaler.save") 

# scaler.transform returns a bare array; rebuild the frames with the original
# column names AND row index so the features stay aligned with
# train_labels_balanced / test_labels_balanced (previously the index was
# silently replaced by 0..n-1)
train_features_balanced = pd.DataFrame(
    scaler.transform(train_features_balanced),
    columns=train_features_balanced.columns,
    index=train_features_balanced.index,
)
test_features_balanced = pd.DataFrame(
    scaler.transform(test_features_balanced),
    columns=test_features_balanced.columns,
    index=test_features_balanced.index,
)

## Setup training and validation

In [None]:
# point mlflow at the tracking server and select the experiment for all runs
mlflow.set_tracking_uri("http://host.docker.internal:5000")
mlflow.set_experiment("demo_ml")

# echo the entry point and the tracking URI that mlflow reports back
entry_point, active_uri = sys.argv[0], mlflow.get_tracking_uri()
print("Running {} with tracking URI {}".format(entry_point, active_uri))

def ml_tracking(params, metrics, models):
    """Record one mlflow run containing the given params, metrics and models.

    Parameters
    ----------
    params : dict
        Name -> value pairs, each sent through ``mlflow.log_param``.
    metrics : dict
        Name -> value pairs, each sent through ``mlflow.log_metric``.
    models : dict
        Artifact name -> fitted estimator, each sent through
        ``mlflow.sklearn.log_model``.
    """
    with mlflow.start_run():
        # parameters first, then metrics, then model artifacts
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)

        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        for artifact_name, estimator in models.items():
            mlflow.sklearn.log_model(estimator, artifact_name)

        # disabled debugging aid: dump the metadata of the run just recorded
        # run_id = mlflow.active_run().info.run_id
        # service = mlflow.tracking.MlflowClient()
        # run = service.get_run(run_id)
        # print("Metadata & data for run with UUID %s: %s" % (run_id, run))


In [None]:
# univariate selection: keep the 88 best features by chi2 score
# (k is passed by keyword; newer scikit-learn releases reject it positionally)
selectKBest = SelectKBest(chi2, k=88)

# keep as many principal components as needed to explain 95% of the variance
pca = PCA(n_components=0.95)

# classifier used as the final step of every pipeline below
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# BorutaPy reconfigures (n_estimators, random_state) and refits the estimator
# it is given, so it gets its own forest instead of the shared `rf`; otherwise
# pipelines run after Boruta would silently use different forest settings
boruta_rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
borutaSelector = boruta_py.BorutaPy(boruta_rf, n_estimators='auto', verbose=0, random_state=123)

# training and validation template
def run(pipeline, train_X, train_y, test_X, test_y, fit_predict_arr=False,
        cost_fp=10, cost_fn=500, show_train_report=False):
    """Fit ``pipeline``, evaluate it on the test split and log the run to mlflow.

    Prints the test classification report, the confusion matrix as a one-row
    DataFrame (``tn, fp, fn, tp``) and the misclassification cost, then calls
    ``ml_tracking`` with the metrics, the module-level ``scaler`` and the
    fitted pipeline.

    Parameters
    ----------
    pipeline : estimator with ``fit`` / ``predict``
        Pipeline to train and evaluate; it is fitted in place.
    train_X, test_X : pandas.DataFrame
        Training / test features.
    train_y, test_y : pandas.Series
        Training / test labels; the confusion matrix assumes classes 0 and 1.
    fit_predict_arr : bool, default False
        When True, ``.values`` arrays are passed to ``fit`` / ``predict``
        instead of the pandas objects (used for the Boruta pipeline in this
        notebook - presumably because BorutaPy expects arrays; TODO confirm).
    cost_fp, cost_fn : number, default 10 and 500
        Cost charged per false positive / false negative.
    show_train_report : bool, default False
        When True, also predict on the training data and print its
        classification report (skipped by default to avoid unused work).

    Returns
    -------
    None
    """
    # choose the representation handed to the pipeline once, up front
    fit_X = train_X.values if fit_predict_arr else train_X
    fit_y = train_y.values if fit_predict_arr else train_y
    eval_X = test_X.values if fit_predict_arr else test_X

    # train
    pipeline.fit(fit_X, fit_y)

    # predict train (optional diagnostic)
    if show_train_report:
        print(classification_report(train_y, pipeline.predict(fit_X)))

    # predict test
    y_pred = pipeline.predict(eval_X)
    test_report = classification_report(test_y, y_pred)
    print(test_report)

    # generate confusion matrix; labels=[0, 1] guarantees a 2x2 matrix even
    # when one class is absent from both test_y and y_pred
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred, labels=[0, 1]).ravel()
    cm = pd.DataFrame([[tn, fp, fn, tp]], columns=['tn', 'fp', 'fn', 'tp'])
    print(cm)

    # generate cost as a plain float, used for both the printout and mlflow
    total_cost = float(cost_fp * fp + cost_fn * fn)
    print(f'Total cost is:{total_cost}')

    # mlflow tracking
    ml_tracking(
        params={
            # the "cost" metric is only comparable between runs that share
            # the same weights, so record them with the run
            "cost_fp": cost_fp,
            "cost_fn": cost_fn
        },
        metrics={
            "cost": total_cost,
            "weighted_f1": f1_score(test_y, y_pred, average='weighted'),
            "accuracy_score": accuracy_score(test_y, y_pred)
        },
        models={
            "MinMaxScaler": scaler,
            "Pipeline": pipeline
        }
    )


## KBest + RF

In [None]:
# kbest + rf pipeline
kbest_pipeline = Pipeline(steps=[('selectKBest', selectKBest), ('rf', rf)])

# execute training and validation
run(kbest_pipeline, train_features_balanced, train_labels_balanced,
    test_features_balanced, test_labels_balanced)

# boolean mask of the columns retained by the fitted selector
kbest_mask = kbest_pipeline.named_steps['selectKBest'].get_support()
kbest_columns = train_features_balanced.columns[kbest_mask]

print("features count:", len(train_features_balanced.columns))
print("features count after kbest:", len(kbest_columns))
print("features selected:", kbest_columns.values)

## PCA + RF

In [None]:
# pca + rf pipeline
pca_pipeline = Pipeline(steps=[('pca', pca), ('rf', rf)])

# execute training and validation
run(pca_pipeline, train_features_balanced, train_labels_balanced,
    test_features_balanced, test_labels_balanced)

# number of input columns vs. number of components kept by the fitted PCA
fitted_pca = pca_pipeline.named_steps['pca']
print("features count:", len(train_features_balanced.columns))
print("pca components:", fitted_pca.n_components_)

## Boruta + RF

In [None]:
# boruta + rf pipeline
boruta_pipeline = Pipeline(steps=[('borutaSelector', borutaSelector), ('rf', rf)])

# execute training and validation; the trailing True makes run() pass
# .values arrays to the pipeline instead of pandas objects
run(boruta_pipeline, train_features_balanced, train_labels_balanced,
    test_features_balanced, test_labels_balanced, True)

# support_ is read here as the mask of columns kept by the fitted selector
boruta_mask = boruta_pipeline.named_steps['borutaSelector'].support_
boruta_columns = train_features_balanced.columns[boruta_mask]

print("features count:", len(train_features_balanced.columns))
print("features count after boruta:", len(boruta_columns))
print("features selected:", boruta_columns.values)

## RF only

In [None]:
# rf pipeline: the classifier alone, with no feature selection or projection
rf_pipeline = Pipeline(steps=[('rf', rf)])

# execute training and validation
run(
    rf_pipeline,
    train_features_balanced,
    train_labels_balanced,
    test_features_balanced,
    test_labels_balanced,
)