## Download dataset

In [None]:
import requests
from io import BytesIO
from zipfile import ZipFile

url = 'https://archive.org/download/datasets_202003/aps-failure-at-scania-trucks-data-set.zip'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
content = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of passing an error page to ZipFile.
content.raise_for_status()

# unzip the in-memory archive into ./data; the context manager closes the handle
with ZipFile(BytesIO(content.content)) as f:
    f.extractall("data")

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import warnings
from IPython.display import display, Markdown

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm

import matplotlib.pyplot as plt

# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Switch IPython's tab completer to its "greedy" mode.
%config IPCompleter.greedy=True
# NOTE(review): this hides every Python warning for the rest of the session,
# including deprecation and convergence warnings from the libraries imported above.
warnings.filterwarnings('ignore')

## Custom methods

In [None]:
# get the percentage of nulls on pandas dataframe
def val_pd_df_nan(df):
    """Return the percentage of missing cells in ``df``, rounded to an integer.

    A cell counts as missing when pandas considers it null (``NaN``, ``None``,
    ``NaT``, ``pd.NA``). Checking only for ``None`` is not enough: missing
    values in numeric columns are stored as ``NaN`` and would never be counted.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to inspect.

    Returns
    -------
    int
        ``round(100 * missing_cells / total_cells)``; ``0`` when ``df`` holds
        no cells at all.
    """
    total = df.size
    if total == 0:
        # Nothing to inspect; avoid dividing by zero.
        return 0
    # Vectorised null mask instead of a Python loop over every cell.
    missing = int(df.isna().to_numpy().sum())
    return round(100 * missing / total)


## Load data
The training set contains 60000 examples in total, of which 59000 belong to the negative class and 1000 to the positive class. The test set contains 16000 examples. There are 171 attributes per record.

The attribute names of the data have been anonymized for proprietary reasons. The data consists of both single numerical counters and histograms consisting of bins with different conditions. Typically the histograms have open-ended conditions at each end. For example, if we are measuring the ambient temperature "T", then the histogram could be defined with 4 bins where, for instance, bin 1 collects values for T < -20, bin 2 for -20 <= T < 0, bin 3 for 0 <= T < 20, and bin 4 for T >= 20.

The attributes are as follows: class, then anonymized operational data. The operational data have an identifier and a bin id, like "Identifier_Bin". In total there are 171 attributes, of which 7 are histogram variables. Missing values are denoted by "na".

In [None]:
# Read both splits; the literal string "na" in the CSVs is parsed as a missing value.
train_ds = pd.read_csv('data/aps_failure_training_set_processed_8bit.csv', na_values='na')
test_ds = pd.read_csv('data/aps_failure_test_set_processed_8bit.csv', na_values='na')

# Separate each split into its predictor columns and the 'class' target column.
train_features, train_labels = train_ds.drop(columns='class'), train_ds['class']
test_features, test_labels = test_ds.drop(columns='class'), test_ds['class']

# Quick shape check: labels first, then features.
print(train_labels.shape, test_labels.shape)
print(train_features.shape, test_features.shape)

In [None]:
# Preview only the first rows rather than displaying the whole frame.
train_ds.head()

In [None]:
# Share of missing cells in each split, measured with val_pd_df_nan.
train_nan_pct = val_pd_df_nan(train_features)
test_nan_pct = val_pd_df_nan(test_features)

print(f'{train_nan_pct}% of train data are non-valid.')
print(f'{test_nan_pct}% of test data are non-valid.')

In [None]:
# Per-column summary statistics of the training features.
train_features.describe()

In [None]:
# Fit the min-max scaler on the training features and rescale them in one step.
# `scaler` stays fitted so the same transformation can be applied to other data.
scaler = MinMaxScaler()
train_features = pd.DataFrame(
    scaler.fit_transform(train_features),
    columns=train_features.columns,
    # Keep the original row index so the features stay aligned with train_labels.
    index=train_features.index,
)

In [None]:
# Per-column summary statistics of the current training features.
train_features.describe()

In [None]:
# Round every label to the nearest integer, then recode -1 as 0.
train_labels = train_labels.apply(round).replace({-1: 0})

In [None]:
# Number of training rows per label value.
train_labels.value_counts()

## Correlation Heatmap

In [None]:
# fig, ax = plt.subplots(figsize=(10,10))
# corr_matrix = train_features.corr()
# ax = sns.heatmap(corr_matrix, square=True, cmap='Purples', ax=ax)
# plt.tight_layout()
# plt.show()

## Select KBest

In [None]:
# k_best = 84
# selectKBest = SelectKBest(chi2, k_best)
# selectKBest.fit(train_features, train_labels)
# best_train_features = selectKBest.transform(train_features)

# idxs_selected = selectKBest.get_support(indices=True)
# best_train_features = train_features.iloc[:,idxs_selected]

# print(best_train_features.columns)

In [None]:
# fig, ax = plt.subplots(figsize=(10,10))
# new_corr_matrix = best_train_features.corr()
# ax = sns.heatmap(new_corr_matrix, square=True, cmap='Purples', ax=ax)
# plt.tight_layout()
# plt.show()

In [None]:
# pca_variance = 0.95 

# pca = PCA(pca_variance)
# pca.fit(train_features)
# best_train_features = pca.transform(train_features)
# best_train_features = pd.DataFrame(best_train_features)

# print(f'Number of components {pca.n_components_}')

## Sampling

In [None]:
# number_samples = 2500

# idxs_pos = train_labels[train_labels==1].index
# idxs_neg = train_labels[train_labels==0].sample(n=number_samples, replace=False, random_state=0).index
# idxs_balanced = np.concatenate((idxs_pos,idxs_neg))
# best_train_features_balanced = best_train_features.loc[idxs_balanced]
# train_labels_balanced = train_labels.loc[idxs_balanced]
# print(f'Proportion balanced: {int(number_samples/1000)}/1')

## Feature selection with Boruta

In [None]:
# from boruta import boruta_py

# # define random forest classifier, with utilising all cores and
# # sampling in proportion to y labels
# forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# # define Boruta feature selection method
# feat_selector = boruta_py.BorutaPy(forest, n_estimators='auto', verbose=2, random_state=123)

# # find all relevant features
# feat_selector.fit(train_features.values, train_labels.values)

# # check selected features
# print(feat_selector.support_)

# # check ranking of features
# print(feat_selector.ranking_)

# # # call transform() on X to filter it down to selected features
# # X_filtered = feat_selector.transform(X)

In [None]:
# result
# [ True False False False False False False  True  True  True  True  True
#   True  True  True False  True  True False False  True  True  True  True
#   True  True  True False False False  True False  True  True  True  True
#  False False  True  True  True  True  True  True  True  True  True  True
#  False False False False  True  True  True  True  True  True  True  True
#   True  True  True  True  True  True False  True  True  True  True  True
#   True  True  True  True  True  True  True  True  True  True  True  True
#   True False False False  True False False False  True False  True  True
#   True  True  True  True  True  True  True  True  True  True  True  True
#   True False False  True False  True  True  True  True  True  True  True
#   True  True False False  True  True  True False False False False  True
#   True  True  True  True False  True False False False False  True  True
#  False  True  True  True  True False  True  True False False False  True
#   True  True  True  True  True  True  True  True  True  True  True False
#  False False]
# [ 1 20 22 42 21 23 28  1  1  1  1  1  1  1  1 13  1  1  7 30  1  1  1  1
#   1  1  1 43 25 27  1  3  1  1  1  1  3  2  1  1  1  1  1  1  1  1  1  1
#   5  3 24 31  1  1  1  1  1  1  1  1  1  1  1  1  1  1 11  1  1  1  1  1
#   1  1  1  1  1  1  1  1  1  1  1  1  1 11 15 16  1 19  2 36  1 44  1  1
#   1  1  1  1  1  1  1  1  1  1  1  1  1 34  8  1 11  1  1  1  1  1  1  1
#   1  1 39  5  1  1  1 17  9 25 14  1  1  1  1  1 17  1 37 29 41 39  1  1
#   2  1  1  1  1  2  1  1  2 35 32  1  1  1  1  1  1  1  1  1  1  1  1  2
#  38 32]