In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import seaborn as sns
import os
from collections import Counter


from matplotlib import pyplot as plt
import numpy as np
from discover_feature_relationships import discover
from common import *

In [2]:
# watermark is optional - it shows the versions of installed libraries
# so it is useful to confirm your library versions when you submit bug reports to projects
# install watermark using
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
# show a watermark for this environment
%watermark -d -m -v -p numpy,matplotlib,sklearn -g

2019-02-11 

CPython 3.6.8
IPython 7.2.0

numpy 1.15.4
matplotlib 3.0.2
sklearn 0.20.2

compiler   : GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 18.2.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
Git hash   : 3f684a6dec4e01f878c611b24f8e037be722e680


In [3]:
train = pd.read_csv('aps_failure_training_set.csv',skiprows=20)
data = pre_processing(train,np.nan)

In [5]:
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import svm

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import make_scorer
from sklearn import linear_model
from imblearn.over_sampling import SMOTE  # or: import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline

In [6]:
RBM_2 = BernoulliRBM(n_components=2)
RBM_15 = BernoulliRBM(n_components=15)
imp = SimpleImputer(missing_values=np.nan, strategy='median')
anova_filter = SelectKBest(f_regression, k=15)
scaler = StandardScaler()
clf = svm.SVC(kernel='linear')
logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=10000)
smote = SMOTE(random_state=444)

# Pipe Lines

To check the use of RBM, let's create 3 pipelines with similar steps:
- Imputer (median value)
- Scaler (Standard Scaler)
- SMOTE
- RBM or Feature selection using ANOVA
- Logistic regression or SVM

In [23]:
rbm_features_15_svm = imbPipeline(steps=[("imputer",imp),("scale",scaler),("smote",smote),('rbm', RBM_15),("svm",clf) ])
rbm_features_15_logit = imbPipeline(steps=[("imputer",imp),("scale",scaler),("smote",smote),('rbm', RBM_15),("logit",logistic) ])

anova_features_15 = Pipeline(steps=[("imputer",imp),("scale",scaler),('anova', anova_filter),("svm",clf) ])
anova_features_smote_15 = imbPipeline(steps=[("imputer",imp),("scale",scaler),("smote",smote),('anova', anova_filter),("svm",clf) ])

In [8]:
X = data.drop("class",axis=1).values
y = data["class"].values

In [12]:
scoring = {'precison':  make_scorer(precision_score),'recall':  make_scorer(recall_score)}
scores = {}

In [13]:
scores["SVM"]=cross_validate(rbm_features_15_svm,X, y, cv=3, scoring=scoring)
scores["Logit"]=cross_validate(rbm_features_15_logit,X, y, cv=3, scoring=scoring)
scores["Anova_SVM"]=cross_validate(anova_features_15,X, y, cv=3, scoring=scoring)

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [28]:
scores["SMOTE_Anova_SVM"]= cross_validate(anova_features_smote_15,X, y, cv=3, scoring=scoring)

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [29]:
for k in scores.keys():
    print(f"{k} :")
    print("%15s => %02d%% +/- %02d%%"%("precision",np.mean(scores[k]['test_precison'])*100,np.std(scores[k]['test_precison'])*100*2))
    print("%15s => %02d%% +/- %02d%%"%("recall",np.mean(scores[k]['test_recall'])*100,np.std(scores[k]['test_recall'])*100*2))

SVM :
      precision => 07% +/- 00%
         recall => 97% +/- 00%
Logit :
      precision => 08% +/- 00%
         recall => 97% +/- 00%
Anova_SVM :
      precision => 64% +/- 12%
         recall => 18% +/- 03%
SMOTE_Anova_SVM :
      precision => 28% +/- 03%
         recall => 90% +/- 03%


For those using SMOTE allows a significant improvement in precision/recall (it does not actually converge for the pipeline using RBM without SMOTE).

It will be interesting to look at the output of the RBM.

# RBM ouput:

In [33]:
rbm_output_smote = imbPipeline(steps=[("imputer",imp),("scale",scaler),("smote",smote),('rbm', RBM_15)])
rbm_output_no_smote = Pipeline(steps=[("imputer",imp),("scale",scaler),('rbm', RBM_15)])





In [36]:
rbm_features = rbm_output_no_smote.fit_transform(X)


In [42]:
rbm_features_df = pd.DataFrame(rbm_features)
print(rbm_features_df.drop_duplicates().shape)
rbm_features_df.sample(10)

(603, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
38210,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
40960,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5620,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4359,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
46028,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
33525,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7560,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
36717,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Understanding the output

There seems to be a lot of duplicates, let's count the instance.

For that we will calculate a unique column, as if each row was an encoding of a number in base 2.

In [53]:
categories=rbm_features_df.apply(lambda r:sum( x*(2**i) for i,x in enumerate(r)),axis=1)

In [56]:
categories_df = categories.to_frame().rename(columns={0:"categories"})
categories_df["Class"] = data["class"]

It turns out that not all feature are 1 or 0, in rare case there are some float values (see below).

To understand how the different values are distributed, we will just consider rounding then to make the output easier to read.

In [65]:
categories_df[categories_df.categories < np.round(categories_df.categories)]

Unnamed: 0,categories,Class
900,32767.0,0
2071,32767.0,0
3245,1137.586068,0
4891,32767.0,0
9394,32766.999994,0
10618,464.959876,0
10634,444.738571,0
12136,0.726554,0
12925,32766.999998,0
15376,32767.0,0


In [66]:
pd.crosstab(np.round(categories_df.categories),categories_df.Class)

Class,0,1
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,14287,983
1.0,3,0
15.0,1,0
31.0,1,0
37.0,1,0
54.0,1,0
374.0,1,0
410.0,1,0
416.0,1,0
445.0,1,0
