In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set_style('whitegrid')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training set; the literal string "na" in the CSV is parsed as a missing value.
training_data = pd.read_csv(
    filepath_or_buffer="aps_failure_training_set.csv",
    na_values="na",
)
training_data.head()

FileNotFoundError: [Errno 2] File b'aps_failure_training_set.csv' does not exist: b'aps_failure_training_set.csv'

# Preprocessing

In [None]:
# Visualise where the training frame has missing values (one heatmap row per sample).
missing_mask = training_data.isnull()
plt.figure(figsize=(20, 12))
sns.heatmap(
    missing_mask,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

# Missing value handling

We are going to try different approaches to handling missing values:

1. Removing the columns that have 80% missing values (based on our own intuition)
2. Keeping all the features
3. Later, we will try to implement some feature engineering 


**Note:** For the rest of the missing values, we replace each one with its column `mean()` for now (see Ref).

<big><b>Second Approach</b></big>

In [None]:
# Work on a copy: plain assignment would only alias `training_data`, so the
# imputation below would silently overwrite the raw frame as well.
sample_training_data = training_data.copy()

# Replace every missing numeric value with the mean of its column.
# numeric_only=True leaves out the string-valued `class` column, which recent
# pandas versions refuse to average.
train_column_means = sample_training_data.mean(numeric_only=True)
sample_training_data = sample_training_data.fillna(train_column_means)

#after replacing with mean()

plt.figure(figsize=(20,12))
sns.heatmap(sample_training_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# All feature columns are numeric; only `class` holds the strings 'neg'/'pos'.
# Encode just that column as 0/1 and cast it to int explicitly, instead of
# relying on a frame-wide replace() to downcast the object column implicitly.
train_class_codes = sample_training_data['class'].replace({'neg': 0, 'pos': 1}).astype(int)
sample_training_data = sample_training_data.assign(**{'class': train_class_codes})

sample_training_data.head()

# Testing Data preprocessing

In [None]:
# Load the held-out test set with the same "na" -> missing-value convention as the training file.
testing_data = pd.read_csv(
    filepath_or_buffer="aps_failure_test_set.csv",
    na_values="na",
)
testing_data.head()

In [None]:
# Work on a copy: plain assignment would only alias `testing_data`, so the
# imputation below would silently overwrite the raw frame as well.
sample_testing_data = testing_data.copy()

# Impute with the TRAINING-set column means rather than the test set's own
# means, so no statistic computed on the test data leaks into preprocessing.
# numeric_only=True skips the string-valued `class` column.
train_means_for_test = training_data.mean(numeric_only=True)
sample_testing_data = sample_testing_data.fillna(train_means_for_test)

#after replacing with mean()

plt.figure(figsize=(20,12))
sns.heatmap(sample_testing_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# All feature columns are numeric; only `class` holds the strings 'neg'/'pos'.
# Encode just that column as 0/1 and cast it to int explicitly, instead of
# relying on a frame-wide replace() to downcast the object column implicitly.
test_class_codes = sample_testing_data['class'].replace({'neg': 0, 'pos': 1}).astype(int)
sample_testing_data = sample_testing_data.assign(**{'class': test_class_codes})

sample_testing_data.head()

# Model implementation with Cross validation (All features)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest: only the seed is set explicitly, every other
# hyperparameter is left at the library default.
rf= RandomForestClassifier(random_state=42)

In [None]:
# Target is the encoded `class` column; every remaining column is a feature.
y = sample_training_data['class']
X = sample_training_data.drop(columns='class')

In [None]:
# Two independent 5-fold runs: one collects the out-of-fold prediction for each
# sample, the other collects the per-fold score.
CV_prediction = cross_val_predict(estimator=rf, X=X, y=y, cv=5)
CV_score = cross_val_score(estimator=rf, X=X, y=y, cv=5)

In [None]:
# Average of the per-fold cross-validation scores.
CV_score.mean()

In [None]:
# Per-class precision / recall / F1 for the out-of-fold predictions.
print(classification_report(y_true=y, y_pred=CV_prediction))

In [None]:
# confusion_matrix rows are the actual classes and columns the predicted
# classes, so for labels 0/1 the flattened order is tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y, CV_prediction).ravel()
confusionData = [[tn,fp],[fn,tp]]
# Label the axes by what they really are (the previous FN/FP and TN/TP labels
# did not match the cell contents).
pd.DataFrame(confusionData,
             columns=['Predicted neg (0)','Predicted pos (1)'],
             index=['Actual neg (0)','Actual pos (1)'])

In [None]:
# Total cost: 10 per false positive (type 1 fault) plus 500 per false negative (type 2 fault).
false_positive_cost = 10*fp
false_negative_cost = 500*fn
cost = false_positive_cost + false_negative_cost
values = {
    'Score': [cost],
    'Number of Type 1 faults': [fp],
    'Number of Type 2 faults': [fn],
}
pd.DataFrame(values)

In [None]:
# Fraction of samples whose out-of-fold prediction matches the label.
print(metrics.accuracy_score(y_true=y, y_pred=CV_prediction))

In [None]:
# R^2 computed on the 0/1 labels.
# NOTE(review): this is a regression metric; interpret with care for a classifier.
print(metrics.r2_score(y_true=y, y_pred=CV_prediction))

In [None]:
# F1 score of the out-of-fold predictions for the positive class (label 1).
print(metrics.f1_score(y_true=y, y_pred=CV_prediction))

In [None]:
# Mean squared error between labels and predictions; with 0/1 values on both
# sides this is the share of misclassified samples.
print(mean_squared_error(y_true=y, y_pred=CV_prediction))

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Fit the baseline forest on all features so its importances can be inspected.
rf.fit(X=X, y=y)


In [None]:
# Importance value of each feature in the fitted forest, in the column order of X.
rf.feature_importances_

In [None]:
# Horizontal bar chart of the 170 largest importances, smallest at the bottom.
feat_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
plt.figure(figsize=(40, 40))
largest_170 = feat_importances.nlargest(170).sort_values()
largest_170.plot.barh()
plt.xlabel('Variable Importance', fontsize=35)

In [None]:
# Column names of X as a NumPy array (kept for indexing with the selector mask later),
# shown here as a plain list.
features = X.columns.to_numpy()
features.tolist()

In [None]:
# Horizontal bar chart restricted to the 40 largest importances.
feat_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
plt.figure(figsize=(40, 40))
largest_40 = feat_importances.nlargest(40).sort_values()
largest_40.plot.barh()
plt.xlabel('Variable Importance', fontsize=35)


In [None]:
 # Wrap the already-fitted forest as a feature selector (prefit=True: no refit).
 # NOTE(review): no threshold is passed, so the library default decides how many
 # features survive — presumably not exactly 40; confirm with X_new.shape.
 model = SelectFromModel(rf, prefit=True)

In [None]:
 # Reduce X to the columns the selector keeps.
 X_new = model.transform(X)
 

In [None]:
# From here on `X` is the reduced feature matrix.
# NOTE(review): if transform returned a plain array, X no longer has `.columns`,
# so re-running earlier cells that use X.columns would fail — confirm.
X=X_new
X.shape

In [None]:
# Names of the selected features: index the column-name array with the selector's boolean mask.
features = np.array(features.tolist())
selected_mask = model.get_support()
print(features[selected_mask])

# Cross validation with selected features

In [None]:
# Same target as before; features are now the selector's reduced matrix.
y = sample_training_data['class']
X = X_new

In [None]:
# Shape check of the reduced feature matrix.
X.shape

In [None]:
# Repeat the 5-fold evaluation on the reduced feature matrix:
# out-of-fold predictions first, then the per-fold scores.
CV_prediction = cross_val_predict(estimator=rf, X=X, y=y, cv=5)
CV_score = cross_val_score(estimator=rf, X=X, y=y, cv=5)

In [None]:
# Average of the per-fold scores for the selected-feature run.
CV_score.mean()

In [None]:
# Per-class precision / recall / F1 on the selected features.
print(classification_report(y_true=y, y_pred=CV_prediction))

In [None]:
# confusion_matrix rows are the actual classes and columns the predicted
# classes, so for labels 0/1 the flattened order is tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y, CV_prediction).ravel()
confusionData = [[tn,fp],[fn,tp]]
# Label the axes by what they really are (the previous FN/FP and TN/TP labels
# did not match the cell contents).
pd.DataFrame(confusionData,
             columns=['Predicted neg (0)','Predicted pos (1)'],
             index=['Actual neg (0)','Actual pos (1)'])

In [None]:
# Total cost: 10 per false positive (type 1 fault) plus 500 per false negative (type 2 fault).
false_positive_cost = 10*fp
false_negative_cost = 500*fn
cost = false_positive_cost + false_negative_cost
values = {
    'Score': [cost],
    'Number of Type 1 faults': [fp],
    'Number of Type 2 faults': [fn],
}
pd.DataFrame(values)

In [None]:
# Accuracy of the out-of-fold predictions on the selected features.
print(metrics.accuracy_score(y_true=y, y_pred=CV_prediction))

In [None]:
# R^2 on the 0/1 labels.
# NOTE(review): this is a regression metric; interpret with care for a classifier.
print(metrics.r2_score(y_true=y, y_pred=CV_prediction))

In [None]:
# F1 score for the positive class (label 1) on the selected features.
print(metrics.f1_score(y_true=y, y_pred=CV_prediction))

In [None]:
# Mean squared error; with 0/1 labels and predictions this is the misclassification rate.
print(mean_squared_error(y_true=y, y_pred=CV_prediction))

In [None]:
#Our cost reduces with the use of selected features

# OOB Curve generation for optimal number of trees

In [None]:
# Hold out 30% of the feature-selected data (X is the SelectFromModel output;
# check X.shape for the actual number of features kept).
# stratify=y keeps the proportion of positive samples the same in both splits,
# so the held-out set is not skewed by an unlucky draw of the rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y)

In [None]:
#Determining the optimal n_estimators
from collections import OrderedDict

RANDOM_STATE = 42

# Three forests that differ only in `max_features`.
# warm_start=True makes each fit() call reuse the trees grown so far and add
# only the missing ones, which is what lets the OOB error be tracked as the
# forest grows; oob_score=True makes `oob_score_` available after each fit.
forest_variants = [
    ("RandomForestClassifier, max_features='sqrt'", "sqrt"),
    ("RandomForestClassifier, max_features='log2'", 'log2'),
    ("RandomForestClassifier, max_features=None", None),
]
ensemble_clfs = [
    (variant_label,
     RandomForestClassifier(n_estimators=100,
                            warm_start=True,
                            oob_score=True,
                            max_features=variant_setting,
                            random_state=RANDOM_STATE,
                            n_jobs=-1))
    for variant_label, variant_setting in forest_variants
]

# One list of (<n_estimators>, <error rate>) pairs per classifier label,
# in the same order as `ensemble_clfs`.
error_rate = OrderedDict([(entry[0], []) for entry in ensemble_clfs])

# Inclusive range of forest sizes to evaluate.
min_estimators = 10
max_estimators = 500
for label, clf in ensemble_clfs:
    trajectory = error_rate[label]
    for n_trees in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=n_trees)
        clf.fit(X_train, y_train)

        # OOB error is the complement of the OOB score at this forest size.
        trajectory.append((n_trees, 1 - clf.oob_score_))

# One "OOB error rate" vs. "n_estimators" line per classifier.
for label, clf_err in error_rate.items():
    tree_counts = [point[0] for point in clf_err]
    oob_errors = [point[1] for point in clf_err]
    plt.plot(tree_counts, oob_errors, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()


In [None]:
## Setting the parameters as per the oob curve
#curve shows the minimum point at approx 380 trees and max_features=log2
# warm_start is intentionally NOT enabled here: this estimator is fitted more
# than once below (on X_train, later on the full X) with the same n_estimators,
# and with warm_start=True the second fit() would keep the old trees and train
# nothing new.
rf=RandomForestClassifier(n_estimators=380,max_features='log2',random_state=42,oob_score=True)

# Cross Validation

In [None]:
# 5-fold evaluation of the tuned forest: out-of-fold predictions and per-fold scores.
CV_prediction = cross_val_predict(estimator=rf, X=X, y=y, cv=5)
CV_score = cross_val_score(estimator=rf, X=X, y=y, cv=5)

In [None]:
# Average of the five per-fold scores for the tuned forest.
CV_score.mean()

In [None]:
# Per-class precision / recall / F1 for the tuned forest (5-fold).
print(classification_report(y_true=y, y_pred=CV_prediction))

In [None]:
# confusion_matrix rows are the actual classes and columns the predicted
# classes, so for labels 0/1 the flattened order is tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y, CV_prediction).ravel()
confusionData = [[tn,fp],[fn,tp]]
# Label the axes by what they really are (the previous FN/FP and TN/TP labels
# did not match the cell contents).
pd.DataFrame(confusionData,
             columns=['Predicted neg (0)','Predicted pos (1)'],
             index=['Actual neg (0)','Actual pos (1)'])

In [None]:
# Total cost: 10 per false positive (type 1 fault) plus 500 per false negative (type 2 fault).
false_positive_cost = 10*fp
false_negative_cost = 500*fn
cost = false_positive_cost + false_negative_cost
values = {
    'Score': [cost],
    'Number of Type 1 faults': [fp],
    'Number of Type 2 faults': [fn],
}
pd.DataFrame(values)

In [None]:
# Accuracy of the tuned forest's out-of-fold predictions (5-fold).
print(metrics.accuracy_score(y_true=y, y_pred=CV_prediction))

In [None]:
# R^2 on the 0/1 labels.
# NOTE(review): this is a regression metric; interpret with care for a classifier.
print(metrics.r2_score(y_true=y, y_pred=CV_prediction))

In [None]:
# F1 score for the positive class (label 1), tuned forest, 5-fold.
print(metrics.f1_score(y_true=y, y_pred=CV_prediction))

In [None]:
# Mean squared error; with 0/1 labels and predictions this is the misclassification rate.
print(mean_squared_error(y_true=y, y_pred=CV_prediction))

# CV=10

In [None]:
# Same evaluation with 10 folds; results go to separate `...2` variables so the
# 5-fold results stay available.
CV_prediction2 = cross_val_predict(estimator=rf, X=X, y=y, cv=10)
CV_score2 = cross_val_score(estimator=rf, X=X, y=y, cv=10)

In [None]:
# Average of the ten per-fold scores.
CV_score2.mean()

In [None]:
# Per-class precision / recall / F1 for the 10-fold predictions.
print(classification_report(y_true=y, y_pred=CV_prediction2))

In [None]:
# confusion_matrix rows are the actual classes and columns the predicted
# classes, so for labels 0/1 the flattened order is tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y, CV_prediction2).ravel()
confusionData = [[tn,fp],[fn,tp]]
# Label the axes by what they really are (the previous FN/FP and TN/TP labels
# did not match the cell contents).
pd.DataFrame(confusionData,
             columns=['Predicted neg (0)','Predicted pos (1)'],
             index=['Actual neg (0)','Actual pos (1)'])

In [None]:
# Total cost: 10 per false positive (type 1 fault) plus 500 per false negative (type 2 fault).
false_positive_cost = 10*fp
false_negative_cost = 500*fn
cost = false_positive_cost + false_negative_cost
values = {
    'Score': [cost],
    'Number of Type 1 faults': [fp],
    'Number of Type 2 faults': [fn],
}
pd.DataFrame(values)

In [None]:
# Accuracy of the 10-fold out-of-fold predictions.
print(metrics.accuracy_score(y_true=y, y_pred=CV_prediction2))

In [None]:
# R^2 on the 0/1 labels.
# NOTE(review): this is a regression metric; interpret with care for a classifier.
print(metrics.r2_score(y_true=y, y_pred=CV_prediction2))

In [None]:
# F1 score for the positive class (label 1), 10-fold predictions.
print(metrics.f1_score(y_true=y, y_pred=CV_prediction2))

In [None]:
# Mean squared error; with 0/1 labels and predictions this is the misclassification rate.
print(mean_squared_error(y_true=y, y_pred=CV_prediction2))

# Try with test train split

In [None]:
# Train the tuned forest on the 70% training split only.
rf.fit(X=X_train, y=y_train)

In [None]:
# Out-of-bag score of the forest fitted above (available because it was built with oob_score=True).
rf.oob_score_

In [None]:
# Hard class predictions on the held-out split, using the forest's default decision rule.
regularPrediction = rf.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=regularPrediction))

In [None]:
# Accuracy on the held-out split.
print(metrics.accuracy_score(y_true=y_test, y_pred=regularPrediction))

In [None]:
# R^2 on the held-out 0/1 labels.
# NOTE(review): this is a regression metric; interpret with care for a classifier.
print(metrics.r2_score(y_true=y_test, y_pred=regularPrediction))

In [None]:
# F1 score for the positive class (label 1) on the held-out split.
print(metrics.f1_score(y_true=y_test, y_pred=regularPrediction))

In [None]:
# Test error: mean squared error on the held-out split (the misclassification
# rate, since labels and predictions are both 0/1).
print(metrics.mean_squared_error(y_true=y_test, y_pred=regularPrediction))

In [None]:
# Training error: the same metric on the data the forest was fitted on, for
# comparison with the test error.
temp = rf.predict(X=X_train)
mean_squared_error(y_true=y_train, y_pred=temp)

In [None]:
# Raw 2x2 confusion matrix for the held-out split (rows: actual class, columns: predicted class).
holdout_confusion = confusion_matrix(y_true=y_test, y_pred=regularPrediction)
print(holdout_confusion)

In [None]:
# confusion_matrix rows are the actual classes and columns the predicted
# classes, so for labels 0/1 the flattened order is tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test,regularPrediction).ravel()
confusionData = [[tn,fp],[fn,tp]]
# Label the axes by what they really are (the previous FN/FP and TN/TP labels
# did not match the cell contents).
pd.DataFrame(confusionData,
             columns=['Predicted neg (0)','Predicted pos (1)'],
             index=['Actual neg (0)','Actual pos (1)'])

In [None]:
# Cost at the classifier's default decision rule (no custom threshold):
# 10 per false positive (type 1 fault) plus 500 per false negative (type 2 fault).
false_positive_cost = 10*fp
false_negative_cost = 500*fn
cost = false_positive_cost + false_negative_cost
values = {
    'Score': [cost],
    'Number of Type 1 faults': [fp],
    'Number of Type 2 faults': [fn],
}
pd.DataFrame(values)

In [None]:

#ROC 

import matplotlib.pyplot as plt
from sklearn import metrics

def roccurve(y_values, y_preds_proba, save_path='roc_and_threshold.png'):
    """Plot the ROC curve together with the decision threshold on a twin axis.

    The left axis shows the true-positive rate against the false-positive
    rate (plus the diagonal of a random model); the right axis shows the
    score threshold that produces each false-positive rate. The figure is
    saved to ``save_path`` and then shown.

    Parameters
    ----------
    y_values : array-like
        True binary labels.
    y_preds_proba : array-like
        Predicted scores/probabilities for the positive class.
    save_path : str, optional
        File the figure is written to. Defaults to the name that was
        previously hard-coded.

    Returns
    -------
    None
    """
    fpr, tpr, thresholds = metrics.roc_curve(y_values, y_preds_proba)
    xx = np.arange(101) / float(100)
    aur = metrics.auc(fpr,tpr)
    plt.figure(figsize=(30,30))
    plt.xlim(0, 1.0)
    plt.ylim(0, 1.25)

    plt.plot(xx,xx, color='blue', label='Random Model')
    plt.plot(fpr,tpr, color='red', label='User Model')
    plt.title("ROC Curve - AUR value ="+str(aur),fontsize=35)
    plt.xlabel('% false positives',fontsize=35)
    plt.ylabel('% true positives',fontsize=35)
    plt.legend()

    # The first threshold returned by roc_curve is a sentinel meaning "predict
    # nothing as positive"; recent scikit-learn releases set it to +inf, which
    # cannot be plotted or used as an axis limit. Keep only finite thresholds;
    # when every threshold is finite this selects all points, as before.
    finite = np.isfinite(thresholds)
    finite_thresholds = thresholds[finite]

    # create the axis of thresholds (scores)
    ax2 = plt.gca().twinx()
    ax2.plot(fpr[finite], finite_thresholds, markeredgecolor='g',linestyle='dashed', color='g',label = 'Threshold')
    ax2.set_ylabel('Threshold',color='g',fontsize=35)
    # Thresholds are returned in decreasing order, so last/first finite values
    # are the minimum/maximum.
    ax2.set_ylim([finite_thresholds[-1],finite_thresholds[0]])
    ax2.set_xlim([fpr[0],fpr[-1]])
    plt.legend(loc="lower right")
    plt.savefig(save_path)
    plt.show()
    
    
# Class probabilities on the held-out split; the second column ([:,1]) is
# passed on as the positive-class score (labels were encoded neg=0, pos=1).
y_pred_proba = rf.predict_proba(X=X_test)
roccurve(y_values=y_test, y_preds_proba=y_pred_proba[:,1])



In [None]:
#with different threshold
THRESHOLD = 0.02 #optimal one chosen manually

# Predict positive whenever the positive-class probability reaches THRESHOLD.
# Cast to int (not bool) so the predictions have the same 0/1 integer labels
# as y_test instead of relying on bool/int equivalence inside confusion_matrix.
positive_proba = rf.predict_proba(X_test)[:,1]
thresholdPrediction = (positive_proba >= THRESHOLD).astype(int)


# Cost at the custom threshold: 10 per false positive, 500 per false negative.
tn, fp, fn, tp = confusion_matrix(y_test,thresholdPrediction).ravel()
cost = 10*fp+500*fn
values = {'Score':[cost],'Number of Type 1 faults':[fp],'Number of Type 2 faults':[fn]}
pd.DataFrame(values)

# Testing Data implementation 

In [None]:
# Refit on the full feature-selected training data.
# `rf` has already been fitted on X_train; if warm start is enabled, calling
# fit() again with the same n_estimators keeps the old trees and trains
# nothing new, so switch it off first to force a real refit.
rf.set_params(warm_start=False)
rf.fit(X,y)


In [None]:
# Split the preprocessed test frame into target (`class`) and features (everything else).
testData_y = sample_testing_data['class']
testData_X = sample_testing_data.drop(columns='class')

In [None]:
#Testing data implemented in another file

In [None]:
# Exploring sampling: start by counting how many test rows fall in each class.

from collections import Counter
Counter(sample_testing_data['class'])

In [None]:
# Number of test rows per class. Uses the Series method because the top-level
# pd.value_counts function is deprecated in recent pandas.
count_class = sample_testing_data['class'].value_counts()
print(count_class)

In [None]:
# Bar chart of the class frequencies computed above.
plt.figure(figsize=(8, 5))
count_class.plot.bar()
plt.ylabel('Frequency')
plt.xlabel('Class')