In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set_style('whitegrid')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [2]:
training_data = pd.read_csv("aps_failure_training_set.csv",na_values="na")
training_data.head()

FileNotFoundError: [Errno 2] File b'aps_failure_training_set.csv' does not exist: b'aps_failure_training_set.csv'

# Preprocessing

In [None]:
plt.figure(figsize=(20,12))
sns.heatmap(training_data.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

# Missing value handling

We are going to use different approches with missing values:

1. Removing the column having 80% missing values (**Self intuition)
2. Keeping all the features
3. Later, we will try to implement some feature engineering 


**For the rest of the missing values, we are replacing them with their mean() for now (**Ref) 

<big><b>Second Approach</b>

In [None]:
sample_training_data = training_data
sample_training_data.fillna(sample_training_data.mean(),inplace=True)

#after replacing with mean()

plt.figure(figsize=(20,12))
sns.heatmap(sample_training_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
#as all the other values are numerical except Class column so we can replace them with 1 and 0

sample_training_data = sample_training_data.replace('neg',0)
sample_training_data = sample_training_data.replace('pos',1)

sample_training_data.head()

# Testing Data preprocessing

In [None]:
testing_data = pd.read_csv("aps_failure_test_set.csv",na_values="na")
testing_data.head()

In [None]:
sample_testing_data = testing_data
sample_testing_data.fillna(sample_testing_data.mean(),inplace=True)

#after replacing with mean()

plt.figure(figsize=(20,12))
sns.heatmap(sample_testing_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
#as all the other values are numerical except Class column so we can replace them with 1 and 0

sample_testing_data = sample_testing_data.replace('neg',0)
sample_testing_data = sample_testing_data.replace('pos',1)

sample_testing_data.head()

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(random_state=42)

In [None]:
X = sample_training_data.drop('class',axis=1)
y = sample_training_data['class']

In [None]:
features = X.columns.values
features.tolist()

In [None]:
rf.fit(X,y)
rf.feature_importances_

In [None]:
 model = SelectFromModel(rf, prefit=True)

In [None]:
 X_new = model.transform(X)
 

 

In [None]:
X=X_new
X.shape

In [None]:
#Print the chosen features
features = np.array(features.tolist())
print(features[model.get_support()])

In [None]:
testData_X = sample_testing_data.drop('class',axis=1)
testData_y = sample_testing_data['class']

In [None]:
#Test data transformation
newtestdata=testData_X.loc[:, features[model.get_support()]]

In [None]:
newtestdata.head()

# Test data implementation

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(n_estimators=380,max_features='log2',random_state=42,oob_score=True,warm_start=True)

In [None]:
rf.fit(X,y)

In [None]:
testDataPrediction = rf.predict(newtestdata)

In [None]:
print(classification_report(testData_y,testDataPrediction))

In [None]:
print(metrics.accuracy_score(testData_y, testDataPrediction))

In [None]:
print(metrics.r2_score(testData_y, testDataPrediction)) 

In [None]:
print(metrics.f1_score(testData_y, testDataPrediction))

In [None]:
#testing error
print(metrics.mean_squared_error(testData_y, testDataPrediction)) 

In [None]:
#Training error
temp = rf.predict(X)
mean_squared_error(y,temp) 

In [None]:
#confusion matrix
print(confusion_matrix(testData_y, testDataPrediction))

In [None]:
tn, fp, fn, tp = confusion_matrix(testData_y, testDataPrediction).ravel()
confusionData = [[tn,fp],[fn,tp]]
pd.DataFrame(confusionData,columns=['FN','FP'],index=['TN','TP'])

In [None]:
#without modified threshold
cost = 10*fp+500*fn
values = {'Score':[cost],'Number of Type 1 faults':[fp],'Number of Type 2 faults':[fn]}
pd.DataFrame(values)

In [None]:
#with different threshold
THRESHOLD = 0.02 #optimal one chosen manually

thresholdPrediction = (rf.predict_proba(newtestdata)[:,1] >= THRESHOLD).astype(bool)


tn, fp, fn, tp = confusion_matrix(testData_y,thresholdPrediction).ravel()
cost = 10*fp+500*fn
values = {'Score':[cost],'Number of Type 1 faults':[fp],'Number of Type 2 faults':[fn]}
pd.DataFrame(values)

# Final Score is 10810

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

y_true = ["Pos", "Neg"]
y_pred = ["Pos", "Neg"]
data = confusion_matrix(testData_y, thresholdPrediction)
df_cm = pd.DataFrame(data, columns=np.unique(y_true), index = np.unique(y_true))
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)#for label size
sn.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16})# font size

In [None]:
#precision Score

preScore= tp/(tp+fp)
preScore

In [None]:
#Recall Score

reScore = tp/(tp+fn) 
reScore

In [None]:
#Information (Precision-Recall)
#high precision relates to a low false positive rate, 
#and high recall relates to a low false negative rate. 
#High scores for both show that the classifier is returning accurate results (high precision), 
#as well as returning a majority of all positive results (high recall).