### Import Libraries & Read in Data

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import seaborn as sns

# import google drive
from google.colab import drive
drive.mount('/content/drive/')

# Change directory to google drive- Just upload the file right into the drive you want(Uchennamachine) for easy access
%cd /content/drive/My Drive/

df = pd.read_csv("nasa_data.csv")

#define titanic - you'd need this going forward
nasa = pd.read_csv('nasa_data.csv')
nasa.head()

Mounted at /content/drive/
/content/drive/My Drive


Unnamed: 0,unit_number,time_in_cycles,Altitud,Mach Number,TRA,T2,T24,T30,T50,P2,P15,P30,Nf,Nc,epr,Ps30,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32,condition,max_cycles,target
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,1,192,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,1,192,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,1,192,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,1,192,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,1,192,187


In [None]:
# Derive a binary label from the `target` column using a threshold of 25:
# rows with target <= 25 are labelled 0, every other row is labelled 1.
target = 25
label_positive = nasa['target'] <= target
nasa['label_target'] = (~label_positive).astype('int64')

# Drop the columns the label was derived from, plus unit_number (judged unlikely to be
# relevant to the process) and condition (just the data set number).
# NOTE(review): 'Unnamed: 0' is not dropped and is still present in X_train further down;
# it looks like a saved row index - confirm it is meant to be used as a feature.
nasa = nasa.drop(columns=['max_cycles', 'target', 'unit_number', 'condition'])

### Split into train and test set
Note: The final test set is not included here, so technically the "test" set referred to in this section is the validation set.



In [None]:
# Separate the binary label from the predictor columns.
y = nasa['label_target']
X = nasa.drop(columns=['label_target'])

# Stratified 67/33 split with a fixed seed. An actual test set already exists elsewhere,
# so the held-out part created here serves as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [None]:
# Sanity check: print each partition's share of all rows (expected 0.67, then 0.33).
total_rows = len(y)
for partition in (y_train, y_val):
    share = len(partition) / total_rows
    print(round(share, 2))

0.67
0.33


In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,time_in_cycles,Altitud,Mach Number,TRA,T2,T24,T30,T50,P2,P15,P30,Nf,Nc,epr,Ps30,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32
159047,299,35.002,0.8417,100.0,449.44,555.1,1369.15,1140.66,5.48,7.98,198.02,2223.46,8387.89,1.04,42.45,186.91,2388.57,8107.35,9.1173,0.02,336,2223,100.0,15.15,9.2185
32086,64,42.0064,0.84,100.0,445.0,549.32,1352.52,1114.88,3.91,5.71,138.64,2211.92,8327.77,1.02,42.14,130.6,2387.94,8087.11,9.3617,0.02,329,2212,100.0,10.65,6.2992
155840,196,35.005,0.84,100.0,449.44,555.15,1363.23,1126.76,5.48,7.98,196.53,2223.24,8363.38,1.03,41.95,184.96,2388.39,8089.61,9.2291,0.02,334,2223,100.0,14.98,9.1078
134494,98,20.0057,0.7007,100.0,491.19,607.12,1479.43,1245.62,9.35,13.65,335.13,2323.97,8738.27,1.08,43.99,315.5,2388.14,8079.62,9.2126,0.02,364,2324,100.0,24.61,14.659
126998,25,25.0076,0.6215,60.0,462.54,537.0,1261.28,1051.99,7.05,9.02,175.36,1915.08,7997.95,0.94,36.81,164.57,2027.95,7860.8,10.9176,0.02,308,1915,84.93,14.33,8.465


### Write out all data

In [None]:
# Persist the four partitions as CSV files in the current working directory, without the
# index. Features are written before labels, and train before validation.
csv_outputs = [
    ('nasatrain_features.csv', X_train),
    ('nasaval_features.csv', X_val),
    ('nasatrain_labels.csv', y_train),
    ('nasaval_labels.csv', y_val),
]
for csv_name, partition in csv_outputs:
    partition.to_csv(csv_name, index=False)


In [None]:
# Reload the training features and labels from the CSV files written above.
tr_features, tr_labels = (
    pd.read_csv(csv_name)
    for csv_name in ('nasatrain_features.csv', 'nasatrain_labels.csv')
)

In [None]:
# Helper that reports the outcome of a fitted hyper-parameter search.
def print_results(results):
    """Print the best parameters and one summary line per evaluated combination.

    `results` must expose `best_params_` and a `cv_results_` mapping with the keys
    'mean_test_score', 'std_test_score' and 'params' (as the fitted search object
    passed in by the cell below does).

    Each line shows the mean test score and twice the standard deviation of the test
    score, both rounded to 3 decimals, followed by the parameter combination.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv_table = results.cv_results_
    rows = zip(
        cv_table['mean_test_score'],
        cv_table['std_test_score'],
        cv_table['params'],
    )
    for avg_score, score_std, combo in rows:
        avg_shown = round(avg_score, 3)
        spread_shown = round(score_std * 2, 3)
        print(f'{avg_shown} (+/-{spread_shown}) for {combo}')

# Decision Tree

In [None]:
# Libraries for the decision-tree section: joblib to persist the model, sklearn's tree
# module for the classifier and GridSearchCV for the hyper-parameter search.
# pandas is already imported in the first cell; the repeated import here is harmless.
import joblib
import pandas as pd
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import warnings
# Silence FutureWarning / DeprecationWarning messages so they do not flood the cell
# outputs below.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [None]:
# Tune a decision tree with an exhaustive grid search and report every combination.
# A fixed random_state makes the search repeatable: the grid includes splitter='random',
# and the tree also uses randomness when choosing between candidate splits.
dtree = tree.DecisionTreeClassifier(random_state=42)
parameters = {
    # minimum number of samples allowed in a leaf: 1, 5, 10, 15 or 20
    'min_samples_leaf': [1, 5, 10, 15, 20],
    # tree depths to try; None places no limit on the depth
    'max_depth': [2, 4, 8, 16, 32, None],
    # minimum number of samples a node needs before it may be split
    'min_samples_split': [5, 10, 15, 20],
    # impurity measure used to evaluate a split
    'criterion': ['gini', 'entropy'],
    # strategy used to choose the split at each node
    'splitter': ['best', 'random'],
}
# GridSearchCV fits the estimator on the training set for every combination above.
# cv=5 runs 5-fold cross-validation per combination; n_jobs=-1 only spreads those
# independent fits over all available cores and does not change the scores.
# NOTE(review): no `scoring` is passed, so the classifier's default score (accuracy) is
# used to rank combinations - confirm that is the intended selection metric.
cv = GridSearchCV(dtree, parameters, cv=5, n_jobs=-1)
# The labels were read back as a single-column frame; ravel() flattens them to the 1-D
# array the estimator expects.
cv.fit(tr_features, tr_labels.values.ravel())
print_results(cv)

BEST PARAMS: {'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 15, 'splitter': 'best'}

0.909 (+/-0.003) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
0.899 (+/-0.016) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'random'}
0.909 (+/-0.003) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'best'}
0.89 (+/-0.007) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'random'}
0.909 (+/-0.003) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 15, 'splitter': 'best'}
0.891 (+/-0.009) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 15, 'splitter': 'random'}
0.909 (+/-0.003) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20, 'sp

In [None]:
# Show the estimator (with its full parameter set) selected by the grid search above.
cv.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=16, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

# Visualizing the Decision Tree

In [None]:
#Import Libraries
# StringIO is taken from the standard library: sklearn.externals.six was a deprecated
# compatibility shim that newer scikit-learn releases no longer ship, so importing it
# from there fails on a current install. io.StringIO is a drop-in replacement here.
from io import StringIO
from pydotplus import graph_from_dot_data
from IPython.display import Image
from sklearn.tree import export_graphviz


In [None]:
#Plot Tree
# Select the tuned tree BEFORE exporting it. GridSearchCV fits clones of the estimator it
# is given, so the original `dtree` template is never fitted and cannot be exported; the
# fitted model lives in cv.best_estimator_.
dtree = cv.best_estimator_

# Render the tree to DOT text in memory, labelling split nodes with the training column
# names instead of positional feature indices.
dot_data = StringIO()
export_graphviz(
    dtree,
    out_file=dot_data,
    feature_names=list(tr_features.columns),
    filled=True,
    rounded=True,
    impurity=False,
    special_characters=True,
)

# Convert the DOT text to a PNG and display it inline.
graph = graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png(), unconfined=False)



### Write out pickled model
We pickle the best model, i.e. save it to a file, so that it can be reloaded later and its performance compared with that of models trained with other hyperparameters.

In [None]:
# Persist the best estimator found by the grid search to disk with joblib.
# NOTE(review): the path is relative to the working directory set by the earlier `%cd`
# (/content/drive/My Drive), so '../../../' appears to resolve outside the mounted Drive -
# confirm this is the intended location and that the file survives where it is needed.
joblib.dump(cv.best_estimator_, '../../../NASADT_model.pkl')

['../../../NASADT_model.pkl']

In [None]:
# Switch from the GridSearchCV wrapper to the best estimator it found. The name `cv` is
# deliberately kept rebound to that estimator because the metrics cell below keeps
# calling `cv.predict`.
# getattr makes the cell safe to re-run: on a second run `cv` is already the bare
# estimator, which has no `best_estimator_` attribute, and is then used as-is.
best_dtree = getattr(cv, 'best_estimator_', cv)
cv = best_dtree

#Predicting the labels using the optimized hyperparameters
tr_labelspredict = best_dtree.predict(tr_features)


In [None]:
# Accuracy, precision, recall and F1 for the training and validation partitions.
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Evaluated in this order for both partitions.
# NOTE(review): precision/recall/F1 are called with their defaults, which score label 1
# as the positive class; label 1 was assigned above to rows with target > 25 - confirm
# that this is the class of interest.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

# Train scores: predictions on the reloaded training features against y_train.
acc_train, p_score_train, r_score_train, f1_score_train = [
    scorer(y_train, tr_labelspredict) for scorer in scorers
]

# Validation scores (the *_test variable names are kept from the original cell).
val_labelspredict = cv.predict(X_val)
acc_test, p_score_test, r_score_test, f1_score_test = [
    scorer(y_val, val_labelspredict) for scorer in scorers
]

# Assemble the report and print it in one go, one line per score.
summary_lines = [
    f'The Accuracy score for the training set is:{acc_train}',
    f'The Precision score for the training set is:{p_score_train}',
    f'The Recall score for the training set is:{r_score_train}',
    f'The F1 score for the training set is:{f1_score_train}',
    '--------------------------------------------------------------',
    f'The Accuracy score for the validation set is:{acc_test}',
    f'The Precision score for the validation set is:{p_score_test}',
    f'The Recall score for the validation set is:{r_score_test}',
    f'The F1 score for the validation set is:{f1_score_test}',
]
print('\n'.join(summary_lines))
# train_results = [acc_train,p_score_train,r_score_train,f1_score_train]
# test_results = [acc_test,p_score_test,r_score_test,f1_score_test]



The Accuracy score for the training set is:0.9718261355174982
The Precision score for the training set is:0.9829101972303819
The Recall score for the training set is:0.9852979839939425
The F1 score for the training set is:0.9841026422070386
--------------------------------------------------------------
The Accuracy score for the validation set is:0.9609025113853248
The Precision score for the validation set is:0.9766198922556055
The Recall score for the validation set is:0.9792680843795372
The F1 score for the validation set is:0.9779421955457948
