In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from scipy import signal
import sklearn as sk
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split # split into train/test dataset
from sklearn.preprocessing import StandardScaler   # for normalizing data
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Notebook-wide display settings: show every column of wide frames and
# render floats with six decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.6f}'.format)

In [None]:
# 1. Load the dataset from 'turbofan_data.csv' and report its dimensions.

original_pd = pd.read_csv('turbofan_data.csv')  # comma is read_csv's default separator
print(original_pd.shape)  # (number of rows, number of columns)

(20631, 35)


In [None]:
# Summary statistics (count/mean/std/quantiles) for the numeric columns of the raw frame;
# the count row exposes columns with missing values, the std row exposes constant columns.
original_pd.describe()

Unnamed: 0,time_(cycles),CA,CAUXA,CAUXB,EGT,GP_MA,HC_MA,HP_MA,P2,T2,GB_MA,GB_MB,WF,EPR,OS_MA,OS_MB,OS_MC,LP_MA,LPr_MA,STATIC_AUXA,AUXA_BYPASS,AUXB_BYPASS,AUXC_BYPASS,AUXD_BYPASS,EGT_RAW,EGT_RAWC,CAUXB_BF,VIBS,GS_VIBS,EGT_RAWC_SENSED,FAN_DECOUP,P25_BLEED,BDIS,RUL
count,20631.0,13761.0,13761.0,20631.0,0.0,20631.0,20631.0,0.0,20631.0,20631.0,19600.0,20631.0,19600.0,20631.0,10316.0,15474.0,20631.0,20631.0,18568.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,15474.0,20631.0,20631.0,19600.0,20631.0,20631.0,20631.0,20631.0
mean,108.807862,4.804737e-06,9e-06,2e-06,,9.995739,4.98836,,518.67,642.680934,1590.527158,1408.933782,14.62,21.609803,553.354699,2388.096229,9065.242941,1.3,47.540585,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,-4.987162,0.003649,2385.790737,0.0,100.0,-642.680934,107.807862
std,68.88099,6.828955e-06,0.002192,0.000293,,3.028799,0.996459,,0.0,0.500053,6.129913,9.000605,0.0,0.001389,0.88524,0.071287,22.08288,0.0,0.266889,0.737553,0.071919,19.076176,0.037505,1.3878120000000003e-17,1.548763,0.0,0.0,1.992161,1.004259,9.194869,0.0,0.0,0.500053,68.88099
min,1.0,0.0,-0.0087,-0.0006,,-3.669363,0.669693,,518.67,641.21,1571.04,1382.25,14.62,21.6,550.43,2387.9,9021.73,1.3,46.85,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,-13.65327,-3.885182,2356.56,0.0,100.0,-644.53,0.0
25%,52.0,4.9e-07,-0.0015,-0.0002,,7.960675,4.317342,,518.67,642.325,1586.28,1402.36,14.62,21.61,552.8,2388.04,9053.1,1.3,47.35,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,-6.319536,-0.675421,2379.42,0.0,100.0,-643.0,51.0
50%,104.0,2.25e-06,0.0,0.0,,10.002458,4.991867,,518.67,642.64,1590.1,1408.04,14.62,21.61,553.43,2388.09,9060.66,1.3,47.51,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,-4.97763,-0.001747,2385.15,0.0,100.0,-642.64,103.0
75%,156.0,6.25e-06,0.0015,0.0003,,12.008643,5.661834,,518.67,643.0,1594.39,1414.555,14.62,21.61,553.99,2388.14,9069.42,1.3,47.7,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,-3.649616,0.677656,2391.585,0.0,100.0,-642.325,155.0
max,362.0,7.569e-05,0.0087,0.0006,,22.291201,8.592897,,518.67,644.53,1616.91,1441.49,14.62,21.61,555.86,2388.56,9244.59,1.3,48.53,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,3.244829,4.60369,2425.365,0.0,100.0,-641.21,361.0


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
original_pd.info()

# The data consists of simulated sensor readings for 100 turbofan engines, each run until the engine has failed.
# ESN: engine identifier
# time_(cycles): time series
# RUL: remaining useful life

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ESN              20631 non-null  object 
 1   time_(cycles)    20631 non-null  int64  
 2   CA               13761 non-null  float64
 3   CAUXA            13761 non-null  float64
 4   CAUXB            20631 non-null  float64
 5   EGT              0 non-null      float64
 6   GP_MA            20631 non-null  float64
 7   HC_MA            20631 non-null  float64
 8   HP_MA            0 non-null      float64
 9   P2               20631 non-null  float64
 10  T2               20631 non-null  float64
 11  GB_MA            19600 non-null  float64
 12  GB_MB            20631 non-null  float64
 13  WF               19600 non-null  float64
 14  EPR              20631 non-null  float64
 15  OS_MA            10316 non-null  float64
 16  OS_MB            15474 non-null  float64
 17  OS_MC       

In [None]:
#2. PRUNE COLUMNS. Every column is dropped in a single call; the list is grouped by the
#   reason recorded for each column.
# NOTE(review): the earlier notes listed T2 among the large-valued columns to remove, but
# T2 is not dropped here and is used as a model feature later in the notebook - confirm
# that keeping it is intended.
remove_null = original_pd.drop(columns=[
    #2a. entirely null
    'EGT', 'HP_MA',
    #2b. constant
    'P2', 'WF', 'EPR', 'OS_MB', 'LP_MA', 'AUXD_BYPASS', 'EGT_RAWC', 'CAUXB_BF', 'FAN_DECOUP', 'P25_BLEED',
    #2c. nearly constant
    'AUXA_BYPASS', 'AUXC_BYPASS', 'CAUXB', 'LPr_MA',
    # too many NaN values: CA and CAUXA (13k/20k), OS_MA (10k/20k)
    'CA', 'CAUXA', 'OS_MA',
    # BDIS is the negative of column T2
    'BDIS',
    # large values (strong bias); EGT_RAWC_SENSED is dropped together with this group
    'GB_MA', 'GB_MB', 'OS_MC', 'STATIC_AUXA', 'AUXB_BYPASS', 'EGT_RAWC_SENSED',
])
remove_null

Unnamed: 0,ESN,time_(cycles),GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,RUL
0,A1,1,18.964527,5.546084,641.82,392,-5.510178,1.248924,191
1,A1,2,6.539379,2.433800,642.15,392,-5.021747,-0.064045,190
2,A1,3,7.556979,5.300895,642.35,390,-7.020227,-0.975987,189
3,A1,4,7.813116,7.271472,642.35,392,-4.568730,0.824121,188
4,A1,5,14.005445,7.362966,642.37,393,-5.647532,-2.192124,187
...,...,...,...,...,...,...,...,...,...
20626,A100,196,16.296726,6.020237,643.49,397,-4.738461,-1.480118,4
20627,A100,197,9.841553,4.580266,643.54,395,-9.059682,0.159304,3
20628,A100,198,9.478432,3.475012,643.42,398,-5.745223,-1.535532,2
20629,A100,199,3.656952,5.119590,643.23,395,-4.309061,-1.102593,1


In [None]:
groupby_ESN = remove_null.groupby('ESN')
#groupby_ESN.count()
#3.a   ADD DATE column starting from 1-Jan-2022  FOR EACH ESN (A1, A2... A99)
# Each engine gets one consecutive calendar day per row, in row order.
# assign() returns a new frame per group instead of writing into the groupby slice
# (which triggers SettingWithCopyWarning), and the groups are concatenated once
# rather than re-copying the accumulated frame on every iteration.
dated_groups = [
    group.assign(date=pd.date_range(start='1-Jan-2022', periods=len(group), freq='D'))
    for _, group in groupby_ESN
]
# Groups arrive in sorted ESN-key order; ignore_index gives a fresh 0..n-1 index.
newdf_withDate = pd.concat(dated_groups, ignore_index=True)

#3.b DROP column time_(cycles)
newdf_withDate = newdf_withDate.drop(['time_(cycles)'], axis=1)

In [None]:
# Display the frame after adding the per-engine 'date' column and dropping 'time_(cycles)'.
newdf_withDate

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,RUL,date
0,A1,18.964527,5.546084,641.82,392,-5.510178,1.248924,191,2022-01-01
1,A1,6.539379,2.433800,642.15,392,-5.021747,-0.064045,190,2022-01-02
2,A1,7.556979,5.300895,642.35,390,-7.020227,-0.975987,189,2022-01-03
3,A1,7.813116,7.271472,642.35,392,-4.568730,0.824121,188,2022-01-04
4,A1,14.005445,7.362966,642.37,393,-5.647532,-2.192124,187,2022-01-05
...,...,...,...,...,...,...,...,...,...
20626,A99,14.028679,2.960841,643.72,396,-5.971421,-2.414999,4,2022-06-30
20627,A99,8.804084,5.372009,643.52,393,-4.504541,1.220778,3,2022-07-01
20628,A99,5.194769,6.849628,643.96,395,-2.979502,-0.534871,2,2022-07-02
20629,A99,13.120857,3.320732,644.10,395,-6.744627,0.959087,1,2022-07-03


In [None]:
#4 ENGINE FAILED
# Take each engine's final cycle (i.e. RUL == 0), keep only its ESN and date,
# and order the failures chronologically with a fresh 0..n-1 index.
# 100 engines (index 0-99) ==> cutoff date at position 33
finalCycle = (
    newdf_withDate.loc[newdf_withDate['RUL'].eq(0), ['ESN', 'date']]
    .sort_values(by=['date'])
    .reset_index(drop=True)
)

In [None]:
# Display the failure dates (one row per engine), sorted by date.
finalCycle

Unnamed: 0,ESN,date
0,A39,2022-05-08
1,A91,2022-05-15
2,A57,2022-05-17
3,A70,2022-05-17
4,A24,2022-05-27
...,...,...
95,A83,2022-10-20
96,A67,2022-11-09
97,A96,2022-12-02
98,A92,2022-12-07


In [None]:
#CUTOFF DATE
# The cutoff is the failure date stored at row position 33 of the date-sorted failure list.
cutoff_date = finalCycle.iloc[33, 1]
on_or_after_cutoff = finalCycle['date'] >= cutoff_date
before_cutoff = finalCycle['date'] < cutoff_date
print(cutoff_date)
print("failed date beyond cutoff date: ", int(on_or_after_cutoff.sum()))
print("failed date before cutoff date:", int(before_cutoff.sum()))

2022-07-07 00:00:00
failed date beyond cutoff date:  67
failed date before cutoff date: 33


In [None]:
### list of bad cohort
# .loc slicing is label-based and inclusive, so labels 0 through 33 select 34 rows.
# NOTE(review): the engine at label 33 fails exactly on the cutoff date, so this list has
# one more engine than the count of failures strictly before the cutoff - confirm that
# including it is intended.
bad_cohort_rows = finalCycle.loc[:33]
print(bad_cohort_rows)
bad_cohort_rows.to_csv('bad_cohort.csv', index=False)

    ESN       date
0   A39 2022-05-08
1   A91 2022-05-15
2   A57 2022-05-17
3   A70 2022-05-17
4   A24 2022-05-27
5   A58 2022-05-27
6    A8 2022-05-30
7   A65 2022-06-02
8   A77 2022-06-03
9   A90 2022-06-03
10  A93 2022-06-04
11  A98 2022-06-05
12  A27 2022-06-05
13  A36 2022-06-07
14  A19 2022-06-07
15  A45 2022-06-07
16  A13 2022-06-12
17  A29 2022-06-12
18  A28 2022-06-14
19  A74 2022-06-15
20  A23 2022-06-17
21  A12 2022-06-19
22  A37 2022-06-19
23  A60 2022-06-21
24  A63 2022-06-23
25  A87 2022-06-27
26   A3 2022-06-28
27  A62 2022-06-29
28  A14 2022-06-29
29  A35 2022-06-30
30  A99 2022-07-04
31  A61 2022-07-04
32  A80 2022-07-04
33  A40 2022-07-07


In [None]:
## Filter by cutoff date
# Keep only the rows dated strictly before the cutoff (the failure date at position 33).
rows_before_cutoff = newdf_withDate['date'].lt(finalCycle.iloc[33, 1])
filtered_data = newdf_withDate.loc[rows_before_cutoff]
filtered_data

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,RUL,date
0,A1,18.964527,5.546084,641.82,392,-5.510178,1.248924,191,2022-01-01
1,A1,6.539379,2.433800,642.15,392,-5.021747,-0.064045,190,2022-01-02
2,A1,7.556979,5.300895,642.35,390,-7.020227,-0.975987,189,2022-01-03
3,A1,7.813116,7.271472,642.35,392,-4.568730,0.824121,188,2022-01-04
4,A1,14.005445,7.362966,642.37,393,-5.647532,-2.192124,187,2022-01-05
...,...,...,...,...,...,...,...,...,...
20626,A99,14.028679,2.960841,643.72,396,-5.971421,-2.414999,4,2022-06-30
20627,A99,8.804084,5.372009,643.52,393,-4.504541,1.220778,3,2022-07-01
20628,A99,5.194769,6.849628,643.96,395,-2.979502,-0.534871,2,2022-07-02
20629,A99,13.120857,3.320732,644.10,395,-6.744627,0.959087,1,2022-07-03


In [None]:
#5 FILL MISSING VALUES by linear interpolation within each engine's series
# An isolated missing reading becomes the mean of its two neighbours, as before.
# Fixes relative to the previous cell-by-cell loop:
#   * a gap in an engine's first row no longer averages with that engine's LAST row
#     (index r-1 == -1 wrapped around);
#   * the last row of each engine is now filled too (it was never inspected);
#   * runs of consecutive gaps are filled instead of staying NaN;
#   * only numeric columns are touched (the string ESN column is left alone);
#   * the work is vectorised instead of one .iloc call per cell.
group_fd = filtered_data.groupby('ESN')
engine_frames = [group for _, group in group_fd]
# Rebuild the frame engine by engine (sorted ESN order) with a fresh 0..n-1 index;
# an empty input simply yields an empty copy.
fillna_fd = (
    pd.concat(engine_frames, ignore_index=True)
    if engine_frames
    else filtered_data.copy()
)

numeric_cols = fillna_fd.select_dtypes(include='number').columns
# Restrict the work to columns that actually contain gaps so untouched columns keep
# their dtype (e.g. integer columns stay integer).
cols_with_nan = [col for col in numeric_cols if fillna_fd[col].isna().any()]
if cols_with_nan:
    # limit_direction='both' also fills gaps at the start/end of a series.
    fillna_fd[cols_with_nan] = fillna_fd.groupby('ESN')[cols_with_nan].transform(
        lambda series: series.interpolate(method='linear', limit_direction='both')
    )

In [None]:
# Check column dtypes and non-null counts after the fill step.
fillna_fd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17876 entries, 0 to 17875
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   ESN      17876 non-null  object        
 1   GP_MA    17876 non-null  float64       
 2   HC_MA    17876 non-null  float64       
 3   T2       17876 non-null  float64       
 4   EGT_RAW  17876 non-null  int64         
 5   VIBS     17876 non-null  float64       
 6   GS_VIBS  17876 non-null  float64       
 7   RUL      17876 non-null  int64         
 8   date     17876 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(2), object(1)
memory usage: 1.2+ MB


In [None]:
# Persist the cutoff-filtered, gap-filled data set as CSV (row index omitted).
fillna_fd.to_csv(path_or_buf='early_failed_engines.csv', index=False)

In [None]:
# Display the cutoff-filtered frame after the missing-value fill.
fillna_fd

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,RUL,date
0,A1,18.964527,5.546084,641.82,392,-5.510178,1.248924,191,2022-01-01
1,A1,6.539379,2.433800,642.15,392,-5.021747,-0.064045,190,2022-01-02
2,A1,7.556979,5.300895,642.35,390,-7.020227,-0.975987,189,2022-01-03
3,A1,7.813116,7.271472,642.35,392,-4.568730,0.824121,188,2022-01-04
4,A1,14.005445,7.362966,642.37,393,-5.647532,-2.192124,187,2022-01-05
...,...,...,...,...,...,...,...,...,...
17871,A99,14.028679,2.960841,643.72,396,-5.971421,-2.414999,4,2022-06-30
17872,A99,8.804084,5.372009,643.52,393,-4.504541,1.220778,3,2022-07-01
17873,A99,5.194769,6.849628,643.96,395,-2.979502,-0.534871,2,2022-07-02
17874,A99,13.120857,3.320732,644.10,395,-6.744627,0.959087,1,2022-07-03


In [None]:
#6.a SavgolTransformer class

class SavgolTransformer:
    """Apply a Savitzky-Golay filter to sensor columns, one device at a time.

    Parameters
    ----------
    device_col : str
        Column that identifies the device; rows are filtered per group of this column.
    window_length : int
        Passed to ``scipy.signal.savgol_filter`` as ``window_length``.
    polyorder : int
        Passed to ``scipy.signal.savgol_filter`` as ``polyorder``.
    deriv : int
        Passed to ``scipy.signal.savgol_filter`` as ``deriv`` (0 = smoothing only).
    columns : iterable of str, optional
        Columns to filter. Defaults to ``DEFAULT_COLUMNS``, the six sensor columns
        this transformer has always processed.
    """

    # Sensor columns filtered when no explicit ``columns`` argument is given.
    DEFAULT_COLUMNS = ('GP_MA', 'HC_MA', 'T2', 'EGT_RAW', 'VIBS', 'GS_VIBS')

    def __init__(self, device_col, window_length, polyorder, deriv, columns=None):
        self.device_col = device_col
        self.window_length = window_length
        self.polyorder = polyorder
        self.deriv = deriv
        self.columns = list(self.DEFAULT_COLUMNS if columns is None else columns)

    def transform(self, data):
        """Return a new frame in which every configured column is filtered per device.

        The input frame is not modified. Groups are emitted in sorted
        ``device_col`` order and the result has a fresh 0..n-1 index. Columns not
        listed in ``self.columns`` are copied through unchanged.

        Returns ``None`` when ``data`` contains no groups (kept for backward
        compatibility). A missing column raises ``KeyError``; errors raised by
        ``savgol_filter`` (for example a window longer than a device's series
        under its default mode) propagate to the caller.
        """
        smoothed_groups = []
        for _, group in data.groupby(self.device_col):
            # Work on an explicit copy so the groupby slice (and the caller's
            # frame) is never written to.
            smoothed = group.copy()
            for col in self.columns:
                smoothed[col] = signal.savgol_filter(
                    group[col],
                    window_length=self.window_length,
                    polyorder=self.polyorder,
                    deriv=self.deriv,
                )
            smoothed_groups.append(smoothed)

        if not smoothed_groups:
            return None
        # One concat at the end instead of re-copying the accumulated frame per group.
        return pd.concat(smoothed_groups, ignore_index=True)


In [None]:

#6.b ########### SMOOTHED DATA using SavgolTransformer ###########
# w = filter window length, p = polynomial order, d = derivative order (0 = smoothing only)
w, p, d = 11, 3, 0
savgol = SavgolTransformer(device_col="ESN", window_length=w, polyorder=p, deriv=d)
smoothed_data = savgol.transform(fillna_fd)

In [None]:
# Display the per-engine Savitzky-Golay filtered sensor readings.
smoothed_data

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,RUL,date
0,A1,14.611773,4.105921,641.841748,391.853147,-5.189522,1.392281,191,2022-01-01
1,A1,12.066319,5.150143,642.118182,391.650350,-5.876964,-0.048441,190,2022-01-02
2,A1,10.258126,5.665233,642.291632,391.543124,-5.969602,-0.758252,189,2022-01-03
3,A1,9.092357,5.777160,642.379732,391.516317,-5.642537,-0.907572,188,2022-01-04
4,A1,8.474177,5.611893,642.400117,391.554779,-5.070869,-0.666822,187,2022-01-05
...,...,...,...,...,...,...,...,...,...
17871,A99,9.311203,5.020239,643.596131,395.044289,-5.377554,-0.410617,4,2022-06-30
17872,A99,9.593628,4.720380,643.707494,394.712121,-5.560216,-0.580740,3,2022-07-01
17873,A99,10.051803,4.559376,643.828298,394.745921,-5.521839,-0.647577,2,2022-07-02
17874,A99,10.615179,4.644386,643.945944,395.321678,-5.179142,-0.553062,1,2022-07-03


In [None]:
########### MODELLING ##############
# Collapse each engine to ONE row: the mean of every sensor over that engine's
# last 15 smoothed rows. A grouped aggregation replaces the row-by-row
# DataFrame/concat loop (which re-copied the result for each engine, recomputed
# tail(15) seven times per engine, and read the ESN through a positional iloc[0, 0]).
feature_cols = ['GP_MA', 'HC_MA', 'T2', 'EGT_RAW', 'VIBS', 'GS_VIBS']
tail_rows = 15  # number of most recent rows averaged per engine

group_smd = smoothed_data.groupby('ESN')
flatten_df = (
    group_smd.tail(tail_rows)          # last rows of every engine, ESN column included
    .groupby('ESN')[feature_cols]
    .mean()
    .reset_index()                     # ESN back as the first column, index 0..n-1
)

flatten_df

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS
0,A1,9.398420,5.215427,643.449687,395.326185,-5.047159,0.223099
1,A10,9.968779,4.950099,642.927566,393.848640,-5.754085,-0.014239
2,A100,10.402842,5.136344,643.318535,395.165890,-4.537086,-0.197009
3,A11,10.353182,4.989834,642.677054,392.976612,-4.583134,-0.240011
4,A12,10.052149,4.946085,643.483424,395.899845,-4.704504,0.337742
...,...,...,...,...,...,...,...
95,A95,10.645547,5.150790,642.458615,392.526573,-4.610949,0.415744
96,A96,10.242077,4.749121,642.510954,393.098834,-5.304006,-0.457960
97,A97,10.699720,5.388567,643.142295,394.877545,-5.834301,-0.213046
98,A98,9.738681,4.995687,643.473796,395.125719,-5.403325,0.175874


In [None]:
# ADD 'bad_cohort' LABEL
# 1 when the engine's ESN is among the failure-list rows labelled 0..33 (inclusive), else 0.
bad_cohort_esns = finalCycle.loc[0:33, 'ESN']
flatten_df['bad_cohort'] = flatten_df['ESN'].isin(bad_cohort_esns).astype('int64')
flatten_df

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,bad_cohort
0,A1,9.398420,5.215427,643.449687,395.326185,-5.047159,0.223099,0
1,A10,9.968779,4.950099,642.927566,393.848640,-5.754085,-0.014239,0
2,A100,10.402842,5.136344,643.318535,395.165890,-4.537086,-0.197009,0
3,A11,10.353182,4.989834,642.677054,392.976612,-4.583134,-0.240011,0
4,A12,10.052149,4.946085,643.483424,395.899845,-4.704504,0.337742,1
...,...,...,...,...,...,...,...,...
95,A95,10.645547,5.150790,642.458615,392.526573,-4.610949,0.415744,0
96,A96,10.242077,4.749121,642.510954,393.098834,-5.304006,-0.457960,0
97,A97,10.699720,5.388567,643.142295,394.877545,-5.834301,-0.213046,0
98,A98,9.738681,4.995687,643.473796,395.125719,-5.403325,0.175874,1


In [None]:
# We use the per-engine mean features (100 engines) to predict failure status (bad_cohort): 1 ==> Failed, 0 ==> Running
# This is a BINARY CLASSIFICATION problem. The most common methods of binary classification are:
#Support Vector Machines
#Naive Bayes
#Nearest Neighbor
#Decision Trees
#Logistic Regression
#Neural Networks
#################################################
import sklearn as sk
from sklearn.neural_network import MLPClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split # split into train/test dataset
from sklearn.preprocessing import StandardScaler   # for normalizing data
from sklearn.preprocessing import MinMaxScaler # for normalizing data
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
import warnings

# Feature matrix (one row per engine) and binary target.
X = flatten_df[['GP_MA','HC_MA','T2', 'EGT_RAW','VIBS','GS_VIBS']]
y = flatten_df['bad_cohort']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.20, random_state=0)


# Fit the scaler on the training split ONLY and reuse it for the test split.
# Fitting a second scaler on the test rows maps them with different min/max values
# than the ones the model was trained on, and leaks test statistics into evaluation.
#ss_train = StandardScaler()
ss_train = MinMaxScaler()
X_train = ss_train.fit_transform(X_train)
X_test = ss_train.transform(X_test)

models,accuracy, precision, recall ,cm = {}, {}, {} ,{},{}


######## params
learning_rate= 0.08
alpha =0.5
max_iter = 2000
random_state = 1

# One MLPClassifier configuration per entry: plain SGD, SGD with Nesterov momentum,
# L-BFGS and Adam.
params = [
    {
        "solver": "sgd",
        "learning_rate": "constant",
        "momentum": 0,
        "learning_rate_init": learning_rate,
    },
    {
        "solver": "sgd",
        "learning_rate": "constant",
        "momentum": 0.9,
        "nesterovs_momentum": True,
        "learning_rate_init": learning_rate,
    },

    {"solver": "lbfgs","learning_rate_init": learning_rate},
    {"solver": "adam", "learning_rate_init": learning_rate}
]

# Models are keyed 'MLPClassifier 1' .. 'MLPClassifier 4', in the order of `params`.
for position, solver_params in enumerate(params, start=1):
    models['MLPClassifier {}'.format(position)] = MLPClassifier(
        random_state=random_state, alpha=alpha, max_iter=max_iter, **solver_params
    )

for key, model in models.items():
    with warnings.catch_warnings():
        # Silence only scikit-learn's convergence warnings while fitting.
        warnings.filterwarnings(
                "ignore", category=ConvergenceWarning, module="sklearn"
            )
        # Fit the classifier
        model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate metrics. scikit-learn's signature is (y_true, y_pred); passing the
    # arguments the other way round swaps precision and recall.
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)
    cm[key] = confusion_matrix(y_test, predictions)

result = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
result['Accuracy'] = accuracy.values()
result['Precision'] = precision.values()
result['Recall'] = recall.values()
result['Confusion_Matrix']= cm.values()
result

Unnamed: 0,Accuracy,Precision,Recall,Confusion_Matrix
MLPClassifier 1,0.85,0.7,1.0,"[[10, 0], [3, 7]]"
MLPClassifier 2,0.85,0.7,1.0,"[[10, 0], [3, 7]]"
MLPClassifier 3,0.9,0.8,1.0,"[[10, 0], [2, 8]]"
MLPClassifier 4,0.85,0.7,1.0,"[[10, 0], [3, 7]]"


In [None]:
# Display the per-engine feature table together with its bad_cohort label.
flatten_df

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,bad_cohort
0,A1,9.40,5.22,643.45,395.33,-5.05,0.22,0
1,A10,9.97,4.95,642.93,393.85,-5.75,-0.01,0
2,A100,10.40,5.14,643.32,395.17,-4.54,-0.20,0
3,A11,10.35,4.99,642.68,392.98,-4.58,-0.24,0
4,A12,10.05,4.95,643.48,395.90,-4.70,0.34,1
...,...,...,...,...,...,...,...,...
95,A95,10.65,5.15,642.46,392.53,-4.61,0.42,0
96,A96,10.24,4.75,642.51,393.10,-5.30,-0.46,0
97,A97,10.70,5.39,643.14,394.88,-5.83,-0.21,0
98,A98,9.74,5.00,643.47,395.13,-5.40,0.18,1


In [None]:
# Scale all engines with the scaler that was fitted on the training split, so the
# model receives features on the same scale it was trained on. Fitting a fresh
# MinMaxScaler on every row would shift/stretch the inputs relative to training.
ss_rank = ss_train
X_ranking = ss_rank.transform(X)
#X_ranking

In [None]:
# Class probabilities for every engine from 'MLPClassifier 3' (the lbfgs-solver model).
# predict_proba returns one row per engine and one column per class.
y_ranking = models['MLPClassifier 3'].predict_proba(X_ranking)
y_ranking

array([[5.86423003e-01, 4.13576997e-01],
       [9.95798531e-01, 4.20146927e-03],
       [9.07596782e-01, 9.24032181e-02],
       [9.99703784e-01, 2.96216310e-04],
       [3.87771896e-01, 6.12228104e-01],
       [1.33742608e-01, 8.66257392e-01],
       [1.76902195e-01, 8.23097805e-01],
       [9.80904650e-01, 1.90953504e-02],
       [7.88554534e-01, 2.11445466e-01],
       [9.99778467e-01, 2.21532544e-04],
       [9.64717793e-01, 3.52822071e-02],
       [3.32946017e-01, 6.67053983e-01],
       [9.99866454e-01, 1.33545917e-04],
       [9.55027992e-01, 4.49720079e-02],
       [8.20205382e-01, 1.79794618e-01],
       [8.75441279e-01, 1.24558721e-01],
       [3.19944040e-01, 6.80055960e-01],
       [2.37983932e-01, 7.62016068e-01],
       [9.83603398e-01, 1.63966017e-02],
       [9.02535914e-01, 9.74640860e-02],
       [3.15775370e-01, 6.84224630e-01],
       [7.53373853e-01, 2.46626147e-01],
       [4.29386115e-01, 5.70613885e-01],
       [5.64474139e-01, 4.35525861e-01],
       [9.419964

In [None]:
# Ranking score = first predict_proba column, as plain Python floats.
# NOTE(review): with 0/1 labels the first column should be the probability of class 0
# (engine NOT in the bad cohort) - confirm against the model's classes_ ordering.
belong_to = [float(class_probs[0]) for class_probs in y_ranking]

In [None]:

# Attach the score, order engines from highest to lowest score, and number them 1..n.
# NOTE: `belong_to` is aligned with the row order flatten_df has BEFORE this cell runs;
# re-running this cell on its own would pair the scores with already-sorted rows.
flatten_df = (
    flatten_df.assign(ranking_score=belong_to)
    .sort_values(by=['ranking_score'], ascending=False)
)
flatten_df['ranking'] = range(1, len(flatten_df) + 1)
flatten_df = flatten_df.reset_index(drop=True)
flatten_df

Unnamed: 0,ESN,GP_MA,HC_MA,T2,EGT_RAW,VIBS,GS_VIBS,bad_cohort,ranking,ranking_score
0,A31,10.087254,5.437276,642.791844,393.583605,-5.458582,-0.121323,0,1,0.999956
1,A34,9.556444,4.934647,643.345110,395.816395,-4.968261,0.039558,0,2,0.999900
2,A16,9.884988,4.887053,643.350579,395.130614,-4.706971,-0.174665,0,3,0.999897
3,A38,9.957900,4.953948,643.370853,395.656954,-5.343755,0.056107,0,4,0.999892
4,A14,9.344522,4.923385,643.563557,396.165268,-4.937784,0.021443,1,5,0.999866
...,...,...,...,...,...,...,...,...,...,...
95,A77,10.455021,5.011968,643.409371,395.606915,-5.549462,-0.036029,1,96,0.133743
96,A11,10.353182,4.989834,642.677054,392.976612,-4.583134,-0.240011,0,97,0.128484
97,A20,9.141868,4.539128,643.037793,394.806838,-5.237257,-0.425886,0,98,0.096285
98,A93,9.845315,4.848984,643.537381,395.609946,-5.197624,0.298597,1,99,0.076471


In [None]:
# Export the engine ranking (identifier, score, rank) as CSV without the row index.
flatten_df.to_csv('ranking.csv', columns=['ESN', 'ranking_score', 'ranking'], index=False)

# New Section