In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import sklearn
from sklearn.metrics import mean_squared_error, r2_score
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
import warnings
# Seed NumPy's legacy global random generator so NumPy-based randomness repeats across runs.
np.random.seed(34)
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')



# Load the FD003 train, test, and RUL files

In [4]:
# Column labels: two identifier columns, three operational settings,
# and 21 sensor channels (26 labels in total, in this order).
index_names = ['engine', 'cycle']
setting_names = [f'setting_{number}' for number in (1, 2, 3)]
sensor_names = [
    "(Fan inlet temperature) (◦R)",
    "(LPC outlet temperature) (◦R)",
    "(HPC outlet temperature) (◦R)",
    "(LPT outlet temperature) (◦R)",
    "(Fan inlet Pressure) (psia)",
    "(bypass-duct pressure) (psia)",
    "(HPC outlet pressure) (psia)",
    "(Physical fan speed) (rpm)",
    "(Physical core speed) (rpm)",
    "(Engine pressure ratio(P50/P2)",
    "(HPC outlet Static pressure) (psia)",
    "(Ratio of fuel flow to Ps30) (pps/psia)",
    "(Corrected fan speed) (rpm)",
    "(Corrected core speed) (rpm)",
    "(Bypass Ratio) ",
    "(Burner fuel-air ratio)",
    "(Bleed Enthalpy)",
    "(Required fan speed)",
    "(Required fan conversion speed)",
    "(High-pressure turbines Cool air flow)",
    "(Low-pressure turbines Cool air flow)",
]
col_names = [*index_names, *setting_names, *sensor_names]


# Directory that holds the input text files (a Kaggle input path); change it here to run elsewhere.
DATA_DIR = '/kaggle/input/nasa-cmaps/CMaps'

# The files have no header row and are split on runs of whitespace. The separator
# is written as a raw string because '\s' inside a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
df_train = pd.read_csv(f'{DATA_DIR}/train_FD003.txt', sep=r'\s+', header=None, names=col_names)
df_test = pd.read_csv(f'{DATA_DIR}/test_FD003.txt', sep=r'\s+', header=None, names=col_names)
# The RUL file is read into a single column named 'RUL'.
df_test_RUL = pd.read_csv(f'{DATA_DIR}/RUL_FD003.txt', sep=r'\s+', header=None, names=['RUL'])

In [5]:
# Preview the first five rows of the training data.
df_train.head()

Unnamed: 0,engine,cycle,setting_1,setting_2,setting_3,(Fan inlet temperature) (◦R),(LPC outlet temperature) (◦R),(HPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(Fan inlet Pressure) (psia),...,(Ratio of fuel flow to Ps30) (pps/psia),(Corrected fan speed) (rpm),(Corrected core speed) (rpm),(Bypass Ratio),(Burner fuel-air ratio),(Bleed Enthalpy),(Required fan speed),(Required fan conversion speed),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow)
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.8,8.3869,0.03,392,2388,100.0,39.14,23.4583


In [6]:
# Preview the first five rows of the test data.
df_test.head()

Unnamed: 0,engine,cycle,setting_1,setting_2,setting_3,(Fan inlet temperature) (◦R),(LPC outlet temperature) (◦R),(HPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(Fan inlet Pressure) (psia),...,(Ratio of fuel flow to Ps30) (pps/psia),(Corrected fan speed) (rpm),(Corrected core speed) (rpm),(Bypass Ratio),(Burner fuel-air ratio),(Bleed Enthalpy),(Required fan speed),(Required fan conversion speed),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow)
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,521.89,2387.94,8133.48,8.376,0.03,391,2388,100.0,39.07,23.4468
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.9,14.62,...,521.85,2388.01,8137.44,8.4062,0.03,391,2388,100.0,39.04,23.4807
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,522.1,2387.94,8138.25,8.3553,0.03,391,2388,100.0,39.1,23.4244
3,1,4,0.0027,0.0001,100.0,518.67,642.2,1584.53,1395.34,14.62,...,522.45,2387.96,8137.07,8.3709,0.03,392,2388,100.0,38.97,23.4782
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,521.91,2387.97,8134.2,8.4146,0.03,391,2388,100.0,39.09,23.395


In [7]:
# Preview the first five values of the 'RUL' column read from the RUL file.
df_test_RUL.head()

Unnamed: 0,RUL
0,44
1,51
2,27
3,120
4,101


In [8]:
# Print column dtypes and non-null counts for the training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24720 entries, 0 to 24719
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   engine                                   24720 non-null  int64  
 1   cycle                                    24720 non-null  int64  
 2   setting_1                                24720 non-null  float64
 3   setting_2                                24720 non-null  float64
 4   setting_3                                24720 non-null  float64
 5   (Fan inlet temperature) (◦R)             24720 non-null  float64
 6   (LPC outlet temperature) (◦R)            24720 non-null  float64
 7   (HPC outlet temperature) (◦R)            24720 non-null  float64
 8   (LPT outlet temperature) (◦R)            24720 non-null  float64
 9   (Fan inlet Pressure) (psia)              24720 non-null  float64
 10  (bypass-duct pressure) (psia)            24720

In [9]:
# Summary statistics for every column, transposed so that each column becomes a row.
df_train.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine,24720.0,48.631877,29.34899,1.0,23.0,47.0,74.0,100.0
cycle,24720.0,139.077063,98.84668,1.0,62.0,124.0,191.0,525.0
setting_1,24720.0,-2.4e-05,0.002193545,-0.0086,-0.0015,-0.0,0.0015,0.0086
setting_2,24720.0,5e-06,0.0002940429,-0.0006,-0.0002,-0.0,0.0003,0.0007
setting_3,24720.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
(Fan inlet temperature) (◦R),24720.0,518.67,6.684921e-11,518.67,518.67,518.67,518.67,518.67
(LPC outlet temperature) (◦R),24720.0,642.457858,0.5230311,640.84,642.08,642.4,642.79,645.11
(HPC outlet temperature) (◦R),24720.0,1588.079175,6.810418,1564.3,1583.28,1587.52,1592.4125,1615.39
(LPT outlet temperature) (◦R),24720.0,1404.471212,9.773178,1377.06,1397.1875,1402.91,1410.6,1441.16
(Fan inlet Pressure) (psia),24720.0,14.62,3.602525e-12,14.62,14.62,14.62,14.62,14.62


In [10]:
# A column with a single distinct value in the training data has no variance
# to learn from; find those columns once, on the training frame.
constant_cols = [name for name, n_unique in df_train.nunique().items() if n_unique == 1]

print("Columns with constant values:", constant_cols)

# Remove the same columns from both frames so their layouts stay aligned;
# errors='ignore' skips any listed column that a frame does not contain.
for frame in (df_train, df_test):
    frame.drop(columns=constant_cols, inplace=True, errors='ignore')


Columns with constant values: ['setting_3', '(Fan inlet temperature) (◦R)', '(Fan inlet Pressure) (psia)', '(Burner fuel-air ratio)', '(Required fan speed)', '(Required fan conversion speed)']


In [11]:
# Largest 'cycle' value per engine, kept as a one-column frame named 'life'
# and indexed by engine.
df_train_RUL = df_train.groupby('engine')['cycle'].max().to_frame(name='life')
df_train_RUL.head()

Unnamed: 0_level_0,life
engine,Unnamed: 1_level_1
1,259
2,253
3,222
4,272
5,213


In [12]:
# Left-join each engine's 'life' value onto every one of that engine's rows.
df_train = pd.merge(df_train, df_train_RUL, on='engine', how='left')

In [13]:
# RUL = the engine's last recorded cycle minus the current cycle, capped at 125.
cycles_left = df_train['life'] - df_train['cycle']
df_train['RUL'] = cycles_left.clip(upper=125)

# 'life' was only a helper for the subtraction; dropping it leaves 'RUL' as the last column.
df_train = df_train.drop(columns=['life'])
df_train.head()



Unnamed: 0,engine,cycle,setting_1,setting_2,(LPC outlet temperature) (◦R),(HPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(bypass-duct pressure) (psia),(HPC outlet pressure) (psia),(Physical fan speed) (rpm),...,(Engine pressure ratio(P50/P2),(HPC outlet Static pressure) (psia),(Ratio of fuel flow to Ps30) (pps/psia),(Corrected fan speed) (rpm),(Corrected core speed) (rpm),(Bypass Ratio),(Bleed Enthalpy),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow),RUL
0,1,1,-0.0005,0.0004,642.36,1583.23,1396.84,21.61,553.97,2387.96,...,1.3,47.3,522.31,2388.01,8145.32,8.4246,391,39.11,23.3537,125
1,1,2,0.0008,-0.0003,642.5,1584.69,1396.89,21.61,554.55,2388.0,...,1.3,47.23,522.42,2388.03,8152.85,8.4403,392,38.99,23.4491,125
2,1,3,-0.0014,-0.0002,642.18,1582.35,1405.61,21.61,554.43,2388.03,...,1.3,47.22,522.03,2388.0,8150.17,8.3901,391,38.85,23.3669,125
3,1,4,-0.002,0.0001,642.92,1585.61,1392.27,21.61,555.21,2388.0,...,1.3,47.24,522.49,2388.08,8146.56,8.3878,392,38.96,23.2951,125
4,1,5,0.0016,0.0,641.68,1588.63,1397.65,21.61,554.74,2388.04,...,1.3,47.15,522.58,2388.03,8147.8,8.3869,392,39.14,23.4583,125


In [14]:
Selected_Features = []
import statsmodels.api as sm

def backward_regression(X, y, initial_list=None, threshold_out=0.05, verbose=True):
    """Select features by backward stepwise elimination on OLS p-values.

    Starting from every column of ``X``, an ordinary-least-squares model is
    fitted and the feature with the largest p-value is removed. This repeats
    until no remaining feature has a p-value above ``threshold_out``.

    Args:
        X -- DataFrame of candidate features
        y -- target variable
        initial_list -- unused; kept only so existing calls keep working
        threshold_out -- a feature is dropped while its p-value exceeds this
        verbose -- true to print each dropped feature with its p-value

    Returns:
        list of the selected feature names. The same list is also appended to
        the module-level ``Selected_Features`` list.
    """
    included = list(X.columns)
    while included:
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        # Remove the intercept by label rather than by position: add_constant
        # does not add a 'const' column when the data already contains a
        # constant column, and slicing off row 0 would then hide a real feature.
        pvalues = model.pvalues.drop(labels='const', errors='ignore')
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        # A NaN comparison is False, so the loop also stops when no p-value is available.
        if not worst_pval > threshold_out:
            break
        worst_feature = pvalues.idxmax()
        included.remove(worst_feature)
        if verbose:
            print(f"worst_feature : {worst_feature}, {worst_pval} ")
    Selected_Features.append(included)
    # Report this call's selection, not the first entry ever stored.
    print(f"\nSelected Features:\n{included}")
    return included


# Application of the backward regression function on our training data
# Candidate features are all columns except the first ('engine') and the last
# ('RUL'); the target is the 'RUL' column.
y = df_train['RUL']
X = df_train.drop(columns=['engine', 'RUL'])
backward_regression(X, y)

worst_feature : setting_2, 0.8386868453910624 
worst_feature : setting_1, 0.24781866742837771 
worst_feature : (HPC outlet pressure) (psia), 0.12274669908471379 

Selected Features:
['cycle', '(LPC outlet temperature) (◦R)', '(HPC outlet temperature) (◦R)', '(LPT outlet temperature) (◦R)', '(bypass-duct pressure) (psia)', '(Physical fan speed) (rpm)', '(Physical core speed) (rpm)', '(Engine pressure ratio(P50/P2)', '(HPC outlet Static pressure) (psia)', '(Ratio of fuel flow to Ps30) (pps/psia)', '(Corrected fan speed) (rpm)', '(Corrected core speed) (rpm)', '(Bypass Ratio) ', '(Bleed Enthalpy)', '(High-pressure turbines Cool air flow)', '(Low-pressure turbines Cool air flow)']


In [15]:
# Show the stored selections: a list holding one feature list per backward_regression call.
Selected_Features

[['cycle',
  '(LPC outlet temperature) (◦R)',
  '(HPC outlet temperature) (◦R)',
  '(LPT outlet temperature) (◦R)',
  '(bypass-duct pressure) (psia)',
  '(Physical fan speed) (rpm)',
  '(Physical core speed) (rpm)',
  '(Engine pressure ratio(P50/P2)',
  '(HPC outlet Static pressure) (psia)',
  '(Ratio of fuel flow to Ps30) (pps/psia)',
  '(Corrected fan speed) (rpm)',
  '(Corrected core speed) (rpm)',
  '(Bypass Ratio) ',
  '(Bleed Enthalpy)',
  '(High-pressure turbines Cool air flow)',
  '(Low-pressure turbines Cool air flow)']]

In [16]:
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer, accuracy_score

import sklearn
from sklearn.metrics import mean_squared_error, r2_score


In [17]:
# Use the first stored selection as the model's feature list.
feature_names = Selected_Features[0]
# (rows, columns) of the candidate-feature frame, before selection.
X.shape

(24720, 19)

In [18]:


# Largest 'cycle' value per test engine, as a one-column frame named 'life'.
df_test_cycle = df_test.groupby('engine')['cycle'].max().to_frame(name='life')

# Broadcast each engine's maximum cycle back onto its rows and keep only the
# rows recorded at that cycle, i.e. the last observation of every engine.
last_cycle = df_test.groupby('engine')['cycle'].transform('max')
# .copy() yields an independent frame, so later in-place edits cannot trip
# pandas' chained-assignment warning.
df_test_max = df_test.loc[df_test['cycle'] == last_cycle].copy()
# df_test_max



In [19]:


# Inputs are restricted to the selected features; targets are the 'RUL' columns
# ('RUL' is the last column of df_train and the only column of df_test_RUL).
X_train, y_train = df_train[feature_names], df_train['RUL']
X_test, y_test = df_test_max[feature_names], df_test_RUL['RUL']



In [20]:


from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training inputs only, then apply that same fitted
# scaling to both the training and the test inputs.
sc = MinMaxScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)



# Save data

In [26]:
import pandas as pd

# Wrap every split in a DataFrame so it can be written with to_csv
X_train_df, y_train_df, X_test_df, y_test_df = (
    pd.DataFrame(split) for split in (X_train, y_train, X_test, y_test)
)

# Write each split to its own CSV file, leaving out the index column
for frame, csv_name in (
    (X_train_df, 'X_train_fd003.csv'),
    (y_train_df, 'y_train_fd003.csv'),
    (X_test_df, 'X_test_fd003.csv'),
    (y_test_df, 'y_test_fd003.csv'),
):
    frame.to_csv(csv_name, index=False)