**Phase 0: Imports and Reading Data**

- Upload dataset.

- Import essential libraries: pandas, numpy, matplotlib, seaborn, etc.

- Load the dataset into a DataFrame.

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
def _read_space_delimited(path):
    """Read a header-less text file whose fields are separated by single spaces."""
    return pd.read_csv(path, sep=' ', header=None)


# FD004 subset: training runs, test runs, and the RUL values that are later
# merged onto the test set.
df_train = _read_space_delimited('/kaggle/input/nasa-cmaps/CMaps/train_FD004.txt')
df_test = _read_space_delimited('/kaggle/input/nasa-cmaps/CMaps/test_FD004.txt')
rul = _read_space_delimited('/kaggle/input/nasa-cmaps/CMaps/RUL_FD004.txt')

**Phase 1: Data Understanding & Preprocessing**

- Explore dataset structure and dimensions.

- Assign column names, drop empty/constant columns.

- Handle missing values and duplicates.

- Verify data types and correct inconsistencies.

- Detect outliers and standardize sensor values.

In [9]:
# Remove the two surplus trailing columns (labels 26 and 27) from the sensor
# tables and the surplus second column (label 1) from the RUL table.
# NOTE(review): these are presumably the empty columns that sep=' ' produces
# for trailing blanks at the end of each row -- confirm against the raw files.
#
# Dropping by label with errors='ignore' (instead of by position, in place)
# keeps this cell safe to re-run: a second execution is a no-op rather than an
# IndexError on frames that have already been trimmed.
df_train = df_train.drop(columns=[26, 27], errors='ignore')
df_test = df_test.drop(columns=[26, 27], errors='ignore')
rul = rul.drop(columns=[1], errors='ignore')
print("df_train:" ,df_train.shape)
print("df_test:",df_test.shape)
print("rul:" ,rul.shape)

df_train: (61249, 26)
df_test: (41214, 26)
rul: (248, 1)


In [10]:
# Column layout of the sensor tables: two identifier columns, three operating
# settings, then 21 sensor channels. The sensor labels are kept verbatim
# because they become the DataFrame column names.
index_names = ['engine', 'cycle']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = [
    "(Fan inlet temperature) (◦R)",
    "(LPC outlet temperature) (◦R)",
    "(HPC outlet temperature) (◦R)",
    "(LPT outlet temperature) (◦R)",
    "(Fan inlet Pressure) (psia)",
    "(bypass-duct pressure) (psia)",
    "(HPC outlet pressure) (psia)",
    "(Physical fan speed) (rpm)",
    "(Physical core speed) (rpm)",
    "(Engine pressure ratio(P50/P2)",
    "(HPC outlet Static pressure) (psia)",
    "(Ratio of fuel flow to Ps30) (pps/psia)",
    "(Corrected fan speed) (rpm)",
    "(Corrected core speed) (rpm)",
    "(Bypass Ratio) ",
    "(Burner fuel-air ratio)",
    "(Bleed Enthalpy)",
    "(Required fan speed)",
    "(Required fan conversion speed)",
    "(High-pressure turbines Cool air flow)",
    "(Low-pressure turbines Cool air flow)",
]
col_names = [*index_names, *setting_names, *sensor_names]

# Train and test share the same layout; the RUL table has a single column.
df_train.columns = df_test.columns = col_names
rul.columns = ['RUL']

In [11]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line after
# each report.
df_test.info()
print("---------------------------------")
df_train.info()
print("---------------------------------")
rul.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41214 entries, 0 to 41213
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   engine                                   41214 non-null  int64  
 1   cycle                                    41214 non-null  int64  
 2   setting_1                                41214 non-null  float64
 3   setting_2                                41214 non-null  float64
 4   setting_3                                41214 non-null  float64
 5   (Fan inlet temperature) (◦R)             41214 non-null  float64
 6   (LPC outlet temperature) (◦R)            41214 non-null  float64
 7   (HPC outlet temperature) (◦R)            41214 non-null  float64
 8   (LPT outlet temperature) (◦R)            41214 non-null  float64
 9   (Fan inlet Pressure) (psia)              41214 non-null  float64
 10  (bypass-duct pressure) (psia)            41214

In [12]:
# Summary statistics with one row per column (transposed for readability).
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine,61249.0,124.325181,71.99535,1.0,60.0,126.0,185.0,249.0
cycle,61249.0,134.311417,89.783389,1.0,62.0,123.0,191.0,543.0
setting_1,61249.0,23.999823,14.780722,0.0,10.0046,25.0014,41.9981,42.008
setting_2,61249.0,0.571347,0.310703,0.0,0.2507,0.7,0.84,0.842
setting_3,61249.0,94.031576,14.251954,60.0,100.0,100.0,100.0,100.0
(Fan inlet temperature) (◦R),61249.0,472.882435,26.436832,445.0,445.0,462.54,491.19,518.67
(LPC outlet temperature) (◦R),61249.0,579.420056,37.342647,535.48,549.33,555.74,607.07,644.42
(HPC outlet temperature) (◦R),61249.0,1417.8966,106.167598,1242.67,1350.55,1367.68,1497.42,1613.0
(LPT outlet temperature) (◦R),61249.0,1201.915359,119.327591,1024.42,1119.49,1136.92,1302.62,1440.77
(Fan inlet Pressure) (psia),61249.0,8.031626,3.622872,3.91,3.91,7.05,10.52,14.62


In [13]:
# Number of rows that exactly repeat an earlier row.
df_train.duplicated(keep='first').sum()

0

In [14]:
# Missing-value count for each column.
df_train.isna().sum()

engine                                     0
cycle                                      0
setting_1                                  0
setting_2                                  0
setting_3                                  0
(Fan inlet temperature) (◦R)               0
(LPC outlet temperature) (◦R)              0
(HPC outlet temperature) (◦R)              0
(LPT outlet temperature) (◦R)              0
(Fan inlet Pressure) (psia)                0
(bypass-duct pressure) (psia)              0
(HPC outlet pressure) (psia)               0
(Physical fan speed) (rpm)                 0
(Physical core speed) (rpm)                0
(Engine pressure ratio(P50/P2)             0
(HPC outlet Static pressure) (psia)        0
(Ratio of fuel flow to Ps30) (pps/psia)    0
(Corrected fan speed) (rpm)                0
(Corrected core speed) (rpm)               0
(Bypass Ratio)                             0
(Burner fuel-air ratio)                    0
(Bleed Enthalpy)                           0
(Required 

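**Feature Engineering:** Calculate RUL (Remaining Useful Life) for every training row as the engine's last recorded cycle minus the current cycle, capped at 125.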

In [15]:
# Last recorded cycle of every training engine.
max_cycles_df = (
    df_train.groupby('engine')['cycle']
    .max()
    .rename('max_cycles')
    .reset_index()
)

# RUL = cycles remaining until the engine's last recorded cycle, capped at 125.
# The helper column `max_cycles` is only needed for the subtraction.
df_train = (
    df_train
    .merge(max_cycles_df, on='engine', how='left')
    .assign(RUL=lambda frame: (frame['max_cycles'] - frame['cycle']).clip(upper=125))
    .drop(columns='max_cycles')
)
df_train

Unnamed: 0,engine,cycle,setting_1,setting_2,setting_3,(Fan inlet temperature) (◦R),(LPC outlet temperature) (◦R),(HPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(Fan inlet Pressure) (psia),...,(Corrected fan speed) (rpm),(Corrected core speed) (rpm),(Bypass Ratio),(Burner fuel-air ratio),(Bleed Enthalpy),(Required fan speed),(Required fan conversion speed),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow),RUL
0,1,1,42.0049,0.8400,100.0,445.00,549.68,1343.43,1112.93,3.91,...,2387.99,8074.83,9.3335,0.02,330,2212,100.00,10.62,6.3670,125
1,1,2,20.0020,0.7002,100.0,491.19,606.07,1477.61,1237.50,9.35,...,2387.73,8046.13,9.1913,0.02,361,2324,100.00,24.37,14.6552,125
2,1,3,42.0038,0.8409,100.0,445.00,548.95,1343.12,1117.05,3.91,...,2387.97,8066.62,9.4007,0.02,329,2212,100.00,10.48,6.4213,125
3,1,4,42.0000,0.8400,100.0,445.00,548.70,1341.24,1118.03,3.91,...,2388.02,8076.05,9.3369,0.02,328,2212,100.00,10.54,6.4176,125
4,1,5,25.0063,0.6207,60.0,462.54,536.10,1255.23,1033.59,7.05,...,2028.08,7865.80,10.8366,0.02,305,1915,84.93,14.03,8.6754,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61244,249,251,9.9998,0.2500,100.0,489.05,605.33,1516.36,1315.28,10.52,...,2388.73,8185.69,8.4541,0.03,372,2319,100.00,29.11,17.5234,4
61245,249,252,0.0028,0.0015,100.0,518.67,643.42,1598.92,1426.77,14.62,...,2388.46,8185.47,8.2221,0.03,396,2388,100.00,39.38,23.7151,3
61246,249,253,0.0029,0.0000,100.0,518.67,643.68,1607.72,1430.56,14.62,...,2388.48,8193.94,8.2525,0.03,395,2388,100.00,39.78,23.8270,2
61247,249,254,35.0046,0.8400,100.0,449.44,555.77,1381.29,1148.18,5.48,...,2388.83,8125.64,9.0515,0.02,337,2223,100.00,15.26,9.0774,1


Unlike traditional datasets where outliers may represent noise, in predictive maintenance datasets such as C-MAPSS, extreme sensor values often carry critical information about degradation. Therefore, instead of removing outliers, we apply feature scaling (standardization/normalization) to make training more stable.

The data is now ready for visualization and EDA.

In [16]:
# The RUL table has no engine column, so the 1-based row position is used as
# the engine id.
# NOTE(review): assumes the rows of the RUL file are ordered by engine id --
# confirm against the dataset description.
# assign() builds a new frame instead of writing into a sliced one, so the
# cell can be re-run without a SettingWithCopyWarning.
rul = rul.assign(engine=rul.index + 1)[['engine', 'RUL']]

# Drop any RUL column left by a previous run so that re-running this cell does
# not produce RUL_x / RUL_y columns. validate= fails loudly if `rul` ever
# contains the same engine id twice.
df_test = pd.merge(
    df_test.drop(columns=['RUL'], errors='ignore'),
    rul,
    on='engine',
    how='left',
    validate='many_to_one',
)
# NOTE(review): every row of a test engine receives the same RUL value here.
# If later cells use df_test['RUL'] as a per-cycle label, it presumably needs
# to be offset by (last cycle of the engine - cycle) -- confirm how it is used.

In [17]:
# Display the test frame to check the RUL column added by the merge.
df_test

Unnamed: 0,engine,cycle,setting_1,setting_2,setting_3,(Fan inlet temperature) (◦R),(LPC outlet temperature) (◦R),(HPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(Fan inlet Pressure) (psia),...,(Corrected fan speed) (rpm),(Corrected core speed) (rpm),(Bypass Ratio),(Burner fuel-air ratio),(Bleed Enthalpy),(Required fan speed),(Required fan conversion speed),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow),RUL
0,1,1,20.0072,0.7000,100.0,491.19,606.67,1481.04,1227.81,9.35,...,2387.78,8048.98,9.2229,0.02,362,2324,100.00,24.31,14.7007,22
1,1,2,24.9984,0.6200,60.0,462.54,536.22,1256.17,1031.48,7.05,...,2028.09,7863.46,10.8632,0.02,306,1915,84.93,14.36,8.5748,22
2,1,3,42.0000,0.8420,100.0,445.00,549.23,1340.13,1105.88,3.91,...,2387.95,8071.13,9.3960,0.02,328,2212,100.00,10.39,6.4365,22
3,1,4,42.0035,0.8402,100.0,445.00,549.19,1339.70,1107.26,3.91,...,2387.90,8078.89,9.3594,0.02,328,2212,100.00,10.56,6.2367,22
4,1,5,35.0079,0.8400,100.0,449.44,555.10,1353.04,1117.80,5.48,...,2387.87,8057.83,9.3030,0.02,333,2223,100.00,14.85,8.9326,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41209,248,277,41.9991,0.8401,100.0,445.00,550.30,1364.40,1129.17,3.91,...,2388.50,8112.61,9.4427,0.02,331,2212,100.00,10.53,6.2620,26
41210,248,278,20.0026,0.7005,100.0,491.19,608.00,1494.75,1260.88,9.35,...,2388.33,8086.83,9.2772,0.02,366,2324,100.00,24.33,14.6486,26
41211,248,279,34.9988,0.8413,100.0,449.44,555.92,1370.65,1130.97,5.48,...,2388.64,8100.84,9.3982,0.02,336,2223,100.00,14.69,8.8389,26
41212,248,280,20.0027,0.7000,100.0,491.19,608.19,1489.11,1256.25,9.35,...,2388.37,8085.24,9.2727,0.03,366,2324,100.00,24.44,14.6887,26
