# Data Preprocessing 

In [43]:
import os
import pathlib

import pandas as pd

# Directory that holds the MetroPT feather files. Set the METROPT_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original machine-specific location is used as the fallback.
DATA_DIR = pathlib.Path(
    os.environ.get("METROPT_DATA_DIR", "C:/Ahsan.docx/PROJECT/MetroPT_project/MetroPT/")
)

# Load the preprocessed sensor table; every dataset built below starts from a copy of it.
df = pd.read_feather(DATA_DIR / "Processed.feather")
df.head(10)

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Oil_level,Caudal_impulses,gpsSpeed
0,2022-01-01 06:00:00,-0.012001,9.757812,9.757812,-0.028,1.576172,63.34375,19.046875,3.955078,True,False,True,True,False,False,False,0
1,2022-01-01 06:00:01,-0.012001,9.757812,9.757812,-0.028,1.578125,63.25,19.046875,4.027344,True,False,True,True,False,False,False,0
2,2022-01-01 06:00:02,-0.010002,9.757812,9.757812,-0.028,1.578125,63.3125,19.046875,3.945312,True,False,True,True,False,False,False,0
3,2022-01-01 06:00:03,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.1875,19.046875,3.929688,True,False,True,True,False,False,False,0
4,2022-01-01 06:00:04,-0.012001,9.757812,9.757812,-0.029999,1.578125,63.15625,19.046875,3.994141,True,False,True,True,False,False,False,0
5,2022-01-01 06:00:05,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.0625,19.046875,3.994141,True,False,True,True,False,False,False,0
6,2022-01-01 06:00:06,-0.012001,9.757812,9.757812,-0.028,1.578125,63.0,19.046875,3.947266,True,False,True,True,False,False,False,0
7,2022-01-01 06:00:07,-0.010002,9.75,9.757812,-0.028,1.576172,63.0625,19.046875,4.023438,True,False,True,True,False,False,False,0
8,2022-01-01 06:00:08,-0.012001,9.75,9.75,-0.028,1.576172,63.0625,19.046875,3.953125,True,False,True,True,False,False,False,0
9,2022-01-01 06:00:09,-0.010002,9.75,9.75,-0.028,1.576172,63.0,19.046875,3.925781,True,False,True,True,False,False,False,0


In [44]:
# Summarise the column dtypes and the memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10773588 entries, 0 to 10773587
Data columns (total 17 columns):
 #   Column           Dtype         
---  ------           -----         
 0   timestamp        datetime64[ns]
 1   TP2              float16       
 2   TP3              float16       
 3   H1               float16       
 4   DV_pressure      float16       
 5   Reservoirs       float16       
 6   Oil_temperature  float16       
 7   Flowmeter        float16       
 8   Motor_current    float16       
 9   COMP             bool          
 10  DV_eletric       bool          
 11  Towers           bool          
 12  MPG              bool          
 13  LPS              bool          
 14  Oil_level        bool          
 15  Caudal_impulses  bool          
 16  gpsSpeed         int16         
dtypes: bool(7), datetime64[ns](1), float16(8), int16(1)
memory usage: 339.1 MB


# Preparing the Data for Classification
For classification we treat this as a multi-class problem in which every data point receives one of three labels:

- 0 if the point belongs to normal operating conditions
- 1 if the point falls within the N_Hours (here 2 hours) immediately before the start of a known failure
- 2 if the point lies inside a known failure interval

We do this because it is hard for an ML model to predict a continuous timestamp directly, so some discretization is needed. One point to note is the threshold N_Hours: accuracy may decrease if this threshold is too high, because the model is then more likely to confuse the normal and pre-failure states.

In [45]:
Classification_df = df.copy(deep=True)

# Known failure intervals as (start, end) strings; both endpoints are inclusive.
failure_periods = [
    ("2022-02-28 21:53", "2022-03-01 02:00"),
    ("2022-03-23 14:54", "2022-03-23 15:24"),
    ("2022-05-30 12:00", "2022-06-02 06:18"),
]

failure_periods = [
    (pd.to_datetime(start), pd.to_datetime(end)) for start, end in failure_periods
]

# Width, in hours, of the pre-failure window that receives label 1.
N_Hours = 2
pre_failure_window = pd.Timedelta(hours=N_Hours)

# Convert the 'timestamp' column to datetime
Classification_df["timestamp"] = pd.to_datetime(Classification_df["timestamp"])
timestamps = Classification_df["timestamp"]

# Every row starts as normal operation (label 0).
Classification_df["Target"] = 0

# Pass 1: label the N_Hours window right before each failure start as 1.
for start, _end in failure_periods:
    is_pre_failure = (timestamps >= start - pre_failure_window) & (timestamps < start)
    Classification_df.loc[is_pre_failure, "Target"] = 1

# Pass 2: label the failure intervals as 2. This runs after pass 1 on purpose:
# if a pre-failure window reaches back into an earlier failure interval (large
# N_Hours or closely spaced failures), the failure label must win.
for start, end in failure_periods:
    is_failure = (timestamps >= start) & (timestamps <= end)
    Classification_df.loc[is_failure, "Target"] = 2

# Print the first few rows of the updated DataFrame
Classification_df.head()



Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Oil_level,Caudal_impulses,gpsSpeed,Target
0,2022-01-01 06:00:00,-0.012001,9.757812,9.757812,-0.028,1.576172,63.34375,19.046875,3.955078,True,False,True,True,False,False,False,0,0
1,2022-01-01 06:00:01,-0.012001,9.757812,9.757812,-0.028,1.578125,63.25,19.046875,4.027344,True,False,True,True,False,False,False,0,0
2,2022-01-01 06:00:02,-0.010002,9.757812,9.757812,-0.028,1.578125,63.3125,19.046875,3.945312,True,False,True,True,False,False,False,0,0
3,2022-01-01 06:00:03,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.1875,19.046875,3.929688,True,False,True,True,False,False,False,0,0
4,2022-01-01 06:00:04,-0.012001,9.757812,9.757812,-0.029999,1.578125,63.15625,19.046875,3.994141,True,False,True,True,False,False,False,0,0


In [46]:
# Check that the "Target" column was added and inspect its dtype and the memory usage.
Classification_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10773588 entries, 0 to 10773587
Data columns (total 18 columns):
 #   Column           Dtype         
---  ------           -----         
 0   timestamp        datetime64[ns]
 1   TP2              float16       
 2   TP3              float16       
 3   H1               float16       
 4   DV_pressure      float16       
 5   Reservoirs       float16       
 6   Oil_temperature  float16       
 7   Flowmeter        float16       
 8   Motor_current    float16       
 9   COMP             bool          
 10  DV_eletric       bool          
 11  Towers           bool          
 12  MPG              bool          
 13  LPS              bool          
 14  Oil_level        bool          
 15  Caudal_impulses  bool          
 16  gpsSpeed         int16         
 17  Target           int64         
dtypes: bool(7), datetime64[ns](1), float16(8), int16(1), int64(1)
memory usage: 421.3 MB


In [56]:
# Write the labelled classification frame to "Classification.feather" inside DATA_DIR.
Classification_df.to_feather(DATA_DIR / "Classification.feather")

# Creating a Regression data frame
This data is used for regression, which in this context is also known as time-to-failure prediction or remaining-useful-life (RUL) prediction.
>>Regression is a statistical method that models the relationship between input variables and a continuous target; in its simplest (linear) form it does so with a linear equation. It is used to predict future values and to understand how variables relate to each other.

In [47]:
# Work on an independent copy so the regression target never touches `df`.
Reg_df = df.copy(deep=True)

# Known failure windows as (start, end) pairs, parsed to Timestamps in one step.
failure_periods = [
    (pd.to_datetime(begin), pd.to_datetime(finish))
    for begin, finish in [
        ("2022-02-28 21:53", "2022-03-01 02:00"),
        ("2022-03-23 14:54", "2022-03-23 15:24"),
        ("2022-05-30 12:00", "2022-06-02 06:18"),
    ]
]

`failure_periods.sort(key=lambda x: x[0])`
This line, used in the cell below, sorts the failure_periods list by the start time of each failure period. Sorting ensures that the function find_time_till_failure processes the failure periods in chronological order.

- The function `find_time_till_failure` calculates the time remaining until the next failure event for a given curr_time (the current timestamp).
- It iterates through the sorted failure_periods and checks whether curr_time is before the start of a failure period (curr_time < start).
- If curr_time is before a failure period, it calculates the difference between start and curr_time, converts it to hours, and returns the result.
- If curr_time is not before the start of any failure period, it returns 0 (indicating no upcoming failure).

In [48]:
# Put the failure windows in chronological order (by start timestamp), in place;
# find_time_till_failure walks them in this order.
failure_periods.sort(key=lambda period: period[0])

# Make sure the 'timestamp' column holds datetimes before comparing against the windows.
Reg_df["timestamp"] = pd.to_datetime(Reg_df["timestamp"])

def find_time_till_failure(curr_time, periods=None):
    """Return the number of hours from ``curr_time`` until the next failure.

    Parameters
    ----------
    curr_time : pandas.Timestamp
        The timestamp to evaluate.
    periods : list of (start, end) Timestamp pairs, optional
        Failure intervals sorted by start time. Defaults to the notebook-level
        ``failure_periods`` list.

    Returns
    -------
    float
        Hours until the start of the next failure interval; ``0.0`` while
        ``curr_time`` lies inside a failure interval (start and end inclusive);
        ``0.0`` when no failure interval starts after ``curr_time``.
    """
    if periods is None:
        periods = failure_periods
    for start, end in periods:
        if curr_time < start:
            return (start - curr_time).total_seconds() / 3600
        if curr_time <= end:
            # start <= curr_time <= end: the failure is in progress, so nothing
            # is left until failure. Without this check the loop would fall
            # through and report the distance to the *following* failure.
            return 0.0
    # NOTE(review): timestamps after the last known failure also get 0, which is
    # indistinguishable from "failing now" - consider excluding those rows
    # downstream before training.
    return 0.0

# Apply the function to create the "Hours_till_Failure" column.
# find_time_till_failure is called once per row of the 'timestamp' column.
Reg_df["Hours_till_Failure"] = Reg_df["timestamp"].apply(find_time_till_failure)

In [36]:
# Preview the first rows, including the new "Hours_till_Failure" column.
Reg_df.head()

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Oil_level,Caudal_impulses,gpsSpeed,Hours_till_Failure
0,2022-01-01 06:00:00,-0.012001,9.757812,9.757812,-0.028,1.576172,63.34375,19.046875,3.955078,1,0,1,1,0,0,0,0,1407.883333
1,2022-01-01 06:00:01,-0.012001,9.757812,9.757812,-0.028,1.578125,63.25,19.046875,4.027344,1,0,1,1,0,0,0,0,1407.883056
2,2022-01-01 06:00:02,-0.010002,9.757812,9.757812,-0.028,1.578125,63.3125,19.046875,3.945312,1,0,1,1,0,0,0,0,1407.882778
3,2022-01-01 06:00:03,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.1875,19.046875,3.929688,1,0,1,1,0,0,0,0,1407.8825
4,2022-01-01 06:00:04,-0.012001,9.757812,9.757812,-0.029999,1.578125,63.15625,19.046875,3.994141,1,0,1,1,0,0,0,0,1407.882222


In [49]:
# Check that "Hours_till_Failure" was added and inspect the column dtypes.
Reg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10773588 entries, 0 to 10773587
Data columns (total 18 columns):
 #   Column              Dtype         
---  ------              -----         
 0   timestamp           datetime64[ns]
 1   TP2                 float16       
 2   TP3                 float16       
 3   H1                  float16       
 4   DV_pressure         float16       
 5   Reservoirs          float16       
 6   Oil_temperature     float16       
 7   Flowmeter           float16       
 8   Motor_current       float16       
 9   COMP                bool          
 10  DV_eletric          bool          
 11  Towers              bool          
 12  MPG                 bool          
 13  LPS                 bool          
 14  Oil_level           bool          
 15  Caudal_impulses     bool          
 16  gpsSpeed            int16         
 17  Hours_till_Failure  float64       
dtypes: bool(7), datetime64[ns](1), float16(8), float64(1), int16(1)
memory usage: 42

In [50]:
# Write the regression frame to "Regression.feather" inside DATA_DIR.
Reg_df.to_feather(DATA_DIR / "Regression.feather")

# Creating a failure detection Dataset
- 0 under normal conditions
- 1 if the point lies within a known failure interval

>>The code below labels failure periods in FD_df (a copy of df) by creating a new column called "Failure". The column is set to 1 for timestamps that fall within a failure period and 0 otherwise.

In [51]:
# Independent copy of the sensor table for the binary failure-detection task.
FD_df = df.copy(deep=True)

# Known failure windows as (start, end) pairs, parsed to Timestamps in one step.
failure_periods = [
    (pd.to_datetime(begin), pd.to_datetime(finish))
    for begin, finish in [
        ("2022-02-28 21:53", "2022-03-01 02:00"),
        ("2022-03-23 14:54", "2022-03-23 15:24"),
        ("2022-05-30 12:00", "2022-06-02 06:18"),
    ]
]

# Make sure the 'timestamp' column holds datetimes before comparing.
FD_df["timestamp"] = pd.to_datetime(FD_df["timestamp"])

# Collect one boolean mask covering every failure window (both ends inclusive).
in_failure = pd.Series(False, index=FD_df.index)
for begin, finish in failure_periods:
    in_failure |= FD_df["timestamp"].between(begin, finish)

# 1 inside a known failure window, 0 everywhere else.
FD_df["Failure"] = in_failure.astype("int64")

In [52]:
# Preview the first rows, including the new "Failure" column.
FD_df.head()

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Oil_level,Caudal_impulses,gpsSpeed,Failure
0,2022-01-01 06:00:00,-0.012001,9.757812,9.757812,-0.028,1.576172,63.34375,19.046875,3.955078,True,False,True,True,False,False,False,0,0
1,2022-01-01 06:00:01,-0.012001,9.757812,9.757812,-0.028,1.578125,63.25,19.046875,4.027344,True,False,True,True,False,False,False,0,0
2,2022-01-01 06:00:02,-0.010002,9.757812,9.757812,-0.028,1.578125,63.3125,19.046875,3.945312,True,False,True,True,False,False,False,0,0
3,2022-01-01 06:00:03,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.1875,19.046875,3.929688,True,False,True,True,False,False,False,0,0
4,2022-01-01 06:00:04,-0.012001,9.757812,9.757812,-0.029999,1.578125,63.15625,19.046875,3.994141,True,False,True,True,False,False,False,0,0


In [53]:
# Check that the "Failure" column was added and inspect its dtype and the memory usage.
FD_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10773588 entries, 0 to 10773587
Data columns (total 18 columns):
 #   Column           Dtype         
---  ------           -----         
 0   timestamp        datetime64[ns]
 1   TP2              float16       
 2   TP3              float16       
 3   H1               float16       
 4   DV_pressure      float16       
 5   Reservoirs       float16       
 6   Oil_temperature  float16       
 7   Flowmeter        float16       
 8   Motor_current    float16       
 9   COMP             bool          
 10  DV_eletric       bool          
 11  Towers           bool          
 12  MPG              bool          
 13  LPS              bool          
 14  Oil_level        bool          
 15  Caudal_impulses  bool          
 16  gpsSpeed         int16         
 17  Failure          int64         
dtypes: bool(7), datetime64[ns](1), float16(8), int16(1), int64(1)
memory usage: 421.3 MB


In [54]:
# Write the failure-detection frame to "Failure_detection.feather" inside DATA_DIR.
FD_df.to_feather(DATA_DIR / "Failure_detection.feather")