In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the MetroPT3 air-compressor CSV. index_col=False tells pandas not to
# use the first CSV column as the row index.
csv_path = 'data/MetroPT3(AirCompressor).csv'
data = pd.read_csv(csv_path, index_col=False)
# Preview the first rows to sanity-check the parse.
data.head()

# 1. Data cleaning and preprocessing 

According to the documentation, the following preprocessing steps have been conducted before publishing the data:

- Data segmentation
- Normalization
- Feature Extraction

Thus, we do not need to apply them in our work.

### 1) Overview 

In [None]:
# Total number of missing cells, summed over every column of the raw frame.
null_total = data.isna().sum().sum()
print(f'number of null values: {null_total}')

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared).
duplicate_total = data.duplicated().sum()
print(f'number of duplicates: {duplicate_total}')

In [None]:
# (rows, columns) of the raw frame.
frame_shape = data.shape
print(f'shape: {frame_shape}')

### 2) drop unnecessary columns:


In [None]:
# Drop the unnecessary 'Unnamed: 0' column. Rebinding `data` (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError once the column is gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')


### 3) Add a label Column 
Using the failure information table provided in the data description file (shown below), we label the data so that we can evaluate the effectiveness of failure prediction algorithms:

![alt text](image.png)

In [None]:
# Build the labeled frame as a new object so `data` itself is not modified;
# every row starts with status 0.
labeled_data = data.assign(status=0)


#### Converting the timestamp column into pandas.DateTime data type


In [None]:
# Parse the timestamp column into pandas datetimes using a fixed layout.
timestamp_format = '%Y-%m-%d %H:%M:%S'
parsed_timestamps = pd.to_datetime(labeled_data['timestamp'], format=timestamp_format)
labeled_data['timestamp'] = parsed_timestamps
print("current data type of timestamp: ", labeled_data['timestamp'].dtype)

In [None]:
def convert_time(X):
    """Parse each 'YYYY-MM-DD HH:MM:SS' value in X with pd.to_datetime.

    Returns a plain list holding one parsed value per element, in input order.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    return [pd.to_datetime(item, format=time_format) for item in X]

# Failure intervals used for labeling, kept as two parallel lists:
# failure_start_time[i] pairs with failure_end_time[i].
failure_start_time = convert_time([
    "2020-04-18 00:00:00",
    "2020-05-29 23:30:00",
    "2020-06-05 10:00:00",
    "2020-07-15 14:30:00",
])
failure_end_time = convert_time([
    "2020-04-18 23:59:00",
    "2020-05-30 06:00:00",
    "2020-06-07 14:30:00",
    "2020-07-15 19:00:00",
])


In [None]:
# Flag every row whose timestamp lies inside a failure window (both ends inclusive).
for window_start, window_end in zip(failure_start_time, failure_end_time):
    after_start = labeled_data['timestamp'] >= window_start
    before_end = labeled_data['timestamp'] <= window_end
    in_window = after_start & before_end
    labeled_data.loc[in_window, 'status'] = 1
    # Report how many rows are flagged inside this window.
    flagged_in_window = labeled_data.loc[in_window, 'status'].sum()
    print(f"number of failures between {window_start} and {window_end}: {flagged_in_window}")

total_flagged = labeled_data['status'].sum()
print(f"number of failures: {total_flagged}")

In [None]:
# Show the first few rows that were labeled as failures (status == 1).
failure_rows = labeled_data[labeled_data['status'] == 1]
print(f"Example of Failure state \n {failure_rows.head()}")


### 4) Subsampling the data

In [None]:
# Partition the labeled rows by class: status 1 (failure) versus status 0.
status_column = labeled_data['status']
positive_samples = labeled_data.loc[status_column == 1]
negative_samples = labeled_data.loc[status_column == 0]

# Report the size of each class.
positive_shape = positive_samples.shape
negative_shape = negative_samples.shape
print(f"shape of positive samples: {positive_shape}")
print(f"shape of negative samples: {negative_shape}")


There are around 30k positive samples and 1500k negative samples. This indicates a highly imbalanced dataset, which can be challenging to handle. Thus, we need to subsample the negative class to balance our data.

In order to do so, we will randomly sample 30k negative samples from the 1500k negative samples.

In [None]:
# Draw as many negative rows as there are positive rows; the fixed
# random_state makes the draw reproducible.
n_positive = len(positive_samples)
negative_samples = negative_samples.sample(n=n_positive, random_state=42)
print(f"Negative dataset after subsampling {len(negative_samples)}")


In [None]:
# Pie chart of the class balance (positive vs. negative row counts).
class_counts = [len(positive_samples), len(negative_samples)]
plt.figure(figsize=(10, 5))
plt.pie(
    class_counts,
    labels=['Positive', 'Negative'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['lightpink', 'lightblue'],
)
plt.title('Class Distribution')
plt.show()


Now, we merge both the positive and negative samples into a single set 

In [None]:
# Stack the positive rows on top of the subsampled negative rows
# (row-wise concatenation; each row keeps its original index label).
class_frames = [positive_samples, negative_samples]
merged_data = pd.concat(class_frames, axis=0)
print(f"shape of merged data: {merged_data.shape}")


In [None]:
# Summarize the merged frame: column dtypes and non-null counts.
merged_data.info()

### 5) Checking for outliers 


In [None]:
def _iqr_bounds(series, factor=1.5):
    """Return the (lower, upper) fences Q1 - factor*IQR and Q3 + factor*IQR of `series`."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - factor * iqr, q3 + factor * iqr


def identify_outliers(data, column, factor=1.5):
    """Return the rows of `data` whose `column` value lies outside the IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The outlier rows (original index labels preserved). The count is also printed.
    """
    lower_bound, upper_bound = _iqr_bounds(data[column], factor)
    is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
    outliers = data[is_outlier]
    num_outliers = len(outliers)
    print(f"Number of outliers in {column}: {num_outliers}")
    return outliers

def remove_outliers(data, column, factor=1.5):
    """Return `data` without the rows that `identify_outliers` would report.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows that are not outliers (original index labels preserved).
        The number of removed rows is also printed.
    """
    lower_bound, upper_bound = _iqr_bounds(data[column], factor)
    is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
    # Negate the outlier mask rather than testing "within bounds": a missing
    # value compares False both ways, so it is neither reported as an outlier
    # nor silently dropped and miscounted as one.
    outliers_removed = data[~is_outlier]
    num_removed = len(data) - len(outliers_removed)
    print(f"Number of outliers removed from {column}: {num_removed}\n")
    return outliers_removed

# Report the IQR outlier count of every column except the timestamp and the label.
clean_data = merged_data.copy()
non_feature_cols = ('timestamp', 'status')
for col in clean_data.columns:
    if col in non_feature_cols:
        continue
    outliers = identify_outliers(clean_data, col)


The features ['COMP', 'DV_eletric', 'Towers', 'MPG', 'LPS', 'Pressure_switch', 'Oil_level', 'Caudal_impulses'] are binary, so we do not remove outliers from them.

In [None]:
# Columns that must not be IQR-filtered: the timestamp, the label, and the
# eight features the notebook text describes as binary.
skip_cols = ['timestamp', 'status',
             'COMP', 'DV_eletric', 'Towers', 'MPG',
             'LPS', 'Pressure_switch', 'Oil_level', 'Caudal_impulses']

# Chain the filters: each pass starts from the previous pass's result.
# Filtering `clean_data` afresh on every iteration would overwrite the earlier
# results and leave only the last column's filter applied.
cleaned_data = clean_data
for col in clean_data.columns:
    if col not in skip_cols:
        cleaned_data = remove_outliers(cleaned_data, col)

# Take an independent copy so later column assignments do not act on a
# filtered view of `clean_data`.
cleaned_data = cleaned_data.copy()
# NOTE(review): confirm the chained filter does not disproportionately remove
# failure rows, since several features differ strongly between the classes.

In [None]:
# Columns treated as binary in this cell.
binary_cols = ['LPS', 'Pressure_switch', 'Oil_level', 'Caudal_impulses']
# `cleaned_data` is the result of a boolean-mask filter; take an explicit copy
# before writing to it so pandas does not emit SettingWithCopyWarning.
cleaned_data = cleaned_data.copy()
# Round to the nearest integer so the values in these columns are whole numbers.
cleaned_data[binary_cols] = cleaned_data[binary_cols].apply(np.round)

In [None]:
# Print how many distinct values each column of the cleaned frame holds.
unique_counts = cleaned_data.nunique()
for col, n_unique in unique_counts.items():
    print(f"number of unique values in {col}: {n_unique}")
    

# 2. Exploratory data analysis

### 1) Correlation

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only. The datetime
# `timestamp` column is excluded explicitly because, since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default.
numeric_frame = cleaned_data.select_dtypes(include=['number', 'bool'])
correlation = numeric_frame.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


From the above correlation heatmap, we can see that our target variable **"status"** has a strong correlation with these features: TP2, H1, DV_pressure, Oil_temperature, Motor_current, COMP, DV_eletric and MPG.

### 2) Visualization

1. Outliers

In [None]:
# Side-by-side box plots of every feature column (timestamp and label excluded).
feature_frame = cleaned_data.drop(['timestamp', 'status'], axis=1)
sns.set(rc={'figure.figsize': (20, 8.27)})
sns.boxplot(data=feature_frame)
plt.title('Boxplot of all features')
plt.show()



2. Probability distribution


In [None]:
def plot_col_distribution(data, ncols=4):
    """Plot a density histogram with a KDE curve for every column of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.
    ncols : int, default 4
        Number of subplot columns; as many rows as needed are created.
    """
    n_features = len(data.columns)
    if n_features == 0:
        return  # nothing to draw
    nrows = -(-n_features // ncols)  # ceiling division without importing math
    # 5 x 2.5 inches per cell reproduces the former 20 x 10 figure for a 4 x 4 grid.
    fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 2.5 * nrows), squeeze=False)
    axes = axes.flatten()
    for ax, col in zip(axes, data.columns):
        # histplot(kde=True, stat='density') is seaborn's documented
        # replacement for the deprecated distplot.
        sns.histplot(data[col], kde=True, stat='density', kde_kws=dict(cut=3), ax=ax)
        ax.set_title(f'Distribution of {col}')
    # Hide grid cells that have no column to show.
    for ax in axes[n_features:]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()
    
# Plot only the feature columns: the timestamp and the status label are dropped first.
plot_col_distribution(cleaned_data.drop(['timestamp', 'status'], axis = 1))


3. Time series plot

In [None]:
# Display the first 16 columns of the cleaned frame, selected by position.
cleaned_data.iloc[:,:16]


In [None]:
# Put the rows in ascending timestamp order.
cleaned_data = cleaned_data.sort_values('timestamp')


In [None]:
# Draw one subplot per column for the first 16 columns (selected by position).
# No x= argument is passed, so each x-axis is the frame's index.
series_frame = cleaned_data.iloc[:, :16]
plot_options = dict(
    subplots=True,
    layout=(6, 3),
    figsize=(22, 22),
    fontsize=10,
    linewidth=1,
    sharex=False,
    title='Visualization of the Original Time Series',
)
series_frame.plot(**plot_options)
plt.show()