In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [3]:
filenames = ["data/msn_02_fuel_leak_signals_preprocessed.csv", "data/msn_10_fuel_leak_signals_preprocessed.csv", "data/msn_11_fuel_leak_signals_preprocessed.csv", "data/msn_12_fuel_leak_signals_preprocessed.csv", "data/msn_14_fuel_leak_signals_preprocessed.csv", "data/msn_29_fuel_leak_signals_preprocessed.csv", "data/msn_37_fuel_leak_signals_preprocessed.csv", "data/msn_53_fuel_leak_signals_preprocessed.csv"]

datasets = []
for filename in filenames:
    datasets.append(pd.read_csv(filename, sep=";"))
    
print("Datasets loaded")

Datasets loaded


In [4]:
for dataset in datasets:
    print(dataset.head)

<bound method NDFrame.head of                    UTC_TIME         MSN Flight  ENGINE_RUNNING_1  \
0       2011-03-09 12:06:01  A400M-0002  V0136              True   
1       2011-03-09 12:06:02  A400M-0002  V0136             False   
2       2011-03-09 12:06:03  A400M-0002  V0136             False   
3       2011-03-09 12:06:04  A400M-0002  V0136             False   
4       2011-03-09 12:06:05  A400M-0002  V0136             False   
...                     ...         ...    ...               ...   
623575  2011-03-10 17:27:23  A400M-0002  V0137             False   
623576  2011-03-10 17:27:24  A400M-0002  V0137             False   
623577  2011-03-10 17:27:25  A400M-0002  V0137             False   
623578  2011-03-10 17:27:26  A400M-0002  V0137             False   
623579  2011-03-10 17:27:27  A400M-0002  V0137             False   

        ENGINE_RUNNING_2  ENGINE_RUNNING_3  ENGINE_RUNNING_4  \
0                   True              True              True   
1                  False 

In [5]:
for dataset in datasets:
    print(dataset.info(verbose=True))
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623580 entries, 0 to 623579
Data columns (total 111 columns):
 #    Column                                Dtype  
---   ------                                -----  
 0    UTC_TIME                              object 
 1    MSN                                   object 
 2    Flight                                object 
 3    ENGINE_RUNNING_1                      bool   
 4    ENGINE_RUNNING_2                      bool   
 5    ENGINE_RUNNING_3                      bool   
 6    ENGINE_RUNNING_4                      bool   
 7    FLIGHT_PHASE_COUNT                    float64
 8    FUEL_FLOW_1                           float64
 9    FUEL_FLOW_2                           float64
 10   FUEL_FLOW_3                           float64
 11   FUEL_FLOW_4                           float64
 12   FUEL_PITCH                            float64
 13   FUEL_ROLL                             float64
 14   FUEL_TRANSFER_MODE_VALUE              float64
 15 

In [6]:
# Count NaN values
for dataset in datasets:
    # print percentage of NaN values if more than 0.8
    for column in dataset.columns:
        if dataset[column].isna().sum() > 0.7 * len(dataset[column]):
            print("NaN percentage in column " + column + ": " + str(dataset[column].isna().sum() / len(dataset[column])))
    

NaN percentage in column APU_FUEL_FLOW_REQUEST_SIGNAL_1: 0.7257256486737869
NaN percentage in column FUEL_USED_2: 0.9045414327311336
NaN percentage in column FUEL_USED_3: 0.9019706890172294
NaN percentage in column FUEL_USED_4: 0.9102733225012468
NaN percentage in column FUEL_USED_1: 0.9156400315310242
NaN percentage in column FUEL_USED_2: 0.9270844741193431
NaN percentage in column FUEL_USED_3: 0.9290465512505408
NaN percentage in column FUEL_USED_4: 0.9346630335063438
NaN percentage in column FUEL_USED_1: 0.9260196158341397
NaN percentage in column FUEL_USED_2: 0.9617719074387006
NaN percentage in column FUEL_USED_3: 0.9619637376280304
NaN percentage in column FUEL_USED_4: 0.9621805088211096
NaN percentage in column FUEL_USED_1: 0.9603653579926987
NaN percentage in column FUEL_USED_2: 0.9420524874741246
NaN percentage in column FUEL_USED_3: 0.9419641443113576
NaN percentage in column FUEL_USED_4: 0.9424914021632871
NaN percentage in column FUEL_USED_1: 0.942016288324503
NaN percentag

In [7]:
# Correlation matrix for all datasets numercial values
#for dataset in datasets:
    #print(dataset.corr())


In [8]:
# draw plots for all datasets for column LEAK_DETECTION_LEAK_FLOW
for i in range(len(datasets)):
    print(filenames[i], datasets[i].shape)

data/msn_02_fuel_leak_signals_preprocessed.csv (623580, 111)
data/msn_10_fuel_leak_signals_preprocessed.csv (621610, 17)
data/msn_11_fuel_leak_signals_preprocessed.csv (4455992, 17)
data/msn_12_fuel_leak_signals_preprocessed.csv (3247664, 17)
data/msn_14_fuel_leak_signals_preprocessed.csv (4640993, 17)
data/msn_29_fuel_leak_signals_preprocessed.csv (4129447, 17)
data/msn_37_fuel_leak_signals_preprocessed.csv (3236645, 17)
data/msn_53_fuel_leak_signals_preprocessed.csv (3034227, 17)


In [9]:
# draw plots for all datasets for columns VALUE_FUEL_QTY_FT1;VALUE_FUEL_QTY_FT2;VALUE_FUEL_QTY_FT3;VALUE_FUEL_QTY_FT4;VALUE_FUEL_QTY_LXT;VALUE_FUEL_QTY_RXT
#for dataset in datasets:
 #   dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT1')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT2')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT3')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT4')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_LXT')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_RXT')
    

In [10]:
#datasets[0].plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT1')

In [11]:
# convert UTC_TIME to datetime
for dataset in datasets:
    dataset['UTC_TIME'] = pd.to_datetime(dataset['UTC_TIME'], format='%Y-%m-%d %H:%M:%S.%f')


In [12]:
for dataset in datasets:
    #introduce column date
    dataset['DATE'] = dataset['UTC_TIME'].dt.date
    dataset["MONTH"] = dataset['UTC_TIME'].dt.month
    dataset["DAY"] = dataset['UTC_TIME'].dt.day
    dataset["HOUR"] = dataset['UTC_TIME'].dt.hour
    dataset["MINUTE"] = dataset['UTC_TIME'].dt.minute
    dataset["SECOND"] = dataset['UTC_TIME'].dt.second
    
# number of seconds since epoch 
import time 


In [13]:
import plotly.subplots as sp
import plotly.graph_objs as go

def plot_datasets():
    # List of columns to plot for VALUE_FUEL_QTY_CT;VALUE_FUEL_QTY_FT1;VALUE_FUEL_QTY_FT2;VALUE_FUEL_QTY_FT3;VALUE_FUEL_QTY_FT4;VALUE_FUEL_QTY_LXT;VALUE_FUEL_QTY_RXT
    fuel_qty_cols = ['VALUE_FUEL_QTY_CT', 'VALUE_FUEL_QTY_FT1', 'VALUE_FUEL_QTY_FT2', 'VALUE_FUEL_QTY_FT3', 'VALUE_FUEL_QTY_FT4', 'VALUE_FUEL_QTY_LXT', 'VALUE_FUEL_QTY_RXT']

    # Loop over every unique date
    for date in datasets[0]['DATE'].unique():
        # Create a subplot
        fig = sp.make_subplots(rows=3, cols=3)
        
        # size of fig 
        fig.update_layout(height=1400, width=1400)

        # Loop over each column
        for i, col in enumerate(fuel_qty_cols):
            # Add scatter plot to subplot
            fig.add_trace(
                go.Scatter(x=datasets[0][datasets[0]['DATE'] == date]['UTC_TIME'], 
                        y=datasets[0][datasets[0]['DATE'] == date][col], 
                        mode='markers',
                        name=col),
                row=i//3 + 1, 
                col=i%3 + 1
            )

            # Update xaxis and yaxis titles
            fig.update_xaxes(title_text='UTC_TIME', row=i//3 + 1, col=i%3 + 1)
            fig.update_yaxes(title_text=col + " " + str(date), row=i//3 + 1, col=i%3 + 1)

        # Show the plot
        #fig.show()
        
        # save the plot as png
        fig.write_image("plots/" + str(date) + ".png")

In [14]:
datasets[0][datasets[0]['DATE'] == "2011-03-10"]

Unnamed: 0,UTC_TIME,MSN,Flight,ENGINE_RUNNING_1,ENGINE_RUNNING_2,ENGINE_RUNNING_3,ENGINE_RUNNING_4,FLIGHT_PHASE_COUNT,FUEL_FLOW_1,FUEL_FLOW_2,...,STATUS_OVERFLOW_LST,STATUS_OVERFLOW_RST,VALUE_FUEL_VOL_LST,VALUE_FUEL_VOL_RST,DATE,MONTH,DAY,HOUR,MINUTE,SECOND


In [15]:
# from dataset 0 show 2011-03-10
datasets[0][datasets[0]['DATE'] == "2011-03-10"]

Unnamed: 0,UTC_TIME,MSN,Flight,ENGINE_RUNNING_1,ENGINE_RUNNING_2,ENGINE_RUNNING_3,ENGINE_RUNNING_4,FLIGHT_PHASE_COUNT,FUEL_FLOW_1,FUEL_FLOW_2,...,STATUS_OVERFLOW_LST,STATUS_OVERFLOW_RST,VALUE_FUEL_VOL_LST,VALUE_FUEL_VOL_RST,DATE,MONTH,DAY,HOUR,MINUTE,SECOND


In [16]:
#!pip install lazypredict
#!pip install pyforest
#!pip install pycaret

In [17]:
import pyforest
lazy_imports()

['from sklearn.preprocessing import PolynomialFeatures',
 'from sklearn.preprocessing import RobustScaler',
 'from sklearn.preprocessing import OneHotEncoder',
 'import imutils',
 'from sklearn.feature_extraction.text import TfidfVectorizer',
 'from sklearn.linear_model import LinearRegression',
 'import sklearn',
 'from sklearn.decomposition import PCA',
 'from sklearn.feature_extraction.text import CountVectorizer',
 'import keras',
 'import cv2',
 'from PIL import Image',
 'from sklearn.linear_model import Ridge',
 'from sklearn.linear_model import ElasticNetCV',
 'import skimage',
 'from openpyxl import load_workbook',
 'from pathlib import Path',
 'from sklearn.preprocessing import LabelEncoder',
 'import numpy as np',
 'from sklearn.model_selection import RandomizedSearchCV',
 'import plotly.express as px',
 'from sklearn.preprocessing import MinMaxScaler',
 'import seaborn as sns',
 'import gensim',
 'from sklearn.linear_model import ElasticNet',
 'from sklearn.model_selection i

In [18]:
# introduce new columns for each dataset [CT_DIFF, LXT_DIFF, RXT_DIFF, FT1_DIFF, FT2_DIFF, FT3_DIFF, FT4_DIFF]
# value represents the difference between the current and the previous value
for dataset in datasets:
    dataset['CT_DIFF'] = dataset['VALUE_FUEL_QTY_CT'].diff()
    dataset['LXT_DIFF'] = dataset['VALUE_FUEL_QTY_LXT'].diff()
    dataset['RXT_DIFF'] = dataset['VALUE_FUEL_QTY_RXT'].diff()
    dataset['FT1_DIFF'] = dataset['VALUE_FUEL_QTY_FT1'].diff()
    dataset['FT2_DIFF'] = dataset['VALUE_FUEL_QTY_FT2'].diff()
    dataset['FT3_DIFF'] = dataset['VALUE_FUEL_QTY_FT3'].diff()
    dataset['FT4_DIFF'] = dataset['VALUE_FUEL_QTY_FT4'].diff()  

In [19]:
datasets[0].head()

Unnamed: 0,UTC_TIME,MSN,Flight,ENGINE_RUNNING_1,ENGINE_RUNNING_2,ENGINE_RUNNING_3,ENGINE_RUNNING_4,FLIGHT_PHASE_COUNT,FUEL_FLOW_1,FUEL_FLOW_2,...,HOUR,MINUTE,SECOND,CT_DIFF,LXT_DIFF,RXT_DIFF,FT1_DIFF,FT2_DIFF,FT3_DIFF,FT4_DIFF
0,2011-03-09 12:06:01,A400M-0002,V0136,True,True,True,True,,,,...,12,6,1,,,,,,,
1,2011-03-09 12:06:02,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,2,,,,,,,
2,2011-03-09 12:06:03,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011-03-09 12:06:04,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011-03-09 12:06:05,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
def plot_diff(): 
    #plot diff columns for dataset 0 for each unique date
    for date in datasets[0]['DATE'].unique():
        # create subplot 3 columns and len(datasets[0].unique) rows
        fig = sp.make_subplots(rows=3, cols=3)
        
        # size of fig
        fig.update_layout(height=1000, width=1000)
        
        diff_cols = ['CT_DIFF', 'LXT_DIFF', 'RXT_DIFF', 'FT1_DIFF', 'FT2_DIFF', 'FT3_DIFF', 'FT4_DIFF']
        
        # loop over the diff columns
        for i, col in enumerate(diff_cols):
            # add scatter plot to subplot
            fig.add_trace(
                go.Scatter(x=datasets[0][datasets[0]['DATE'] == date]['UTC_TIME'], 
                        y=datasets[0][datasets[0]['DATE'] == date][col], 
                        mode='markers',
                        name=col),
                row=i//3 + 1, 
                col=i%3 + 1
            )

            # update xaxis and yaxis titles
            fig.update_xaxes(title_text='UTC_TIME', row=i//3 + 1, col=i%3 + 1)
            fig.update_yaxes(title_text=col + " " + str(date), row=i//3 + 1, col=i%3 + 1)
            
        # save fig as png 
        fig.write_image("plots/diff_plots/" + str(date) + "_diff.png")

In [21]:
!docker run -p 8888:8888 pycaret/full

docker: Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?.
See 'docker run --help'.


In [23]:
# ideas
# 1. use lower bound and upper bound to detect outliers
# 2. use the bounds to plot the data and see if there are any outliers
# 3. deep learning auto detection anomaly detection

In [None]:
# use deep learning to detect anomalies
# https://www.analyticsvidhya.com/blog/2019/01/introduction-time-series-classification/
