In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
filenames = ["data/msn_02_fuel_leak_signals_preprocessed.csv", "data/msn_10_fuel_leak_signals_preprocessed.csv", "data/msn_11_fuel_leak_signals_preprocessed.csv", "data/msn_12_fuel_leak_signals_preprocessed.csv", "data/msn_14_fuel_leak_signals_preprocessed.csv", "data/msn_29_fuel_leak_signals_preprocessed.csv", "data/msn_37_fuel_leak_signals_preprocessed.csv", "data/msn_53_fuel_leak_signals_preprocessed.csv"]

datasets = []
for filename in filenames:
    datasets.append(pd.read_csv(filename, sep=";"))
    
print("Datasets loaded")

Datasets loaded


In [3]:
for dataset in datasets:
    print(dataset.head)

<bound method NDFrame.head of                    UTC_TIME         MSN Flight  ENGINE_RUNNING_1  \
0       2011-03-09 12:06:01  A400M-0002  V0136              True   
1       2011-03-09 12:06:02  A400M-0002  V0136             False   
2       2011-03-09 12:06:03  A400M-0002  V0136             False   
3       2011-03-09 12:06:04  A400M-0002  V0136             False   
4       2011-03-09 12:06:05  A400M-0002  V0136             False   
...                     ...         ...    ...               ...   
623575  2011-03-10 17:27:23  A400M-0002  V0137             False   
623576  2011-03-10 17:27:24  A400M-0002  V0137             False   
623577  2011-03-10 17:27:25  A400M-0002  V0137             False   
623578  2011-03-10 17:27:26  A400M-0002  V0137             False   
623579  2011-03-10 17:27:27  A400M-0002  V0137             False   

        ENGINE_RUNNING_2  ENGINE_RUNNING_3  ENGINE_RUNNING_4  \
0                   True              True              True   
1                  False 

In [4]:
for dataset in datasets:
    for feature in dataset.columns:
        print(feature, dataset[feature].unique())

UTC_TIME ['2011-03-09 12:06:01' '2011-03-09 12:06:02' '2011-03-09 12:06:03' ...
 '2011-03-10 17:27:25' '2011-03-10 17:27:26' '2011-03-10 17:27:27']
MSN ['A400M-0002']
Flight ['V0136' 'V0133' 'V0926' 'V0837' 'V0626' 'V0929' 'V0095' 'V0623' 'V0965'
 'V0344' 'V0346' 'V0835' 'V0624' 'V0097' 'V0962' 'V0098' 'V0134' 'V0889'
 'V0348' 'V0140' 'V0135' 'V0099' 'V0925' 'V0834' 'V0625' 'V0963' 'V0886'
 'V0966' 'V0347' 'V0833' 'V0622' 'V0964' 'V0884' 'V0138' 'V0345' 'V0890'
 'V0836' 'V0928' 'V0094' 'V0132' 'V0927' 'V0137']
ENGINE_RUNNING_1 [ True False]
ENGINE_RUNNING_2 [ True False]
ENGINE_RUNNING_3 [ True False]
ENGINE_RUNNING_4 [ True False]
FLIGHT_PHASE_COUNT [nan  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
FUEL_FLOW_1 [      nan   0.      157.1401  ... 353.5194  353.104    58.49746]
FUEL_FLOW_2 [     nan   0.     161.2566 ... 346.3994 347.0245 346.8476]
FUEL_FLOW_3 [     nan   0.     159.3162 ... 348.0053 347.6331 348.707 ]
FUEL_FLOW_4 [     nan   0.     158.6913 ... 346.9147 345.3615 346

In [5]:
# Count NaN values
for dataset in datasets:
    # print percentage of NaN values if more than 0.8
    for column in dataset.columns:
        if dataset[column].isna().sum() > 0.7 * len(dataset[column]):
            print("NaN percentage in column " + column + ": " + str(dataset[column].isna().sum() / len(dataset[column])))
    

NaN percentage in column APU_FUEL_FLOW_REQUEST_SIGNAL_1: 0.7257256486737869
NaN percentage in column FUEL_USED_2: 0.9045414327311336
NaN percentage in column FUEL_USED_3: 0.9019706890172294
NaN percentage in column FUEL_USED_4: 0.9102733225012468
NaN percentage in column FUEL_USED_1: 0.9156400315310242
NaN percentage in column FUEL_USED_2: 0.9270844741193431
NaN percentage in column FUEL_USED_3: 0.9290465512505408
NaN percentage in column FUEL_USED_4: 0.9346630335063438
NaN percentage in column FUEL_USED_1: 0.9260196158341397
NaN percentage in column FUEL_USED_2: 0.9617719074387006
NaN percentage in column FUEL_USED_3: 0.9619637376280304
NaN percentage in column FUEL_USED_4: 0.9621805088211096
NaN percentage in column FUEL_USED_1: 0.9603653579926987
NaN percentage in column FUEL_USED_2: 0.9420524874741246
NaN percentage in column FUEL_USED_3: 0.9419641443113576
NaN percentage in column FUEL_USED_4: 0.9424914021632871
NaN percentage in column FUEL_USED_1: 0.942016288324503
NaN percentag

In [6]:
# Correlation matrix for all datasets numercial values
#for dataset in datasets:
    #print(dataset.corr())


In [7]:
# draw plots for all datasets for column LEAK_DETECTION_LEAK_FLOW
for i in range(len(datasets)):
    print(filenames[i], datasets[i].shape)

data/msn_02_fuel_leak_signals_preprocessed.csv (623580, 111)
data/msn_10_fuel_leak_signals_preprocessed.csv (621610, 17)
data/msn_11_fuel_leak_signals_preprocessed.csv (4455992, 17)
data/msn_12_fuel_leak_signals_preprocessed.csv (3247664, 17)
data/msn_14_fuel_leak_signals_preprocessed.csv (4640993, 17)
data/msn_29_fuel_leak_signals_preprocessed.csv (4129447, 17)
data/msn_37_fuel_leak_signals_preprocessed.csv (3236645, 17)
data/msn_53_fuel_leak_signals_preprocessed.csv (3034227, 17)


In [8]:
# draw plots for all datasets for columns VALUE_FUEL_QTY_FT1;VALUE_FUEL_QTY_FT2;VALUE_FUEL_QTY_FT3;VALUE_FUEL_QTY_FT4;VALUE_FUEL_QTY_LXT;VALUE_FUEL_QTY_RXT
#for dataset in datasets:
 #   dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT1')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT2')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT3')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT4')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_LXT')
    #dataset.plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_RXT')
    

In [9]:
#datasets[0].plot.scatter(x='UTC_TIME', y='VALUE_FUEL_QTY_FT1')

In [10]:
# convert UTC_TIME to datetime
for dataset in datasets:
    dataset['UTC_TIME'] = pd.to_datetime(dataset['UTC_TIME'], format='%Y-%m-%d %H:%M:%S.%f')


In [11]:
for dataset in datasets:
    #introduce column date
    dataset['DATE'] = dataset['UTC_TIME'].dt.date
    dataset["MONTH"] = dataset['UTC_TIME'].dt.month
    dataset["DAY"] = dataset['UTC_TIME'].dt.day
    dataset["HOUR"] = dataset['UTC_TIME'].dt.hour
    dataset["MINUTE"] = dataset['UTC_TIME'].dt.minute
    dataset["SECOND"] = dataset['UTC_TIME'].dt.second

In [12]:
import plotly.subplots as sp
import plotly.graph_objs as go

def plot_datasets():
    # List of columns to plot for VALUE_FUEL_QTY_CT;VALUE_FUEL_QTY_FT1;VALUE_FUEL_QTY_FT2;VALUE_FUEL_QTY_FT3;VALUE_FUEL_QTY_FT4;VALUE_FUEL_QTY_LXT;VALUE_FUEL_QTY_RXT
    fuel_qty_cols = ['VALUE_FUEL_QTY_CT', 'VALUE_FUEL_QTY_FT1', 'VALUE_FUEL_QTY_FT2', 'VALUE_FUEL_QTY_FT3', 'VALUE_FUEL_QTY_FT4', 'VALUE_FUEL_QTY_LXT', 'VALUE_FUEL_QTY_RXT']

    # Loop over every unique date
    for date in datasets[0]['DATE'].unique():
        # Create a subplot
        fig = sp.make_subplots(rows=3, cols=3)
        
        # size of fig 
        fig.update_layout(height=1400, width=1400)

        # Loop over each column
        for i, col in enumerate(fuel_qty_cols):
            # Add scatter plot to subplot
            fig.add_trace(
                go.Scatter(x=datasets[0][datasets[0]['DATE'] == date]['UTC_TIME'], 
                        y=datasets[0][datasets[0]['DATE'] == date][col], 
                        mode='markers',
                        name=col),
                row=i//3 + 1, 
                col=i%3 + 1
            )

            # Update xaxis and yaxis titles
            fig.update_xaxes(title_text='UTC_TIME', row=i//3 + 1, col=i%3 + 1)
            fig.update_yaxes(title_text=col + " " + str(date), row=i//3 + 1, col=i%3 + 1)

        # Show the plot
        #fig.show()
        
        # save the plot as png
        fig.write_image("plots/" + str(date) + ".png")

In [13]:
datasets[0][datasets[0]['DATE'] == "2011-03-10"]

Unnamed: 0,UTC_TIME,MSN,Flight,ENGINE_RUNNING_1,ENGINE_RUNNING_2,ENGINE_RUNNING_3,ENGINE_RUNNING_4,FLIGHT_PHASE_COUNT,FUEL_FLOW_1,FUEL_FLOW_2,...,STATUS_OVERFLOW_LST,STATUS_OVERFLOW_RST,VALUE_FUEL_VOL_LST,VALUE_FUEL_VOL_RST,DATE,MONTH,DAY,HOUR,MINUTE,SECOND


In [14]:
# from dataset 0 show 2011-03-10
datasets[0][datasets[0]['DATE'] == "2011-03-10"]

Unnamed: 0,UTC_TIME,MSN,Flight,ENGINE_RUNNING_1,ENGINE_RUNNING_2,ENGINE_RUNNING_3,ENGINE_RUNNING_4,FLIGHT_PHASE_COUNT,FUEL_FLOW_1,FUEL_FLOW_2,...,STATUS_OVERFLOW_LST,STATUS_OVERFLOW_RST,VALUE_FUEL_VOL_LST,VALUE_FUEL_VOL_RST,DATE,MONTH,DAY,HOUR,MINUTE,SECOND


In [15]:
#!pip install lazypredict
#!pip install pyforest
#!pip install pycaret

In [16]:
import pyforest
lazy_imports()

['from sklearn.linear_model import LinearRegression',
 'from sklearn.linear_model import Ridge',
 'from sklearn import svm',
 'from sklearn.model_selection import KFold',
 'from sklearn.feature_extraction.text import CountVectorizer',
 'from scipy import signal as sg',
 'from sklearn.model_selection import cross_val_score',
 'from fbprophet import Prophet',
 'from sklearn.preprocessing import PolynomialFeatures',
 'from sklearn.model_selection import train_test_split',
 'import sklearn',
 'import skimage',
 'from sklearn.model_selection import RandomizedSearchCV',
 'import spacy',
 'import fastai',
 'import pandas as pd',
 'import datetime as dt',
 'from dask import dataframe as dd',
 'import lightgbm as lgb',
 'from pyspark import SparkContext',
 'import textblob',
 'import re',
 'from xlrd import open_workbook',
 'import cv2',
 'import altair as alt',
 'import statsmodels.api as sm',
 'from sklearn import metrics',
 'import plotly.express as px',
 'import fbprophet',
 'from sklearn.p

In [17]:
# introduce new columns for each dataset [CT_DIFF, LXT_DIFF, RXT_DIFF, FT1_DIFF, FT2_DIFF, FT3_DIFF, FT4_DIFF]
# value represents the difference between the current and the previous value
for dataset in datasets:
    dataset['CT_DIFF'] = dataset['VALUE_FUEL_QTY_CT'].diff()
    dataset['LXT_DIFF'] = dataset['VALUE_FUEL_QTY_LXT'].diff()
    dataset['RXT_DIFF'] = dataset['VALUE_FUEL_QTY_RXT'].diff()
    dataset['FT1_DIFF'] = dataset['VALUE_FUEL_QTY_FT1'].diff()
    dataset['FT2_DIFF'] = dataset['VALUE_FUEL_QTY_FT2'].diff()
    dataset['FT3_DIFF'] = dataset['VALUE_FUEL_QTY_FT3'].diff()
    dataset['FT4_DIFF'] = dataset['VALUE_FUEL_QTY_FT4'].diff()  

In [18]:
datasets[0].head()

Unnamed: 0,UTC_TIME,MSN,Flight,ENGINE_RUNNING_1,ENGINE_RUNNING_2,ENGINE_RUNNING_3,ENGINE_RUNNING_4,FLIGHT_PHASE_COUNT,FUEL_FLOW_1,FUEL_FLOW_2,...,HOUR,MINUTE,SECOND,CT_DIFF,LXT_DIFF,RXT_DIFF,FT1_DIFF,FT2_DIFF,FT3_DIFF,FT4_DIFF
0,2011-03-09 12:06:01,A400M-0002,V0136,True,True,True,True,,,,...,12,6,1,,,,,,,
1,2011-03-09 12:06:02,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,2,,,,,,,
2,2011-03-09 12:06:03,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011-03-09 12:06:04,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011-03-09 12:06:05,A400M-0002,V0136,False,False,False,False,1.0,0.0,0.0,...,12,6,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#plot diff columns for dataset 0 for each unique date
for date in datasets[0]['DATE'].unique():
    # create subplot 3 columns and len(datasets[0].unique) rows
    fig = sp.make_subplots(rows=3, cols=3)
    
    # size of fig
    fig.update_layout(height=1000, width=1000)
    
    diff_cols = ['CT_DIFF', 'LXT_DIFF', 'RXT_DIFF', 'FT1_DIFF', 'FT2_DIFF', 'FT3_DIFF', 'FT4_DIFF']
    
    # loop over the diff columns
    for i, col in enumerate(diff_cols):
        # add scatter plot to subplot
        fig.add_trace(
            go.Scatter(x=datasets[0][datasets[0]['DATE'] == date]['UTC_TIME'], 
                    y=datasets[0][datasets[0]['DATE'] == date][col], 
                    mode='markers',
                    name=col),
            row=i//3 + 1, 
            col=i%3 + 1
        )

        # update xaxis and yaxis titles
        fig.update_xaxes(title_text='UTC_TIME', row=i//3 + 1, col=i%3 + 1)
        fig.update_yaxes(title_text=col + " " + str(date), row=i//3 + 1, col=i%3 + 1)
        
    # save fig as png 
    fig.write_image("plots/diff_plots/" + str(date) + "_diff.png")