# Importing packages

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import tensorflow as tf
import os
import glob

from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, mean_absolute_error, r2_score,plot_confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, train_test_split, KFold,GridSearchCV,RepeatedStratifiedKFold 
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_extraction.text import _VectorizerMixin

In [2]:
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [3]:
import warnings # Ignore alert messages
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
plt.rcParams.update({
    'font.size': 18,
    'axes.linewidth':2,
    'axes.titlesize': 20,
    'lines.linewidth' : 1.5,
    'lines.markersize' : 10,
    'figure.figsize': (25,10),
    'xtick.labelsize': 15, 
    'ytick.labelsize': 15,
    'font.family': 'Arial',
    'legend.fontsize':14,
    'axes.labelsize':20,
    'axes.grid':True,
})

# System description

An experimental rig to demonstrate filter clogging failure has been constructed and consists of the following major components: 

- Pump,
- Liquid tanks,
- Tank stirrer,
- Pulsation dampener,
- Filter,
- Pressure and flow rate sensors,
- Data acquisition system connected to a computer.

The experiment rig is a circuit composed of a pump flowing a liquid from a tank to another through a filter. The circuit is instrumented with sensors able to monitor the following variables:

- Flow rate,
- Liquid pressure before the filter 
- Liquid pressure after the filter. 
  
The fluid injected in the system is a suspension composed by Polyetheretherketone (PEEK) particles and water with different concentrations. To eliminate possible pulsations in the flow the circuit includes a dampener. The figures below depict the employed experimental rig as well as the filter.

![title](Figures/Experimental_rig.png) ![title](Figures/Filter.png)

The main components applied to construct this experimental rig are the following:

- **Pump:** Since the system will involve contaminants in the fluid, a peristaltic pump has been used as its mechanism is more tolerant to particles in the liquid. A Masterflex® SN-77921-70 (Drive: 07523-80, Two Heads: 77200-62, Tubing: L/S© 24) model peristaltic pump was installed in the system to maintain the flow of the prepared suspension. The pump is a positive displacement source, providing a flow rate ranging from 0.28 to 1700 ml/min (i.e. from 0.1 to 600 RPM).
  
- **Dampener:** The aim of using rigid tubing is to prevent the system from unwanted tube expansion due to pressure build up, which affects the actual pressure build up generated from filter clogging. A Masterflex® pulse dampener is installed on the downstream side of pump to eliminate any pulsation in flow.A majority of the system is furnished with a rigid polypropylene tubing, whereas the pump side is covered with a flexible Tygon® LFL pump tubing.

- **Particles:** The suspension is composed of Polyetheretherketone (PEEK) particles and water. PEEK particles have a density (1.3g/cm3) close to that of room temperature water and have significantly low water absorption level (0.1% / 24 hours, ASTM D570). Having a low water absorption level will prevent particles expanding when they mix with water. Subsequently, closer density with water allows particles to suspend longer in water.

- **Flow Rate Sensor:** A GMAG100 series electromagnetic flow meter (measurement range: 3 – 25,000 millilitres per minute) is installed in the system to keep track of the flow rate in the system.
  
- **Pressure Sensors:** Upstream and downstream Ashcroft® G2 pressure transducers (measurement range: 0 – 100 PSI) are installed in the system to capture the pressure drop (i.e. ‘ΔP’) across the filter, which is considered as the main indicator of clogging.
  
- **Filter:** the filter has a pore mesh size of 125µm.

# Objective

-  Predict when the filter is fully clogged; 
-  Determine the remaining useful life (RUL) of the filter;
-  Assess model performance for decreasing dataset sizes (100%,75%,50%,25% of all experiments to create the model);
-  Assess the reliability of the RUL metric with a confidence interval (extra)

# Importing datasets

In [5]:
def read_excel_files(rel_path,initial_index):
    path = os.getcwd()
    os.chdir(path + rel_path) # Change directory to the folder containing the excel sheets
    new_path = os.getcwd()
    print("Directory changed successfully to", new_path)
    
    csv_files = glob.glob(os.path.join(new_path,"*.csv")) # List of all .csv files
    df = pd.DataFrame() # Creating empty dataframe to save all files
    
    for idx,file in enumerate(csv_files):
        sample_number = initial_index + idx
        
        aux = pd.read_csv(file)
        aux['Sample'] = sample_number
        df = df.append(aux,ignore_index=True)
    
    os.chdir(path)
    return df  

## Training dataset

In [6]:
df_small = read_excel_files(rel_path='\Training\Small',initial_index=1)
df_small  

Directory changed successfully to c:\Users\arthu\Documents\Data Challenges\PHM July 2020 - 5th european conference\PHM-July-2020-Data-Challenge\Training\Small


Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample
0,0.0,9.935067,-0.30000,-1.10625,1
1,0.1,4.110983,0.61250,-0.34375,1
2,0.2,5.098116,1.05625,-0.08125,1
3,0.3,10.231207,1.20000,0.84375,1
4,0.4,8.355655,0.93125,0.79375,1
...,...,...,...,...,...
35639,254.5,284.061899,10.38750,0.45000,12
35640,254.6,278.830095,10.68750,0.40000,12
35641,254.7,270.735604,9.50000,-0.93125,12
35642,254.8,189.000992,8.36250,-0.85000,12


In [7]:
df_large = read_excel_files(rel_path='\Training\Large',initial_index=33) 
df_large

Directory changed successfully to c:\Users\arthu\Documents\Data Challenges\PHM July 2020 - 5th european conference\PHM-July-2020-Data-Challenge\Training\Large


Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample
0,0.0,9.244074,-0.76875,-0.58750,33
1,0.1,1.741863,0.25625,-0.66250,33
2,0.2,6.282675,0.66250,0.43750,33
3,0.3,10.231207,-0.90625,-0.72500,33
4,0.4,1.149584,0.55000,-0.35625,33
...,...,...,...,...,...
27175,218.5,208.447512,16.95000,0.34375,44
27176,218.6,210.717918,17.22500,0.73750,44
27177,218.7,208.546225,17.39375,0.66250,44
27178,218.8,204.795120,17.17500,0.31875,44


In [8]:
df_train = pd.concat([df_small,df_large],axis=0,ignore_index=True)
df_train

Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample
0,0.0,9.935067,-0.30000,-1.10625,1
1,0.1,4.110983,0.61250,-0.34375,1
2,0.2,5.098116,1.05625,-0.08125,1
3,0.3,10.231207,1.20000,0.84375,1
4,0.4,8.355655,0.93125,0.79375,1
...,...,...,...,...,...
62819,218.5,208.447512,16.95000,0.34375,44
62820,218.6,210.717918,17.22500,0.73750,44
62821,218.7,208.546225,17.39375,0.66250,44
62822,218.8,204.795120,17.17500,0.31875,44


In [9]:
df_op = pd.read_excel('Training/Training Operation Profiles of Samples.xlsx')
df_op

Unnamed: 0,Sample,Particle Size (micron),Solid Ratio(%)
0,1,45-53,0.4
1,2,45-53,0.4
2,3,45-53,0.4
3,4,45-53,0.4
4,5,45-53,0.425
5,6,45-53,0.425
6,7,45-53,0.425
7,8,45-53,0.425
8,9,45-53,0.45
9,10,45-53,0.45


In [10]:
df_train_joined = pd.DataFrame(df_train.join(df_op.set_index('Sample'),on='Sample'))
df_train_joined

Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample,Particle Size (micron),Solid Ratio(%)
0,0.0,9.935067,-0.30000,-1.10625,1,45-53,0.40
1,0.1,4.110983,0.61250,-0.34375,1,45-53,0.40
2,0.2,5.098116,1.05625,-0.08125,1,45-53,0.40
3,0.3,10.231207,1.20000,0.84375,1,45-53,0.40
4,0.4,8.355655,0.93125,0.79375,1,45-53,0.40
...,...,...,...,...,...,...,...
62819,218.5,208.447512,16.95000,0.34375,44,63-75,0.45
62820,218.6,210.717918,17.22500,0.73750,44,63-75,0.45
62821,218.7,208.546225,17.39375,0.66250,44,63-75,0.45
62822,218.8,204.795120,17.17500,0.31875,44,63-75,0.45


## Validation set

In [11]:
df_small = read_excel_files(rel_path='\Validation\Small',initial_index=13)
df_large = read_excel_files(rel_path='\Validation\Large',initial_index=45) 
df_op = pd.read_excel('Validation/Validation Operation Profiles of Samples.xlsx')

df_val = pd.concat([df_small,df_large],axis=0,ignore_index=True)
df_val_joined = pd.DataFrame(df_val.join(df_op.set_index('Sample'),on='Sample'))
df_val_joined

Directory changed successfully to c:\Users\arthu\Documents\Data Challenges\PHM July 2020 - 5th european conference\PHM-July-2020-Data-Challenge\Validation\Small
Directory changed successfully to c:\Users\arthu\Documents\Data Challenges\PHM July 2020 - 5th european conference\PHM-July-2020-Data-Challenge\Validation\Large


Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample,Particle Size (micron),Solid Ratio(%)
0,0.0,7.862088,-0.01250,0.27500,13,45-53,0.475
1,0.1,3.913556,0.95000,-0.02500,13,45-53,0.475
2,0.2,7.763375,0.08125,0.37500,13,45-53,0.475
3,0.3,3.913556,0.88750,-0.06250,13,45-53,0.475
4,0.4,11.810620,-0.57500,-0.41250,13,45-53,0.475
...,...,...,...,...,...,...,...
17935,233.5,70.248893,4.57500,-1.24375,48,63-75,0.475
17936,233.6,70.347606,6.26875,0.71250,48,63-75,0.475
17937,233.7,69.261760,4.61875,-0.33750,48,63-75,0.475
17938,233.8,66.102935,4.68125,-1.33750,48,63-75,0.475


# Preprocessing

From the data challenge documentation, they say that the system fails when the pressure drop (upstream pressure - downstream pressure) is higher than 20 psi. In this scenario, the filter is clogged. Therefore, we must create this label:

- 1 = clogged
- 0 = not clogged

In [12]:
df_train_joined['Pressure drop (psi)'] = df_train_joined['Upstream_Pressure(psi)'] - df_train_joined['Downstream_Pressure(psi)']

df_train_joined['Clogged'] = 0
logic_vector = df_train_joined['Pressure drop (psi)'] > 20 # Returns True when the filter is clogged
df_train_joined['Clogged'][logic_vector] = 1
df_train_joined

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_joined['Clogged'][logic_vector] = 1


Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample,Particle Size (micron),Solid Ratio(%),Pressure drop (psi),Clogged
0,0.0,9.935067,-0.30000,-1.10625,1,45-53,0.40,0.80625,0
1,0.1,4.110983,0.61250,-0.34375,1,45-53,0.40,0.95625,0
2,0.2,5.098116,1.05625,-0.08125,1,45-53,0.40,1.13750,0
3,0.3,10.231207,1.20000,0.84375,1,45-53,0.40,0.35625,0
4,0.4,8.355655,0.93125,0.79375,1,45-53,0.40,0.13750,0
...,...,...,...,...,...,...,...,...,...
62819,218.5,208.447512,16.95000,0.34375,44,63-75,0.45,16.60625,0
62820,218.6,210.717918,17.22500,0.73750,44,63-75,0.45,16.48750,0
62821,218.7,208.546225,17.39375,0.66250,44,63-75,0.45,16.73125,0
62822,218.8,204.795120,17.17500,0.31875,44,63-75,0.45,16.85625,0


In [13]:
df_val_joined['Pressure drop (psi)'] = df_val_joined['Upstream_Pressure(psi)'] - df_val_joined['Downstream_Pressure(psi)']

df_val_joined['Clogged'] = 0
logic_vector = df_val_joined['Pressure drop (psi)'] > 20 # Returns True when the filter is clogged
df_val_joined['Clogged'][logic_vector] = 1
df_val_joined

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val_joined['Clogged'][logic_vector] = 1


Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample,Particle Size (micron),Solid Ratio(%),Pressure drop (psi),Clogged
0,0.0,7.862088,-0.01250,0.27500,13,45-53,0.475,-0.28750,0
1,0.1,3.913556,0.95000,-0.02500,13,45-53,0.475,0.97500,0
2,0.2,7.763375,0.08125,0.37500,13,45-53,0.475,-0.29375,0
3,0.3,3.913556,0.88750,-0.06250,13,45-53,0.475,0.95000,0
4,0.4,11.810620,-0.57500,-0.41250,13,45-53,0.475,-0.16250,0
...,...,...,...,...,...,...,...,...,...
17935,233.5,70.248893,4.57500,-1.24375,48,63-75,0.475,5.81875,0
17936,233.6,70.347606,6.26875,0.71250,48,63-75,0.475,5.55625,0
17937,233.7,69.261760,4.61875,-0.33750,48,63-75,0.475,4.95625,0
17938,233.8,66.102935,4.68125,-1.33750,48,63-75,0.475,6.01875,0


In [18]:
df_train_joined.describe()

Unnamed: 0,Time(s),Flow_Rate(ml/m),Upstream_Pressure(psi),Downstream_Pressure(psi),Sample,Solid Ratio(%),Pressure drop (psi),Clogged
count,62824.0,62824.0,62824.0,62824.0,62824.0,62824.0,62824.0,62824.0
mean,134.234044,523.405825,5.683977,0.010881,20.116325,0.423567,5.673095,0.116166
std,81.47007,101.318216,8.28075,0.710498,16.319267,0.020362,8.26236,0.320426
min,0.0,-3.687368,-3.9,-3.48125,1.0,0.4,-1.3875,0.0
25%,65.4,535.583386,0.975,-0.626563,5.0,0.4,0.9625,0.0
50%,130.8,557.300312,1.9875,0.11875,11.0,0.425,1.85625,0.0
75%,196.3,563.22311,4.79375,0.6625,37.0,0.45,4.68125,0.0
max,353.9,931.325003,35.30625,1.43125,44.0,0.45,34.95625,1.0


# Saving dataset

In [19]:
df_train_joined.to_csv('Processed_data/df_train.csv.zip',index=False,compression="zip")
df_val_joined.to_csv('Processed_data/df_val.csv.zip',index=False,compression="zip")