## Predict O2 output

Predict O2 output using different burner settings (for offline operation) and also using air and gas flow (for online operation). The initial predictions are made
from the burner settings, but in operation the amount of O2 can vary depending on how the gas and air flows are adjusted.

Import required packages:

In [None]:
import pandas as pd
import holoviews as hv
import hvplot.pandas
import data_processing_methods as dpm
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from matplotlib import pyplot as plt
import plotly.express as px

Read the file with CV furnace burner settings

In [None]:
# Load the burner-settings-vs-O2 table from the first worksheet and display it
burner_settings_df = pd.read_excel(
    io='Burners_settings_vs_O2 _rev1.xlsx',
    sheet_name='Sheet1',
)
burner_settings_df

Plot different furnace settings

In [None]:
# Quick-look interactive plot of the raw (un-standardised) burner settings table
burner_settings_df.hvplot()

Plot standardised furnace settings

In [None]:
# Reload the raw table so this cell always starts from the file contents
burner_settings_df = pd.read_excel('Burners_settings_vs_O2 _rev1.xlsx', sheet_name='Sheet1')
standardised_df = burner_settings_df.copy()

for col in burner_settings_df.columns:
    raw_column = burner_settings_df[col]
    column_std = np.std(raw_column)

    # A column whose spread is not strictly positive cannot be standardised: drop it
    if not (column_std > 0):
        standardised_df = standardised_df.drop(columns=col)
        continue

    # Standardise the column using its own mean and standard deviation
    standardised_df[col] = dpm.standardise(raw_column, np.mean(raw_column), column_std)

standardised_df.hvplot(height=500, width=1200, ylabel='Standardised furnace settings')

**Predict furnace O2 output**

After standardising the furnace settings, the correlation coefficients between all input parameters of the prediction model are evaluated so that near-duplicate inputs (correlation coefficient > 0.9) can be discarded.

In [None]:
# Find the correlation groups
X = standardised_df.drop(columns =['Output O2 / %', 'Output Burner usage / %'])
y = standardised_df['Output O2 / %']
corr_coef = X.corr(method='pearson')
corr_coef

In [None]:
# Remove the inputs that duplicate information already carried by another
# input (correlation coefficient above 0.9)
redundant_inputs = ['Burner turns from zero', 'Fan speed / RPM']
X = X.drop(columns=redundant_inputs)

# Train on every row from position 10 onwards; the first 10 rows are not used for fitting
X_train, y_train = X[10:], y[10:]
gpr = GaussianProcessRegressor()
gpr.fit(X_train, y_train)

# Predict over the complete input table (training rows included)
X_predict = X
y_predict = gpr.predict(X_predict)
X_predict

Plotting predicted and actual data

In [None]:
# Statistics of the raw O2 column, used to map standardised values back to O2 %
y_std = np.std(burner_settings_df['Output O2 / %'])
y_mean = np.mean(burner_settings_df['Output O2 / %'])
y_actual = y

sample_positions = range(len(y_actual))
predicted_o2 = y_predict*y_std+y_mean
actual_o2 = y_actual*y_std+y_mean

# Predictions: line plus markers
plt.plot(predicted_o2, label='predictions')
plt.ylabel('O2 %')
plt.scatter(sample_positions, predicted_o2)

# Measurements: line plus markers
plt.plot(sample_positions, actual_o2, label='actual')
plt.scatter(sample_positions, actual_o2)
plt.legend()

Test the model for given values of burner settings

In [None]:
# Predict O2 output for user defined input values
cols = list(X_train.columns)

# One value per model input, in the same order as `cols`
user_settings = [1100, -6, 80, 80, 96]
X_new = pd.DataFrame([user_settings], columns=cols)

# Standardise each input with the statistics of the original burner settings
X_new_pr = X_new.copy()
for col in cols:
    reference = burner_settings_df[col]
    X_new_pr[col] = dpm.standardise(X_new[col], np.mean(reference), np.std(reference))

# Predict in standardised space, then convert back to O2 %
y_new_predict = gpr.predict(X_new_pr)
Y_new = pd.DataFrame(y_new_predict*y_std+y_mean, columns=['Newly Predicted'])
Y_new

Generate random samples to simulate the developed ML model

In [None]:
# No. samples to be generated
N_samples = 1000
cols = list(X_train.columns)
D = len(cols)

# Seeded generator so that Restart & Run All reproduces the same samples
rng = np.random.default_rng(42)

# Sampling range of each model input: the min/max observed in the burner settings
lower_bounds = burner_settings_df[cols].min().to_numpy(dtype=float)
upper_bounds = burner_settings_df[cols].max().to_numpy(dtype=float)

# Un-standardised samples, shape (N_samples, D): column j is drawn uniformly
# between lower_bounds[j] and upper_bounds[j] (one vectorised draw instead of
# an N_samples x D Python loop)
X_samples_us = rng.uniform(lower_bounds, upper_bounds, size=(N_samples, D))

# Placeholder for the standardised samples, filled in by the next cell
X_samples = np.zeros([N_samples, D])

In [None]:
# Standardise every sampled input column with the mean / standard deviation
# of the matching column in the original burner settings
for j, input_name in enumerate(cols):
    observed = burner_settings_df[input_name]
    X_samples[:, j] = dpm.standardise(X_samples_us[:, j], np.mean(observed), np.std(observed))

# Wrap both sample sets in labelled frames
X_samples_df = pd.DataFrame(data=X_samples, columns=cols)
X_samples_us_df = pd.DataFrame(data=X_samples_us, columns=cols)

# Predict in standardised space, convert back to O2 % and store the result
# next to the un-standardised inputs
y_samples_predict = gpr.predict(X_samples_df)*y_std+y_mean
X_samples_us_df['Predicted O2 %'] = y_samples_predict

Plot predicted O2 output for generated random sample data and observe the general trend across the data distribution

In [None]:
# Find the mean predicted O2 in each temperature bin (binning data)
n_bins = 30
x_plot = X_samples_us_df['Temperature setpoint / degC']
y_plot = X_samples_us_df['Predicted O2 %']

# n_bins equally spaced edges, i.e. n_bins - 1 intervals spanning the sampled range
bins = np.linspace(np.min(x_plot), np.max(x_plot), n_bins)
n_intervals = len(bins) - 1

# Bin centres are used as the x-coordinate of the main-effect curve
main_effect_index = 0.5 * (bins[:-1] + bins[1:])

# Start from NaN so that an empty bin shows up as a gap in the curve rather
# than as a misleading value of 0
main_effect = np.full(n_intervals, np.nan)

for j in range(n_intervals):
    # Bins are half-open [lower, upper) so that a point sitting exactly on an
    # edge is counted once; the last bin is closed so the maximum is kept too
    above_lower = x_plot >= bins[j]
    if j == n_intervals - 1:
        below_upper = x_plot <= bins[j+1]
    else:
        below_upper = x_plot < bins[j+1]
    indx = np.logical_and(above_lower, below_upper)

    # Only compute mean if there are any points in bin
    if np.sum(indx) > 0:
        main_effect[j] = np.mean(y_plot[indx])

main_effect_df = pd.DataFrame({'index': main_effect_index, 'value': main_effect})

In [None]:
# Scatter of every random sample (predicted O2 against temperature setpoint)
fig_a = X_samples_us_df.hvplot.scatter(
    x='Temperature setpoint / degC',
    y='Predicted O2 %',
    height=500,
    width=1000,
    hover_cols='all',
)

# Binned mean (main effect) drawn in red
fig_b = main_effect_df.hvplot.line(
    x='index',
    y='value',
    xlim=[1100, 1150],
    color='red',
)

# Overlay the two plots and display the result
fig = fig_a*fig_b
fig

**Predict furnace O2 output using air and gas flow**

Sensor data relating to gas and air flow are filtered before being input to the prediction model. The data collected when the furnace is switched on are used for both model training and validation.

In [98]:
def _filter_sensor_group(sensor_df, prefix, wn):
    """Select the columns whose name starts with ``prefix`` and filter each one.

    Every selected column is passed through ``dpm.remove_spikes`` (with
    ``olr_def=1``) and the result is then passed through
    ``dpm.low_pass_filter`` with the given ``wn``.

    Args:
        sensor_df: DataFrame holding the resampled sensor channels.
        prefix: Column-name prefix identifying the channel group.
        wn: Value forwarded to ``dpm.low_pass_filter`` as ``wn``.

    Returns:
        Tuple ``(selected, filtered)``: the unfiltered column subset and a
        copy of it in which every column has been filtered.
    """
    selected = sensor_df.loc[:, sensor_df.columns.str.startswith(prefix)]
    filtered = selected.copy()
    for col in selected.columns:
        filtered.loc[:, col] = dpm.remove_spikes(selected.loc[:, col], olr_def=1)
        filtered.loc[:, col] = dpm.low_pass_filter(filtered.loc[:, col], wn=wn)
    return selected, filtered


# Read the merged sensor log once and average it onto a 1-minute grid
# ('1min' is the non-deprecated spelling of the '1T' frequency alias).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_merged = pd.read_pickle('merged_sensor_df.pkl')
df_merged = df_merged.resample('1min').mean()

# Air flow sensor data
df_air_merged_selected, df_air_merged_filtered = _filter_sensor_group(df_merged, 'AIR_', wn=0.1)

# Gas flow sensor data
df_gas_merged_selected, df_gas_merged_filtered = _filter_sensor_group(df_merged, 'GAS_', wn=0.1)

# Furnace O2 sensor data (filtered with a smaller wn than the flow channels)
df_O2_selected, df_O2_filtered = _filter_sensor_group(df_merged, 'FURNACE_0126_341_04_O2', wn=0.01)
cols = list(df_O2_selected.columns)

# Combine air, gas and O2 channels side by side
df_flow_combined = pd.concat(
    [df_air_merged_filtered, df_gas_merged_filtered, df_O2_filtered], axis=1
)

# Discard the first 30000 and the last 100 one-minute rows.
# NOTE(review): presumably this restricts the data to the furnace-on period - confirm.
df_flow_combined = df_flow_combined[30000:-100]
df_flow_combined

Unnamed: 0_level_0,AIR_0123_945_05_TE,AIR_PID_COMBUSTION_AIR_PV,AIR_PID_COMBUSTION_AIR_SP,AIR_PID_COMBUSTION_AIR_OUT,AIR_DRIVE_AIR_SPEED,AIR_0123_945_03_PT,GAS_0110_943_07_FT_Nm3_h,GAS_0110_943_13_TE,GAS_0110_943_14_PT,GAS_0110_943_07_FT_m3_h,FURNACE_0126_341_04_O2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-09-26 20:00:00,19.231325,0.348927,100.0,100.0,-1.302122e-146,0.350376,-3.292051e-151,21.198860,0.878209,8.980284e-06,21.099971
2022-09-26 20:01:00,19.223334,0.345659,100.0,100.0,-4.026817e-146,0.347079,-2.767297e-151,21.199728,0.876074,1.449711e-05,21.099971
2022-09-26 20:02:00,19.216547,0.342682,100.0,100.0,-7.281149e-146,0.343973,-1.730366e-151,21.200738,0.873738,1.713616e-05,21.099971
2022-09-26 20:03:00,19.211030,0.340028,100.0,100.0,-1.088192e-145,0.341110,-1.141162e-152,21.201569,0.871288,1.749097e-05,21.099970
2022-09-26 20:04:00,19.206695,0.337705,100.0,100.0,-1.453888e-145,0.338518,2.106065e-151,21.201908,0.868826,1.617703e-05,21.099970
...,...,...,...,...,...,...,...,...,...,...,...
2022-10-13 22:16:00,22.404314,1.080793,100.0,100.0,8.587402e-114,1.080553,-7.006328e-84,24.703576,1.614921,8.768753e-21,21.000434
2022-10-13 22:17:00,22.403138,1.081165,100.0,100.0,7.221662e-114,1.080635,-8.962140e-84,24.703268,1.612502,-6.649488e-21,21.000419
2022-10-13 22:18:00,22.401022,1.081387,100.0,100.0,5.609176e-114,1.080558,-9.634588e-84,24.702322,1.609873,-1.737708e-20,21.000405
2022-10-13 22:19:00,22.397809,1.081416,100.0,100.0,3.952707e-114,1.080296,-9.310162e-84,24.700571,1.607211,-2.375996e-20,21.000390


In [99]:
# Standardise input data
target_col = 'FURNACE_0126_341_04_O2'

standardised_df = df_flow_combined.copy()
for col in df_flow_combined.columns:

    if (np.std(df_flow_combined[col])> 0):
        # Standardise the channel with its own mean and standard deviation
        standardised_df[col] = dpm.standardise(df_flow_combined[col],np.mean(df_flow_combined[col]),np.std(df_flow_combined[col]))
    else:
        # A channel without a strictly positive spread cannot be standardised: drop it
        standardised_df = standardised_df.drop(columns=col)

# Target: standardised furnace O2 reading
y = standardised_df[target_col]

# Inputs: every remaining channel EXCEPT the target itself. Leaving the O2
# column in X would let the model read the answer directly from its inputs
# (target leakage) and make the validation plot meaningless.
X = standardised_df.drop(columns=[target_col])

Remove sensor channels that closely correlate with existing data channels

In [100]:
# Pairwise Pearson correlation between the filtered air channels.
# NOTE(review): this is computed on df_air_merged_filtered (the full record),
# not on the sliced df_flow_combined used for the model - confirm this is intended.
corr_coef = df_air_merged_filtered.corr(method='pearson')
corr_coef

Unnamed: 0,AIR_0123_945_05_TE,AIR_PID_COMBUSTION_AIR_PV,AIR_PID_COMBUSTION_AIR_SP,AIR_PID_COMBUSTION_AIR_OUT,AIR_DRIVE_AIR_SPEED,AIR_0123_945_03_PT
AIR_0123_945_05_TE,1.0,0.904505,-0.012509,-0.200774,0.860948,0.904539
AIR_PID_COMBUSTION_AIR_PV,0.904505,1.0,-0.018975,-0.329993,0.980417,0.999999
AIR_PID_COMBUSTION_AIR_SP,-0.012509,-0.018975,1.0,0.015881,-0.020769,-0.018973
AIR_PID_COMBUSTION_AIR_OUT,-0.200774,-0.329993,0.015881,1.0,-0.503803,-0.330005
AIR_DRIVE_AIR_SPEED,0.860948,0.980417,-0.020769,-0.503803,1.0,0.980417
AIR_0123_945_03_PT,0.904539,0.999999,-0.018973,-0.330005,0.980417,1.0


In [101]:
# Pairwise Pearson correlation between the filtered gas channels.
# NOTE(review): this is computed on df_gas_merged_filtered (the full record),
# not on the sliced df_flow_combined used for the model - confirm this is intended.
corr_coef = df_gas_merged_filtered.corr(method='pearson')
corr_coef

Unnamed: 0,GAS_0110_943_07_FT_Nm3_h,GAS_0110_943_13_TE,GAS_0110_943_14_PT,GAS_0110_943_07_FT_m3_h
GAS_0110_943_07_FT_Nm3_h,1.0,0.759686,0.904059,0.999954
GAS_0110_943_13_TE,0.759686,1.0,0.89468,0.762845
GAS_0110_943_14_PT,0.904059,0.89468,1.0,0.905818
GAS_0110_943_07_FT_m3_h,0.999954,0.762845,0.905818,1.0


In [102]:
# Drop columns with corr coefficients higher than 0.9
# NOTE(review): AIR_0123_945_05_TE and AIR_PID_COMBUSTION_AIR_PV are both kept
# although the table above reports a coefficient of 0.9045 between them - confirm.
# NOTE(review): X is overwritten here, so re-running this cell on its own
# presumably fails because the columns are already gone - re-run from the
# standardisation cell instead.
X = X.drop(columns =['AIR_DRIVE_AIR_SPEED', 'AIR_0123_945_03_PT', 'GAS_0110_943_14_PT', 'GAS_0110_943_07_FT_m3_h'])

In [None]:
# Interactive plot of the standardised channels remaining in X after the
# correlated columns were dropped
fig_c = X.hvplot(height = 500, width = 1600, line_width = 2, grid= True)
fig_c 

**Plot predicted and actual data**

The first half of the data is used for model training, while the rest is used for validation.

In [104]:
# Use the first half of the rows for training
n_train = len(X) // 2
X_train = X[:n_train]
y_train = y[:n_train]

# Statistics of the raw O2 signal, needed to convert standardised
# predictions back to O2 %
raw_o2 = df_flow_combined['FURNACE_0126_341_04_O2']
y_std = np.std(raw_o2)
y_mean = np.mean(raw_o2)

# Train Gaussian Process regression model
gpr = GaussianProcessRegressor()
gpr.fit(X_train, y_train)

In [107]:
# Make predictions with model trained using gas and air flows
X_predict = X
y_predict = gpr.predict(X_predict)

# Turn the index of y into a regular column so the timestamps can be attached
# to the results. Guarded so that re-running this cell does not reset the
# index a second time and add a stray 'index' column.
if isinstance(y, pd.Series):
    y = y.reset_index()

In [None]:
# Standardised O2 measurements, positionally aligned with y_predict
y_actual = y['FURNACE_0126_341_04_O2']

# Predicted and measured O2 (converted back to O2 %), indexed by timestamp
Y_df = (
    pd.DataFrame(y_predict*y_std+y_mean, columns=['Predicted'])
    .assign(Actual=y_actual*y_std+y_mean, Date=y['Date'])
    .set_index('Date')
)

# Keep only the second half of the rows, i.e. the part not used for training
Y_df = Y_df.iloc[len(X) // 2:]

fig_a = px.line(
    Y_df,
    labels={"value": "O2 output"},
    title='O2 predictions - (w.r.t gas and air flows)',
    height=600,
    width=920,
)
fig_a