In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the combined dataset from the Excel workbook and preview the first five rows.
df = pd.read_excel(io='/content/sample_data/New_combined_output.xlsx')
df.head(n=5)

Unnamed: 0,V,c_CO2,c_H,c_OH,i(H2),i(CO),i(HCOOH),i(CH2O),i(C2H4),i(CH3CH2OH),i(CH4),i(CH3OH),i(CH3COOH)
0,-1.176625,0.020325,1.44e-14,0.692,68246750.0,30199.868938,1592213.0,580.062113,1020.939809,10.910324,590524.3,6173.435869,15.30684
1,-1.011632,0.012978,2.39e-10,4.19e-05,999418.2,20699.209638,19049.0,10261.682628,46327.891091,5043.6707,133211.9,4204.610686,8.460137
2,-1.169984,0.001928,1.11e-08,8.98e-07,50255960.0,1.383354,111592.0,4007.85917,45546.866145,718.324513,3478274.0,38393.220988,4.562198e-08
3,-0.995469,0.03129,2.99e-08,3.35e-07,215572.4,1057.359578,10738.63,10127.89149,397387.153859,54516.296729,110511.6,3080.037751,0.02212183
4,-1.205031,0.009968,1.85e-15,5.41,80486470.0,10916.912214,887657.1,48.879726,12.972229,0.186377,97436.6,849.208067,2.353184


In [3]:
# Report, for every column, how many entries are exactly 0.
for col, missing in (df == 0).sum().items():
  print(f'{col} = {missing}')


V = 0
c_CO2 = 1
c_H = 0
c_OH = 0
i(H2) = 0
i(CO) = 1
i(HCOOH) = 1
i(CH2O) = 1
i(C2H4) = 1
i(CH3CH2OH) = 1
i(CH4) = 1
i(CH3OH) = 1
i(CH3COOH) = 16


In [6]:
# Disabled alternative: convert the 0 values to NaN (null) instead of replacing them with a small number.
# df['c_CO2'] = df['c_CO2'].replace(0, np.nan)
# df['i(CO)'] = df['i(CO)'].replace(0, np.nan)
# df['i(HCOOH)'] = df['i(HCOOH)'].replace(0, np.nan)
# df['i(CH2O)'] = df['i(CH2O)'].replace(0, np.nan)
# df['i(C2H4)'] = df['i(C2H4)'].replace(0, np.nan)
# df['i(CH3CH2OH)'] = df['i(CH3CH2OH)'].replace(0, np.nan)
# df['i(CH4)'] = df['i(CH4)'].replace(0, np.nan)
# df['i(CH3OH)'] = df['i(CH3OH)'].replace(0, np.nan)
# df['i(CH3COOH)'] = df['i(CH3COOH)'].replace(0, np.nan)

In [4]:
# Replace exact zeros with a tiny positive floor (1e-29) so that a log10
# transform of these columns is defined for every value.
# The floor is written as a plain literal: np.exp(-29 * np.log(10)) hides the
# intent and is subject to floating-point rounding. The result is reassigned
# instead of using inplace=True so the cell does not mutate the frame in place.
df = df.replace(0, 1e-29)

In [5]:
# Confirm that there are no 0's left in the dataset.
zero_counts = (df == 0).sum()
for col, missing in zero_counts.items():
  print(f'{col} = {missing}')

# Fail loudly rather than relying on someone reading the printout above.
assert not zero_counts.any(), f'zeros remain in: {zero_counts[zero_counts > 0].to_dict()}'


V = 0
c_CO2 = 0
c_H = 0
c_OH = 0
i(H2) = 0
i(CO) = 0
i(HCOOH) = 0
i(CH2O) = 0
i(C2H4) = 0
i(CH3CH2OH) = 0
i(CH4) = 0
i(CH3OH) = 0
i(CH3COOH) = 0


In [6]:
# Display the column labels of df.
df.columns

Index(['V', 'c_CO2', 'c_H', 'c_OH', 'i(H2)', 'i(CO)', 'i(HCOOH)', 'i(CH2O)',
       'i(C2H4)', 'i(CH3CH2OH)', 'i(CH4)', 'i(CH3OH)', 'i(CH3COOH)'],
      dtype='object')

In [8]:
# Preview the first rows of df.
df.head()

Unnamed: 0,V,c_CO2,c_H,c_OH,i(H2),i(CO),i(HCOOH),i(CH2O),i(C2H4),i(CH3CH2OH),i(CH4),i(CH3OH),i(CH3COOH)
0,-1.176625,0.020325,1.44e-14,0.692,68246750.0,30199.868938,1592213.0,580.062113,1020.939809,10.910324,590524.3,6173.435869,15.30684
1,-1.011632,0.012978,2.39e-10,4.19e-05,999418.2,20699.209638,19049.0,10261.682628,46327.891091,5043.6707,133211.9,4204.610686,8.460137
2,-1.169984,0.001928,1.11e-08,8.98e-07,50255960.0,1.383354,111592.0,4007.85917,45546.866145,718.324513,3478274.0,38393.220988,4.562198e-08
3,-0.995469,0.03129,2.99e-08,3.35e-07,215572.4,1057.359578,10738.63,10127.89149,397387.153859,54516.296729,110511.6,3080.037751,0.02212183
4,-1.205031,0.009968,1.85e-15,5.41,80486470.0,10916.912214,887657.1,48.879726,12.972229,0.186377,97436.6,849.208067,2.353184


In [6]:
# log10-transform the concentration and current columns (V and c_CO2 are left out).
# These columns contain no negative values or zeros, and the log scale is easier to fit.
df1 = df[[
    'c_H', 'c_OH',
    'i(H2)', 'i(CO)', 'i(HCOOH)', 'i(CH2O)', 'i(C2H4)',
    'i(CH3CH2OH)', 'i(CH4)', 'i(CH3OH)', 'i(CH3COOH)',
]].pipe(np.log10)
df1.head()

Unnamed: 0,c_H,c_OH,i(H2),i(CO),i(HCOOH),i(CH2O),i(C2H4),i(CH3CH2OH),i(CH4),i(CH3OH),i(CH3COOH)
0,-13.841638,-0.159894,7.834082,4.480005,6.202001,2.763475,3.009,1.037838,5.771238,3.790527,1.184886
1,-9.621602,-4.377786,5.999747,4.315954,4.279872,4.011219,4.665843,3.702747,5.124543,3.623726,0.927377
2,-7.954677,-6.046724,7.701188,0.140933,5.047633,3.602912,4.658459,2.856321,6.541364,4.584255,-7.340826
3,-7.524329,-6.474955,5.333593,3.024223,4.030949,4.005519,5.599214,4.736526,5.043408,3.488556,-1.655179
4,-14.732828,0.733197,7.905723,4.0381,5.948245,1.689129,1.113015,-0.729608,4.988722,2.929014,0.371656


In [8]:
# Keep V and c_CO2 as they are (no log transform is applied to these two).
df2 = df.loc[:, ['V', 'c_CO2']]
df2.head()

Unnamed: 0,V,c_CO2
0,-1.176625,0.020325
1,-1.011632,0.012978
2,-1.169984,0.001928
3,-0.995469,0.03129
4,-1.205031,0.009968


In [11]:
# Put the untransformed columns (df2) and the log10 columns (df1) side by side;
# the inner join keeps only the row labels present in both frames.
df3 = pd.concat(objs=[df2, df1], axis='columns', join='inner')
df3

Unnamed: 0,V,c_CO2,c_H,c_OH,i(H2),i(CO),i(HCOOH),i(CH2O),i(C2H4),i(CH3CH2OH),i(CH4),i(CH3OH),i(CH3COOH)
0,-1.176625,0.020325,-13.841638,-0.159894,7.834082,4.480005,6.202001,2.763475,3.009000,1.037838,5.771238,3.790527,1.184886
1,-1.011632,0.012978,-9.621602,-4.377786,5.999747,4.315954,4.279872,4.011219,4.665843,3.702747,5.124543,3.623726,0.927377
2,-1.169984,0.001928,-7.954677,-6.046724,7.701188,0.140933,5.047633,3.602912,4.658459,2.856321,6.541364,4.584255,-7.340826
3,-0.995469,0.031290,-7.524329,-6.474955,5.333593,3.024223,4.030949,4.005519,5.599214,4.736526,5.043408,3.488556,-1.655179
4,-1.205031,0.009968,-14.732828,0.733197,7.905723,4.038100,5.948245,1.689129,1.113015,-0.729608,4.988722,2.929014,0.371656
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,-1.057962,0.027693,-13.412289,-0.588380,7.547761,3.750627,6.058675,1.580892,-0.510985,-3.416325,3.147666,1.536040,-0.203293
4996,-0.899450,0.009651,-13.217527,-0.782516,6.397673,0.014405,5.221892,-4.492082,-15.043827,-16.065500,-6.406296,-5.948313,-7.692118
4997,-1.206311,0.020114,-8.498941,-5.501689,7.384685,0.642325,5.770375,3.724694,5.695006,4.491143,7.120428,5.025766,-6.253326
4998,-1.023555,0.015282,-12.183759,-1.815309,7.322245,3.939841,5.644044,2.500412,1.169020,-1.757370,3.732387,2.161479,0.175137


In [12]:
# (number of rows, number of columns) of the combined frame.
df3.shape

(5000, 13)

In [15]:
# Check column by column whether any null value is present in the combined frame.
df3.isna().any()

V              False
c_CO2          False
c_H            False
c_OH           False
i(H2)          False
i(CO)          False
i(HCOOH)       False
i(CH2O)        False
i(C2H4)        False
i(CH3CH2OH)    False
i(CH4)         False
i(CH3OH)       False
i(CH3COOH)     False
dtype: bool

In [16]:
# Model inputs (X): the four operating-condition columns.
X = df3.loc[:, ['V', 'c_CO2', 'c_H', 'c_OH']]
# Model targets (y): the nine current columns.
y = df3.loc[:, [
    'i(H2)', 'i(CO)', 'i(HCOOH)', 'i(CH2O)', 'i(C2H4)',
    'i(CH3CH2OH)', 'i(CH4)', 'i(CH3OH)', 'i(CH3COOH)',
]]

In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
    test_size=0.2,
)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# A Pipeline chains the preprocessing and modelling steps into a single estimator,
# so scaling and fitting/predicting always happen together in one call.

# NOTE: the final step keeps its historical name 'lr_multi' for backward
# compatibility, but the estimator is a DecisionTreeRegressor.
# random_state is fixed so repeated runs build the same tree and report the same
# score; without it, ties between equally good splits are broken randomly.
steps = [('scale', StandardScaler()),
         ('lr_multi', DecisionTreeRegressor(random_state=7))]
pipe = Pipeline(steps)

In [19]:
# Fit the scaler and the regressor on the training split.
pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('scale', StandardScaler()),
                ('lr_multi', DecisionTreeRegressor())])

In [20]:
# Evaluate the fitted pipeline on the held-out split.
# NOTE(review): for a regressor this is presumably the R^2 score — confirm against
# the scikit-learn documentation. The targets are the log10-transformed currents,
# so the score is measured in log space, not on the raw current values.
pipe.score(x_test, y_test)

0.916478579338436

In [71]:
# Read one operating point from the user (values on their natural, linear scale)
# and predict the nine currents for it.
V = float(input('Enter value for V: '))
c_CO2 = float(input('Enter value for c_CO2: '))
c_H = float(input('Enter value for c_H: '))
c_OH = float(input('Enter value for c_OH: '))
print('\n')

# c_H and c_OH are log10-transformed below, so negative values are rejected here
# with a clear message instead of surfacing later as NaN inside the pipeline.
if c_H < 0 or c_OH < 0:
    raise ValueError('c_H and c_OH must be non-negative')

# Exact zeros get a 1e-29 floor, mirroring how zeros in the training data were
# replaced before the log10 transform.
log_c_H = np.log10(c_H if c_H != 0 else 1e-29)
log_c_OH = np.log10(c_OH if c_OH != 0 else 1e-29)

# One-row frame with the same column names and order the pipeline was fitted on.
# The training X / y are deliberately NOT reassigned here, so the split and fit
# cells can still be re-run after this cell.
X_refined = pd.DataFrame(
    [[V, c_CO2, log_c_H, log_c_OH]],
    columns=['V', 'c_CO2', 'c_H', 'c_OH'],
)

log_predictions = pipe.predict(X_refined)

# The model was trained on log10 currents, so raise 10 to the prediction to get
# back to the original scale.
final_val = np.power(10, log_predictions).tolist()
outputs = ['i(H2)', 'i(CO)', 'i(HCOOH)', 'i(CH2O)', 'i(C2H4)', 'i(CH3CH2OH)', 'i(CH4)', 'i(CH3OH)', 'i(CH3COOH)']

print('\n')
print('The resulted output are:')
print('\n')
for name, val in zip(outputs, final_val[0]):
    print(f'{name}: {val}')


Enter value for V: -1.25
Enter value for c_CO2: 0.0147	
Enter value for c_H: 1.58E-11
Enter value for c_OH: 0.000630957	




The resulted output are:
i(H2): 58966482.75212392
i(CO): 26.798609261077793
i(HCOOH): 1109054.4768113394
i(CH2O): 2311.90339599914
i(C2H4): 173862.58237026984
i(CH3CH2OH): 7840.147046642349
i(CH4): 17889700.341287516
i(CH3OH): 111348.87775722209
i(CH3COOH): 1.4788814093755498e-05




In [33]:
# Show rows 1 through 5 (by position) of the test CSV.
pd.read_csv(filepath_or_buffer='/content/sample_data/test.csv').iloc[1:6]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
1,V,c_CO2,c_H,c_OH,i_H2,i_CO,i_HCOOH,i_CH2O,i_C2H4,i_EtOH,i_CH4,i_MeOH,i_CH3COOH
2,-0.9389,0.022,1.58E-06,6.31E-09,109420.369,135.9810772,7050.258795,9183.984601,222515.0568,20267.67158,23942.23391,1086.738644,0.000369545
3,-0.85,0.0183,1.58E-07,6.31E-08,135550.1842,14968.58045,29150.30674,6249.848805,29283.00489,873.56047,1322.923437,325.5184107,4.424194004
4,-1.25,0.0147,1.58E-11,0.000630957,5.91E+07,52.08500573,1122330.076,2295.716675,174444.3852,7880.174554,18004289.19,111654.3742,5.48E-05
5,-1.2056,0.0073,1.58E-14,0.630957344,89662483.8,10120.48294,789723.1271,424.0354243,1122.437305,16.87556465,1060443.233,9002.191475,2.02236962
