# Dependencies

In [None]:
import os
import datetime as dt

# Report the interpreter that is running this kernel. A `!python` shell call
# resolves `python` through PATH, which may be a different installation than
# the one executing the notebook.
import platform

print("Python " + platform.python_version())

Python 3.7.12


In [None]:
!pip install lightgbm==3.2.1

Collecting lightgbm==3.2.1
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 33.2 MB/s 
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
!pip install xgboost==1.4.2

Collecting xgboost==1.4.2
  Downloading xgboost-1.4.2-py3-none-manylinux2010_x86_64.whl (166.7 MB)
[K     |████████████████████████████████| 166.7 MB 51.2 MB/s 
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
import sklearn
import joblib
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor, LinearRegression, ElasticNet
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.inspection import permutation_importance
import lightgbm as lgbm
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


# Record the versions of the core libraries in requirements-file notation.
print(f"numpy=={np.__version__}")
print(f"pandas=={pd.__version__}")
print(f"scikit-learn=={sklearn.__version__}")
print(f"joblib=={joblib.__version__}")

# Baseline scikit-learn regressors as (label, estimator) pairs, all with their
# default hyper-parameters. Stored in a list rather than a set: a set has no
# defined iteration order, so the candidates could be visited in a different
# order from one kernel session to the next.
# NOTE(review): no cell visible in this notebook iterates over REGRESSORS.
REGRESSORS = [
    #('KNeighborsRegressor', KNeighborsRegressor()),
    ('LinearRegression', LinearRegression()),
    ('HuberRegressor', HuberRegressor()),
    ('ElasticNet', ElasticNet()),
    ('LinearSVR', LinearSVR()),
    #('SVR', SVR()),
    #('NuSVR', NuSVR()),
    ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ('AdaBoostRegressor', AdaBoostRegressor()),
    #('GaussianProcessRegressor', GaussianProcessRegressor()),
    #('MLPRegressor', MLPRegressor((16,8,4))),
]

numpy==1.19.5
pandas==1.2.5
scikit-learn==1.0
joblib==1.1.0


In [None]:


# Hyper-parameter grid searched for the LightGBM regressor:
# 6 * 3 * 3 * 4 = 216 combinations.
params = dict(
    num_leaves=[7, 14, 21, 28, 31, 50],
    learning_rate=[0.1, 0.03, 0.003],
    max_depth=[-1, 3, 5],
    n_estimators=[50, 100, 200, 500],
)

# Exhaustive search over `params`, scored by R2 with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=lgbm.LGBMRegressor(random_state=0),
    param_grid=params,
    scoring='r2',
    cv=5,
)

In [None]:
# Candidate models compared in the cross-validation below, as (label, estimator)
# pairs. Kept in a list rather than a set so the candidates are always fitted
# and reported in this order; a set has no defined iteration order.
TREES = [
    ('Lightgbm', grid),
    ('XGBoost', xgb.XGBRegressor(objective='reg:squarederror')),
]

# Import datasets

In [None]:
# Locations of the pre-processed CSV files, relative to the working directory.
training_path = os.path.join(".", "processed", "train.csv")
pretesting_path = os.path.join(".", "processed", "test.csv")

print(f"Read the training dataset from {training_path}")
print(f"Read the pretesting dataset from {pretesting_path}")

Read the training dataset from ./processed/train.csv
Read the pretesting dataset from ./processed/test.csv


In [None]:
# Load the training table; the bare name on the last line renders a preview.
training_set = pd.read_csv(filepath_or_buffer=training_path)
training_set

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],...,etb,fof,nbo,pcu,pts,rht,sra,tbo,the,CO2_working_capacity [mL/g]
0,-0.041606,0.581323,-1.497285,-1.371381,-1.190670,1.864160,2.966228,3.563762,0.755278,1.568435,...,4.550317,-0.123032,-0.147367,-1.430295,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,101.224774
1,-0.724843,-0.672924,-1.268760,-1.261383,-1.070389,-0.686227,0.928205,0.320671,-0.254473,0.611158,...,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,118.987011
2,-0.271145,-0.209823,-0.461613,-0.554545,-0.551339,1.545362,0.465018,0.320671,0.198422,0.422076,...,-0.219765,-0.123032,-0.147367,-1.430295,-0.048779,-0.00657,2.695612,-0.019712,-0.023693,187.626004
3,0.440474,-0.001898,0.760820,1.001721,1.347692,1.545362,-0.461356,0.222396,-0.403419,-0.526174,...,-0.219765,-0.123032,-0.147367,-1.430295,-0.048779,-0.00657,2.695612,-0.019712,-0.023693,55.786959
4,0.282346,0.357211,-0.352239,0.619071,0.229946,1.864160,3.799964,3.367211,-0.317785,0.558331,...,4.550317,-0.123032,-0.147367,-1.430295,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,111.690462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46336,-0.686003,-0.518327,-0.646012,-0.203367,-0.726884,-0.686227,0.001831,-0.170706,-1.057365,-1.745722,...,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,7.483415
46337,0.002501,0.132100,0.298245,2.088561,0.889324,-0.367429,-0.368718,-0.662083,-1.090529,-1.830565,...,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,5.968178
46338,0.633005,0.362411,0.721057,3.160735,2.541291,-0.367429,-0.461356,-0.367257,-1.239194,-2.490474,...,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,6.346203
46339,7.802476,4.140956,2.051541,4.407480,6.311179,-1.005026,-0.461356,-0.170706,-1.286549,-3.235074,...,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693,-4.405398


In [None]:
# Load the pretesting table; the bare name on the last line renders a preview.
pretesting_set = pd.read_csv(filepath_or_buffer=pretesting_path)
pretesting_set

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],...,bcu,etb,fof,nbo,pcu,pts,rht,sra,tbo,the
0,-0.676648,-0.651973,-1.518383,-1.616025,-1.191212,-0.686227,-0.646631,0.517222,0.967949,1.398798,...,-0.22669,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
1,0.510338,1.908794,-1.189668,-0.600072,-0.983700,-0.048630,-0.553993,-0.367257,-0.315892,-0.860195,...,-0.22669,-0.219765,-0.123032,-0.147367,-1.430295,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
2,-0.516445,-0.696018,1.657145,0.674681,0.616254,-0.367429,-0.090806,-0.367257,-0.690711,-0.948818,...,-0.22669,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
3,-0.177007,-0.479194,2.108731,1.344955,1.476642,-0.686227,-1.017180,0.713773,-0.934242,-0.875957,...,-0.22669,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
4,0.102354,-0.181128,1.067237,1.191671,1.214408,-0.367429,-0.368718,-0.170706,-0.696895,-0.658708,...,-0.22669,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,12.117181,2.772305,4.826764,5.128679,15.755946,-0.367429,-0.831905,-0.662083,-1.290539,-3.222734,...,-0.22669,-0.219765,-0.123032,6.785772,-1.430295,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
16996,0.894561,-0.040184,2.722023,3.252400,4.778951,-0.367429,-1.017180,-0.956910,-1.135142,-2.609347,...,-0.22669,-0.219765,-0.123032,6.785772,-1.430295,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
16997,0.731370,-0.194638,3.057574,2.820353,4.579025,-0.686227,-0.461356,0.222396,-1.281423,-2.688173,...,-0.22669,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693
16998,0.736380,-0.289305,3.110845,3.400184,5.880443,-0.367429,-0.461356,0.418947,-1.270371,-2.854109,...,-0.22669,-0.219765,-0.123032,-0.147367,0.699156,-0.048779,-0.00657,-0.370973,-0.019712,-0.023693


# Matrix representation

In [None]:
# Column to predict; every other column of the training table is a feature.
TARGET = 'CO2_working_capacity [mL/g]'

# list.remove raises ValueError if the target column is missing from the table.
FEATURES = training_set.columns.tolist()
FEATURES.remove(TARGET)

(FEATURES, TARGET)

(['volume [A^3]',
  'weight [u]',
  'surface_area [m^2/g]',
  'void_fraction',
  'void_volume [cm^3/g]',
  'metal_linker',
  'organic_linker1',
  'organic_linker2',
  'CO2/N2_selectivity',
  'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
  'density [g/cm^3]',
  'acs',
  'bcu',
  'etb',
  'fof',
  'nbo',
  'pcu',
  'pts',
  'rht',
  'sra',
  'tbo',
  'the'],
 'CO2_working_capacity [mL/g]')

In [None]:
# Convert the training table to plain NumPy arrays: X holds the feature
# columns, y the target column.
X = training_set.loc[:, FEATURES].values
y = training_set.loc[:, TARGET].values

print(f"Features matirx size     : {X.shape}")
print(f"Target values vector size: {y.shape}")

Features matirx size     : (46341, 22)
Target values vector size: (46341,)


# Train models

## Cross-validation

In [None]:
# Seven shuffled folds. The shuffle is seeded so the same splits, and therefore
# the same scores, are produced on every run; with shuffle=True and no
# random_state the folds change each time the cell is executed.
kf = KFold(n_splits=7, shuffle=True, random_state=0)

# One (X_train, X_test, y_train, y_test) tuple per fold.
datasets = [
    (X[train_index], X[test_index], y[train_index], y[test_index])
    for train_index, test_index in kf.split(X, y)
]

In [None]:
# Fit every candidate on every fold and record, per (model, fold): the score on
# the training part, the score on the held-out part (labelled Train_R2 / R2 in
# the header), the held-out MAE and its natural logarithm.
result = []
print("name, fold, Train_R2, R2, MAE, logMAE")
for name, reg in TREES:
    for fold, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
        reg.fit(X_train, y_train)
        train_r2 = reg.score(X_train, y_train)
        test_r2 = reg.score(X_test, y_test)
        mae = mean_absolute_error(y_test, reg.predict(X_test))
        log_mae = np.log(mae)
        print(f"{name}, {fold}, {train_r2}, {test_r2}, {mae}, {log_mae}")
        result.append((name, fold, train_r2, test_r2, mae, log_mae))
        # Drop the per-fold names so nothing from this fold lingers in the
        # notebook namespace.
        del X_train, X_test, y_train, y_test
        del train_r2, test_r2, mae, log_mae
    del reg

name, fold, Train_R2, R2, MAE, logMAE


Lightgbm, 1, 0.9074434387856759, 0.880163130233538, 19.40381257380002, 2.9654695711835695
Lightgbm, 2, 0.9150220965312452, 0.8842035223979562, 19.15913124860588, 2.9527794295839387
Lightgbm, 3, 0.9092292150829823, 0.8861991627911516, 19.238382225929758, 2.9569073578109273
Lightgbm, 4, 0.9098832105071544, 0.8865857272937832, 19.193961685380135, 2.9545957340165203
Lightgbm, 5, 0.9073216400670026, 0.8810955735471171, 19.300598013940824, 2.9601360806090904
Lightgbm, 6, 0.9074409433607222, 0.881889370303324, 19.502565212126683, 2.9705460062576754
Lightgbm, 7, 0.9209598772823657, 0.8754878349686593, 19.423698450028194, 2.9664938901493
XGBoost, 1, 0.9268957041084382, 0.870300659672838, 19.85884442495116, 2.9886494708693268
XGBoost, 2, 0.9268362489924182, 0.8726638809808779, 19.970708936207032, 2.994266646858054
XGBoost, 3, 0.9268214037984858, 0.8798847557372329, 19.726547351435755, 2.981965309862331
XGBoost, 4, 0.9274591655108997, 0.8758526101697156, 19.78024490643966, 2.9846837076366044
XGBo

## Model evaluation

In [None]:
# One row per (model, fold) with the scores collected during cross-validation.
df = pd.DataFrame(
    data=result,
    columns=["name", "fold", "Train_R2", "R2", "MAE", "logMAE"],
)
df

Unnamed: 0,name,fold,Train_R2,R2,MAE,logMAE


In [None]:
# Average every metric over the folds, giving one row per model.
sum_df = (
    df
    .drop(columns=["fold"])
    .groupby("name")
    .mean()
)
sum_df

DataError: No numeric types to aggregate

In [None]:
# The model with the lowest mean log(MAE) across the folds is selected.
best_model = sum_df.loc[:, "logMAE"].idxmin()
best_model

NameError: name 'sum_df' is not defined

## Train the final model

In [None]:
# Look up the estimator whose label matches the selected model; `reg` stays
# None when no label matches.
reg = None
for name, model in TREES:
    if name != best_model:
        continue
    reg = model

NameError: name 'best_model' is not defined

In [None]:
# Refit the selected model on the complete training set. The check is an
# explicit identity test: truth-testing an object calls __len__ when its class
# defines one (scikit-learn ensembles and pipelines do), which can raise or
# evaluate to False for a perfectly valid estimator.
if reg is not None:
    reg.fit(X, y)

In [None]:
# Fit the grid search on the complete training set, unless it is the selected
# model itself: in that case the preceding cell has just fitted it on the same
# data, and repeating the exhaustive search would only redo every fit.
if reg is not None and reg is not grid:
    grid.fit(X, y)

# Save the final model

In [None]:
# Persist the final model with joblib.
# Refuse to write a file when no model was selected: dumping `None` succeeds
# silently and leaves behind a model file that contains no model.
if reg is None:
    raise RuntimeError(
        "No final model was selected; run the model selection cells before saving."
    )

name = "model.joblib"
model_dir = os.path.join(".", "models")
# dump() does not create missing directories.
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, name)
dump(reg, model_path)

['./models/model.joblib']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d59402b5-3ee5-4c9e-aee0-4349ccd0c385' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>