In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor

In [2]:
from sklearn.model_selection import GridSearchCV

# Dataset

In [3]:
filename = input('Training dataset filename: ')

In [4]:
df = pd.read_csv(f'../data/processed/train/{filename}.csv')
df.head()

Unnamed: 0,metal_linker,organic_linker1,organic_linker2,topology,Co2N2_selectivity,heat_adsorption,CO2_working_capacity,functional_groups1,functional_groups2
0,3,4,11,5,22.864166,6.786041,105.284502,10,9
1,10,44,57,2,33.61678,7.147286,101.224774,15,1
2,2,22,24,5,19.263726,6.347967,118.987011,1,10
3,9,17,24,8,25.701377,6.190085,187.626004,20,2
4,2,1,22,5,30.001838,6.478063,79.210001,7,5


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66524 entries, 0 to 66523
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   metal_linker          66524 non-null  int64  
 1   organic_linker1       66524 non-null  int64  
 2   organic_linker2       66524 non-null  int64  
 3   topology              66524 non-null  int64  
 4   Co2N2_selectivity     66524 non-null  float64
 5   heat_adsorption       66524 non-null  float64
 6   CO2_working_capacity  66524 non-null  float64
 7   functional_groups1    66524 non-null  int64  
 8   functional_groups2    66524 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 4.6 MB


In [6]:
df.columns

Index(['metal_linker', 'organic_linker1', 'organic_linker2', 'topology',
       'Co2N2_selectivity', 'heat_adsorption', 'CO2_working_capacity',
       'functional_groups1', 'functional_groups2'],
      dtype='object')

In [7]:
cat_cols = [
    'functional_groups1', 'functional_groups2', 'topology',
    'metal_linker', 'organic_linker1', 'organic_linker2'
]

In [8]:
samples = df.drop(columns='CO2_working_capacity')
labels = df['CO2_working_capacity']

In [9]:
X_train, X_test, y_train, y_test, = train_test_split(samples, labels, test_size=0.3, random_state=122)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=141)

In [10]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(46566, 8) (46566,)
(9979, 8) (9979,)
(9979, 8) (9979,)


# LGBM

In [11]:
from lightgbm import LGBMRegressor
import lightgbm as lgb

In [12]:
lgbm_model = LGBMRegressor(
    objective = 'rmse',
    num_threads = 8
)

In [13]:
param_lgbm_tune = {
    'max_depth': [5, 6, 7, 8],
    'num_leaves': [30, 50, 70, 100],
    'n_estimator': [3000, 2000]
}

In [14]:
lgbm_grid = GridSearchCV(lgbm_model, param_lgbm_tune, refit=True, verbose=0)

In [None]:
lgbm_grid.fit(
    X_train, y_train,
    eval_set = [(X_train, y_train), (X_val, y_val)],
    early_stopping_rounds = 10,
    feature_name = X_train.columns.to_list(),
    categorical_feature = cat_cols,
    eval_metric = 'mae'
)

In [16]:
lgbm_grid.best_estimator_

LGBMRegressor(max_depth=8, n_estimator=3000, num_leaves=100, num_threads=8,
              objective='rmse')

In [17]:
"""lgbm_model.fit(
    X_train, y_train,
    eval_set = [(X_train, y_train), (X_val, y_val)],
    early_stopping_rounds = 10,
    feature_name = X_train.columns.to_list(),
    categorical_feature = cat_cols,
    eval_metric = 'mae'
)"""

"lgbm_model.fit(\n    X_train, y_train,\n    eval_set = [(X_train, y_train), (X_val, y_val)],\n    early_stopping_rounds = 10,\n    feature_name = X_train.columns.to_list(),\n    categorical_feature = cat_cols,\n    eval_metric = 'mae'\n)"

In [18]:
lgbm_grid.best_estimator_.score(X_test, y_test)

0.8174459905215401

In [21]:
model_filename = input('')
lgbm_grid.best_estimator_.booster_.save_model(f'../models/lgbm/{model_filename}.txt')

<lightgbm.basic.Booster at 0x7f343a8e27f0>

# CatBoost

In [22]:
from catboost import CatBoostRegressor

In [23]:
catboost_model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='MAE'
)

In [24]:
param_catboost_tune = {
    'depth': [6, 7, 8, 9, 10],
    'iterations': [1000, 750, 500]
}

In [25]:
cat_result = catboost_model.grid_search(
    param_catboost_tune,
    X_train, y_train,
    cv=3, stratified=False
)

0:	learn: 120.6119497	test: 119.5924492	best: 119.5924492 (0)	total: 52.2ms	remaining: 52.1s
1:	learn: 117.0283720	test: 116.0200246	best: 116.0200246 (1)	total: 60ms	remaining: 30s
2:	learn: 113.6122643	test: 112.6217695	best: 112.6217695 (2)	total: 65ms	remaining: 21.6s
3:	learn: 110.2215142	test: 109.2456879	best: 109.2456879 (3)	total: 71.7ms	remaining: 17.9s
4:	learn: 106.9758832	test: 106.0124232	best: 106.0124232 (4)	total: 79.6ms	remaining: 15.8s
5:	learn: 103.8090403	test: 102.8634863	best: 102.8634863 (5)	total: 85.3ms	remaining: 14.1s
6:	learn: 100.8199702	test: 99.8910676	best: 99.8910676 (6)	total: 91.8ms	remaining: 13s
7:	learn: 97.8426906	test: 96.9288002	best: 96.9288002 (7)	total: 101ms	remaining: 12.5s
8:	learn: 94.9848743	test: 94.0821777	best: 94.0821777 (8)	total: 108ms	remaining: 11.9s
9:	learn: 92.2249066	test: 91.3312661	best: 91.3312661 (9)	total: 115ms	remaining: 11.4s
10:	learn: 89.6178369	test: 88.7315035	best: 88.7315035 (10)	total: 123ms	remaining: 11.1s
1

In [26]:
cat_result['params']

{'depth': 10, 'iterations': 1000}

In [27]:
catboost_model.score(X_test, y_test)

0.8139102937371244

In [28]:
model_filename = input('')
catboost_model.save_model(f'../models/catboost/{model_filename}.json', format='json')