In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.gaussian_process import GaussianProcessRegressor, kernels
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pickle
import optuna
import shap
from Utils import *
from matplotlib.offsetbox import AnchoredText

In [2]:
# Read the dataset and build the feature matrix via the Utils helper.
# NOTE(review): get_FP() presumably returns one fingerprint row per polymer — confirm in Utils.
data = pd.read_csv('../../data/poly_dataset_imputation.csv')
pre_data = polymer_data(data)
fp = pre_data.get_FP()

# One (n_samples, 1) column vector per target column, in the order listed.
y_He, y_H2, y_N2, y_O2, y_CO2, y_CH4 = (
    data[column].to_numpy().reshape(-1, 1)
    for column in (
        'log10_He_Bayes',
        'log10_H2_Bayes',
        'log10_N2_Bayes',
        'log10_O2_Bayes',
        'log10_CO2_Bayes',
        'log10_CH4_Bayes',
    )
)

# Feature labels are simply the stringified column indices of fp.
feature_name = list(map(str, range(fp.shape[1])))

# Shared GP kernel: RBF plus a white-noise term.
kernel = kernels.RBF() + kernels.WhiteKernel()

In [5]:
# Fit and pickle the GP model for the N2 target.
# NOTE(review): this cell appears before the cell that defines train_model;
# on a fresh kernel the definition cell must run first (unless Utils exports it).
train_model(y_N2, gas_name='N2')

In [6]:
# Same fit-and-pickle routine for the O2, CO2 and CH4 targets.
# NOTE(review): no visible cell trains y_He / y_H2 — confirm that is intentional.
train_model(y_O2, gas_name='O2')
train_model(y_CO2, gas_name='CO2')
train_model(y_CH4, gas_name='CH4')

In [4]:
def train_model(gas,gas_name=None):
    """Fit a Gaussian-process regressor for one target and pickle it.

    The module-level ``fp`` matrix is split 80/20 together with ``gas``
    (fixed ``random_state=190``). A ``GaussianProcessRegressor`` built on the
    module-level ``kernel`` is fitted on the training part, written to
    ``../models/model_<gas_name>.pkl`` and then read back from that file.

    Parameters
    ----------
    gas : array-like
        Target values, split row-wise together with ``fp``.
    gas_name : str
        Label inserted into the pickle file name. The ``None`` default is
        kept for signature compatibility but is rejected up front.

    Raises
    ------
    ValueError
        If ``gas_name`` is ``None`` or empty.
    """
    # Local import: ``os`` is not part of the notebook's import cell.
    import os

    # Fail fast: without a name there is no file to write, so do not spend
    # time fitting a model that cannot be saved.
    if not gas_name:
        raise ValueError("gas_name is required to build the model file name")
    model_path = '../models/model_' + gas_name + '.pkl'

    X_train, X_test, y_train, y_test = train_test_split(fp, gas, test_size=0.2, random_state=190)
    model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=0, normalize_y=True, n_targets=1)
    model.fit(X_train, y_train)

    # Make sure the output directory exists so the fitted model is not lost
    # to a FileNotFoundError after training.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # Read the artifact back so ``model`` is the persisted version.
    # pickle.load is only acceptable here because the file was just written
    # by this function; never unpickle files from untrusted sources.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    