In [2]:
from dataset import AtomDataset
import logging
import os
import pickle

import argparse
from typing import List, Dict, Any

from model import get_model, model_configs, validate_model, save_model, load_model
from descriptor import get_descriptor, descriptor_configs
from scaler import CustomStandardScaler, CustomStandardScalerInner, StandardScalerWithResizing

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from cuml.metrics import r2_score, mean_squared_error, accuracy_score

In [3]:
dataset = AtomDataset()
try:
	dataset.load_cache()
except FileNotFoundError:
	dataset.load_dataset()
	dataset.calculate_all_atomisation_energies()
	dataset.save_cache()

In [4]:
models_path = os.listdir("models")
models_path

['random_forest_coulomb.pickle',
 'ridge_coulomb.pickle',
 'lasso_coulomb.pickle',
 'random_forest_sine.pickle',
 'ridge_sine.pickle',
 'lasso_sine.pickle',
 'random_forest_ewald_sum.pickle',
 'ridge_ewald_sum.pickle',
 'lasso_ewald_sum.pickle',
 'random_forest_coulomb_full_ds.pickle',
 'ridge_coulomb_full_ds.pickle',
 'lasso_coulomb_full_ds.pickle',
 'random_forest_sine_full_ds.pickle',
 'ridge_sine_full_ds.pickle',
 'lasso_sine_full_ds.pickle',
 'random_forest_ewald_sum_full_ds.pickle',
 'ridge_ewald_sum_full_ds.pickle',
 'lasso_ewald_sum_full_ds.pickle',
 'ridge_soap_truncated_expansion.pickle',
 'random_forest_acsf_truncated_expansion.pickle',
 'ridge_acsf_truncated_expansion.pickle',
 'lasso_acsf_truncated_expansion.pickle',
 'random_forest_soap_truncated_expansion.pickle',
 'lasso_soap_truncated_expansion.pickle',
 'random_forest_mbtr_truncated_expansion.pickle',
 'ridge_mbtr_truncated_expansion.pickle',
 'lasso_mbtr_truncated_expansion.pickle',
 'random_forest_lmbtr_truncated_ex

In [5]:
models = []
for model_path in models_path:
    with open(f"models/{model_path}", "rb") as f:
        models.append(pickle.load(f))
models

[{'model': RandomForestRegressor(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.coulombmatrix.CoulombMatrix at 0x7e48beb11bb0>},
 {'model': Ridge(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.coulombmatrix.CoulombMatrix at 0x7e48beda6420>},
 {'model': Lasso(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.coulombmatrix.CoulombMatrix at 0x7e48beda6d20>},
 {'model': RandomForestRegressor(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.sinematrix.SineMatrix at 0x7e48be921ac0>},
 {'model': Ridge(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.sinematrix.SineMatrix at 0x7e48bec23140>},
 {'model': Lasso(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.sinematrix.SineMatrix at 0x7e48bec239b0>},
 {'model': RandomForestRegressor(),
  'scaler': StandardScaler(),
  'descriptor': <dscribe.descriptors.ewaldsummatrix.EwaldSumMatrix at 0x7e48be83c050>},
 {'model': Ridge

In [6]:
y = np.array(list(dataset.atomisation_energies.values()))
X = dataset.molecules

X_train, _X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
models_name = []
models_rmse = []
models_r2 = []
models_acc = []

for i, model in enumerate(models):
    X_test = model["descriptor"].create(_X_test, n_jobs=10)
    X_test = np.asarray(X_test, dtype=np.ndarray)
    r2, rmse, acc = validate_model(model["model"], model["scaler"], X_test, y_test)
    models_name.append(models_path[i].split(".")[0])
    models_r2.append(r2)
    models_rmse.append(rmse)
    models_acc.append(acc)

  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),
  calculator=atoms.get_calculator(),


In [8]:
_models_rmse = [float(x) for x in models_rmse]

In [9]:
import pandas as pd

df = pd.DataFrame({
    "name": models_name,
    "r2": models_r2,
    "rmse": _models_rmse,
    "acc": models_acc,
}).sort_values(by=["acc"], ascending=False)

df

Unnamed: 0,name,r2,rmse,acc
3,random_forest_sine,0.189943,0.334012,0.742
6,random_forest_ewald_sum,0.206219,0.330639,0.734
0,random_forest_coulomb,0.169525,0.338195,0.725
4,ridge_sine,0.025232,0.366399,0.721
7,ridge_ewald_sum,0.074813,0.356959,0.72
1,ridge_coulomb,0.079997,0.355958,0.72
9,random_forest_coulomb_full_ds,-0.275328,0.419097,0.711
2,lasso_coulomb,-2.5e-05,0.371116,0.706
5,lasso_sine,-2.5e-05,0.371116,0.706
8,lasso_ewald_sum,-2.5e-05,0.371116,0.706


In [13]:
print(df.to_markdown())

|    | name                                    |           r2 |     rmse |   acc |
|---:|:----------------------------------------|-------------:|---------:|------:|
|  3 | random_forest_sine                      |  0.189943    | 0.334012 | 0.742 |
|  6 | random_forest_ewald_sum                 |  0.206219    | 0.330639 | 0.734 |
|  0 | random_forest_coulomb                   |  0.169525    | 0.338195 | 0.725 |
|  4 | ridge_sine                              |  0.0252319   | 0.366399 | 0.721 |
|  7 | ridge_ewald_sum                         |  0.0748135   | 0.356959 | 0.72  |
|  1 | ridge_coulomb                           |  0.0799973   | 0.355958 | 0.72  |
|  9 | random_forest_coulomb_full_ds           | -0.275328    | 0.419097 | 0.711 |
|  2 | lasso_coulomb                           | -2.54299e-05 | 0.371116 | 0.706 |
|  5 | lasso_sine                              | -2.54299e-05 | 0.371116 | 0.706 |
|  8 | lasso_ewald_sum                         | -2.54299e-05 | 0.371116 | 0.706 |
| 11