Merge pull request #87 from nlesc-nano/dev
Add script to run scikit models
felipeZ committed May 21, 2021
2 parents 282e86e + 2e0928b commit c3f6c7c
Showing 4 changed files with 196 additions and 5 deletions.
84 changes: 84 additions & 0 deletions scripts/run_scikit_models.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python
import argparse
import json
import logging
from pathlib import Path

import numpy as np
import sklearn.gaussian_process.kernels as gp
from scipy import stats

from swan.dataset import FingerprintsData
from swan.modeller import SKModeller
from swan.utils.log_config import configure_logger

configure_logger(Path("."))

# Starting logger
LOGGER = logging.getLogger(__name__)


# Scikit learn model hyperparameters
dict_parameters = {
    "decision_tree": {'criterion': 'friedman_mse', 'max_features': 'auto', 'splitter': 'random'},
    "svm": {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'},
    "gaussian_process": {"kernel": gp.ConstantKernel(1.0, (1e-4, 1e4)) * gp.RBF(10.0, (1e-4, 1e4))}
}

# Training variables
properties = [
    "Dissocation energy (nucleofuge)",
    "Dissociation energy (electrofuge)",
    "Electroaccepting power(w+)",
    "Electrodonating power (w-)",
    "Electronegativity (chi=-mu)",
    "Electronic chemical potential (mu)",
    "Electronic chemical potential (mu+)",
    "Electronic chemical potential (mu-)",
    "Electrophilicity index (w=omega)",
    "Global Dual Descriptor Deltaf+",
    "Global Dual Descriptor Deltaf-",
    "Hardness (eta)",
    "Hyperhardness (gamma)",
    "Net Electrophilicity",
    "Softness (S)"
]


def run_all(path_data: str, output_file: str):
    """Train each model ``nruns`` times per property and store the mean Pearson r values."""
    nruns = 3
    models = ["gaussian_process"]  # ["decision_tree", "svm"]
    rvalues = {}
    for p in properties:
        rvalues[p] = {}
        data = FingerprintsData(
            path_data, properties=[p], sanitize=False)
        data.scale_labels()
        for m in models:
            mean = np.mean([run_scikit_model(m, data) for _ in range(nruns)])
            print(f"model: {m} property: {p} mean: {mean}")
            rvalues[p][m] = mean

    with open(f"{output_file}.json", 'w') as handler:
        json.dump(rvalues, handler)


def run_scikit_model(name_model: str, data: FingerprintsData):
    """Train and validate a single scikit-learn model, returning the Pearson r of the fit."""
    parameters = dict_parameters[name_model]
    modeller = SKModeller(data, name_model, **parameters)
    modeller.train_model()
    predicted, expected = modeller.validate_model()
    reg = stats.linregress(predicted, expected.flatten())
    return reg.rvalue


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", help="CSV file with the properties", required=True)
    parser.add_argument("-o", "--output", help="Output file name (without extension)", required=True)
    args = parser.parse_args()
    run_all(args.file, args.output)


if __name__ == "__main__":
    main()
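
For reference, a minimal sketch of what the script does for a single property and a single model, using the same API it wraps. The CSV path is an assumption borrowed from scripts/search_scikit_hyper.py below; any file with the listed CDFT properties would do.

from scipy import stats

from swan.dataset import FingerprintsData
from swan.modeller import SKModeller

# Assumed example dataset; substitute the CSV passed via --file.
data = FingerprintsData("data/Carboxylic_acids/CDFT/cdft_random_500.csv",
                        properties=["Softness (S)"], sanitize=False)
data.scale_labels()

# "decision_tree" is one of the model names SKModeller accepts after this commit.
modeller = SKModeller(data, "decision_tree",
                      criterion="friedman_mse", max_features="auto", splitter="random")
modeller.train_model()
predicted, expected = modeller.validate_model()
print(stats.linregress(predicted, expected.flatten()).rvalue)

The script repeats this three times per property, averages the Pearson r values, and dumps them to <output>.json.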
107 changes: 107 additions & 0 deletions scripts/search_scikit_hyper.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python

import argparse
import logging
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from sklearn import gaussian_process, svm, tree
from sklearn.model_selection import GridSearchCV
import sklearn.gaussian_process.kernels as gp

from swan.dataset import FingerprintsData
from swan.utils.log_config import configure_logger

configure_logger(Path("."))

# Starting logger
LOGGER = logging.getLogger(__name__)

path_data = Path("data/Carboxylic_acids/CDFT/cdft_random_500.csv")
properties = [
    # "Dissocation energy (nucleofuge)",
    # "Dissociation energy (electrofuge)",
    # "Electroaccepting power(w+)",
    # "Electrodonating power (w-)",
    # "Electronegativity (chi=-mu)",
    # "Electronic chemical potential (mu)",
    # "Electronic chemical potential (mu+)",
    # "Electronic chemical potential (mu-)",
    # "Electrophilicity index (w=omega)",
    # "Global Dual Descriptor Deltaf+",
    # "Global Dual Descriptor Deltaf-",
    # "Hardness (eta)",
    # "Hyperhardness (gamma)",
    # "Net Electrophilicity",
    "Softness (S)"
]


supported_models = {
    "decision_tree": tree.DecisionTreeRegressor,
    "svm": svm.SVR,
    "gaussian_process": gaussian_process.GaussianProcessRegressor
}

supported_parameters = {
    # Keys must match the model names in ``supported_models``
    "decision_tree": {
        "criterion": ("mse", "friedman_mse", "mae"),
        "splitter": ("best", "random"),
        "max_features": ("auto", "sqrt", "log2"),
    },
    "svm": {
        "kernel": ("linear", "poly", "rbf", "sigmoid"),
        "gamma": ("scale", "auto"),
        "C": [1, 5, 10],
        "shrinking": (True, False)
    },
    "gaussian_process": {
        "kernel": [
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.RBF(10.0, (1e-3, 1e3)),
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.DotProduct(),
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.Matern(10.0, (1e-3, 1e3)),
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.RationalQuadratic(),
        ],
    }
}


def get_data(size: Optional[int]):
    """Get the fingerprints data."""
    # Fingerprints
    data = FingerprintsData(
        path_data, properties=properties, sanitize=False)
    data.scale_labels()
    # Take a sample
    size = len(data.fingerprints) if size is None else int(size)
    indices = np.random.choice(np.arange(len(data.fingerprints)), size=size, replace=False)
    return data.fingerprints[indices], data.labels[indices]


def search_for_hyperparameters(model_name: str, nsamples: Optional[int]):
    """Use a Grid Search for the best hyperparameters."""
    fingerprints, labels = get_data(nsamples)
    model = supported_models[model_name]()
    parameters = supported_parameters[model_name]
    grid = GridSearchCV(model, parameters, scoring="r2")
    grid.fit(fingerprints, labels.flatten())
    df = pd.DataFrame(grid.cv_results_)
    df.sort_values('rank_test_score', inplace=True)
    columns = ['params', 'mean_test_score', 'rank_test_score']
    df.to_csv(f"{model_name}_hyperparameters.csv", columns=columns, index=False)
    print(df[columns][:5])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", choices=["decision_tree", "svm", "gaussian_process"], default="decision_tree")
    parser.add_argument("-n", "--nsamples", help="Number of samples to use", default=None)
    args = parser.parse_args()

    search_for_hyperparameters(args.model, args.nsamples)


if __name__ == "__main__":
    main()
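
The search itself is plain scikit-learn GridSearchCV; below is a condensed, self-contained sketch of the same flow for the SVR grid, with random arrays standing in for the fingerprints and scaled labels returned by get_data.

import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Synthetic stand-ins for the fingerprint matrix and scaled labels.
rng = np.random.default_rng(seed=42)
fingerprints = rng.random((200, 128))
labels = rng.random(200)

parameters = {
    "kernel": ("linear", "poly", "rbf", "sigmoid"),
    "gamma": ("scale", "auto"),
    "C": [1, 5, 10],
    "shrinking": (True, False),
}
grid = GridSearchCV(svm.SVR(), parameters, scoring="r2")
grid.fit(fingerprints, labels)
print(grid.best_params_, grid.best_score_)

The script adds one step on top of this: it ranks cv_results_ by rank_test_score, writes them to a CSV, and prints the top five candidates.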
6 changes: 3 additions & 3 deletions swan/modeller/scikit_modeller.py
@@ -33,9 +33,9 @@ def __init__(self, data: FingerprintsData, name: str, **kwargs):
         self.path_model = "swan_skmodeller.pkl"

         supported_models = {
-            "decisiontree": tree.DecisionTreeRegressor,
+            "decision_tree": tree.DecisionTreeRegressor,
             "svm": svm.SVR,
-            "gaussianprocess": gaussian_process.GaussianProcessRegressor
+            "gaussian_process": gaussian_process.GaussianProcessRegressor
         }

         if name.lower() in supported_models:
@@ -73,7 +73,7 @@ def train_model(self, frac: Tuple[float, float] = (0.8, 0.2)):
             fraction to divide the dataset, by default [0.8, 0.2]
         """
         self.split_data(frac)
-        self.model.fit(self.features_trainset, self.labels_trainset)
+        self.model.fit(self.features_trainset, self.labels_trainset.flatten())
         self.save_model()

     def save_model(self):
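With the renamed keys, callers now pass "decision_tree" and "gaussian_process" instead of "decisiontree" and "gaussianprocess"; the second hunk also flattens the label column so scikit-learn receives a 1-D target. A minimal sketch of the Gaussian-process path with a custom kernel, mirroring the updated test below (the dataset path is again an assumption reused from the scripts above):

import sklearn.gaussian_process.kernels as gp

from swan.dataset import FingerprintsData
from swan.modeller import SKModeller

# Assumed example dataset; any FingerprintsData instance works here.
data = FingerprintsData("data/Carboxylic_acids/CDFT/cdft_random_500.csv",
                        properties=["Softness (S)"], sanitize=False)
data.scale_labels()

kernel = gp.ConstantKernel(1.0, (1e-4, 1e4)) * gp.RBF(10.0, (1e-4, 1e4))
modeller = SKModeller(data, "gaussian_process", kernel=kernel)  # previously "gaussianprocess"
modeller.train_model()
predicted, expected = modeller.validate_model()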
4 changes: 2 additions & 2 deletions tests/test_scikit_models.py
@@ -22,7 +22,7 @@ def run_test(model: str, **kwargs):

 def test_decision_tree():
     """Check the interface to the Decisiontree class."""
-    run_test("decisiontree")
+    run_test("decision_tree")


 def test_svm():
@@ -33,4 +33,4 @@ def test_svm():
 def test_gaussian_process():
     """Check the interface to the support vector machine."""
     kernel = ConstantKernel(constant_value=10)
-    run_test("gaussianprocess", kernel=kernel)
+    run_test("gaussian_process", kernel=kernel)
