-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #87 from nlesc-nano/dev
Add script to run scikit models
- Loading branch information
Showing 4 changed files with 196 additions and 5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/env python | ||
import argparse | ||
import logging | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import json | ||
import sklearn.gaussian_process.kernels as gp | ||
from scipy import stats | ||
|
||
from swan.dataset import FingerprintsData | ||
from swan.modeller import SKModeller | ||
from swan.utils.log_config import configure_logger | ||
|
||
# Configure the project's logging as soon as the module is imported. The
# current directory is passed as the path argument (presumably the location
# of the log file -- confirm in swan.utils.log_config).
configure_logger(Path("."))

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
|
||
|
||
# Hyperparameters for each scikit-learn model, keyed by model name. The inner
# dictionary is unpacked as keyword arguments when the model is built.
dict_parameters = {
    "decision_tree": {
        "criterion": "friedman_mse",
        "max_features": "auto",
        "splitter": "random",
    },
    "svm": {
        "C": 10,
        "gamma": "auto",
        "kernel": "rbf",
    },
    "gaussian_process": {
        "kernel": gp.ConstantKernel(1.0, (1e-4, 1e4)) * gp.RBF(10.0, (1e-4, 1e4)),
    },
}
|
||
# Names of the properties to train on; one model run is performed per entry.
# NOTE(review): the spelling of each name (including "Dissocation") is kept
# exactly as is, since it presumably has to match the column headers of the
# input file -- verify against the data before correcting it.
properties = [
    "Dissocation energy (nucleofuge)",
    "Dissociation energy (electrofuge)",
    "Electroaccepting power(w+)",
    "Electrodonating power (w-)",
    "Electronegativity (chi=-mu)",
    "Electronic chemical potential (mu)",
    "Electronic chemical potential (mu+)",
    "Electronic chemical potential (mu-)",
    "Electrophilicity index (w=omega)",
    "Global Dual Descriptor Deltaf+",
    "Global Dual Descriptor Deltaf-",
    "Hardness (eta)",
    "Hyperhardness (gamma)",
    "Net Electrophilicity",
    "Softness (S)",
]
|
||
|
||
def run_all(path_data: str, output_file: str, nruns: int = 3,
            models=("gaussian_process",)) -> None:
    """Train the requested models on every property and save the mean r-values.

    For each name in the module-level ``properties`` list a dataset restricted
    to that single property is created and its labels are scaled. Every model
    is then trained and validated ``nruns`` times on that dataset and the mean
    of the returned r-values is stored.

    Parameters
    ----------
    path_data
        Path to the file with the properties.
    output_file
        Name of the output file **without** extension; ``.json`` is appended.
    nruns
        Number of times each model is trained per property (default 3).
    models
        Iterable of model names to run; each must be a key of
        ``dict_parameters`` (default: only ``"gaussian_process"``).

    Raises
    ------
    ValueError
        If ``nruns`` is smaller than 1 (the mean of zero runs is undefined).
    """
    if nruns < 1:
        raise ValueError(f"nruns must be a positive integer, got {nruns}")

    rvalues = {}
    for prop in properties:
        # A fresh dataset per property: only one label column is loaded.
        data = FingerprintsData(path_data, properties=[prop], sanitize=False)
        data.scale_labels()

        rvalues[prop] = {}
        for model in models:
            # Cast to a builtin float so the value is always JSON serialisable.
            mean = float(np.mean([run_scikit_model(model, data) for _ in range(nruns)]))
            print(f"model: {model} property: {prop} mean: {mean}")
            rvalues[prop][model] = mean

    with open(f"{output_file}.json", 'w') as handler:
        json.dump(rvalues, handler)
|
||
|
||
def run_scikit_model(name_model: str, data: FingerprintsData):
    """Train and validate one scikit-learn model and return the r-value.

    The hyperparameters registered for ``name_model`` in ``dict_parameters``
    are forwarded to ``SKModeller``. After training, the validation
    predictions are regressed linearly against the flattened expected values
    and the ``rvalue`` of that regression is returned.
    """
    modeller = SKModeller(data, name_model, **dict_parameters[name_model])
    modeller.train_model()

    predicted, expected = modeller.validate_model()
    regression = stats.linregress(predicted, expected.flatten())
    return regression.rvalue
|
||
|
||
def main():
    """Read the command line options and run all the models."""
    parser = argparse.ArgumentParser()
    # Both options are mandatory: (short flag, long flag, help text).
    options = (
        ("-f", "--file", "File with the properties"),
        ("-o", "--output", "output file"),
    )
    for short_flag, long_flag, description in options:
        parser.add_argument(short_flag, long_flag, help=description, required=True)

    namespace = parser.parse_args()
    run_all(namespace.file, namespace.output)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import logging | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn import gaussian_process, svm, tree | ||
from sklearn.model_selection import GridSearchCV | ||
import sklearn.gaussian_process.kernels as gp | ||
|
||
from swan.dataset import FingerprintsData | ||
from swan.utils.log_config import configure_logger | ||
|
||
# Configure the project's logging as soon as the module is imported. The
# current directory is passed as the path argument (presumably the location
# of the log file -- confirm in swan.utils.log_config).
configure_logger(Path("."))

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
|
||
# Data set searched for hyperparameters (path relative to the working directory).
path_data = Path("data/Carboxylic_acids/CDFT/cdft_random_500.csv")

# Properties used as labels. Uncomment an entry to include it in the search;
# only the active entries are loaded.
properties = [
    # "Dissocation energy (nucleofuge)",
    # "Dissociation energy (electrofuge)",
    # "Electroaccepting power(w+)",
    # "Electrodonating power (w-)",
    # "Electronegativity (chi=-mu)",
    # "Electronic chemical potential (mu)",
    # "Electronic chemical potential (mu+)",
    # "Electronic chemical potential (mu-)",
    # "Electrophilicity index (w=omega)",
    # "Global Dual Descriptor Deltaf+",
    # "Global Dual Descriptor Deltaf-",
    # "Hardness (eta)",
    # "Hyperhardness (gamma)",
    # "Net Electrophilicity",
    "Softness (S)",
]
|
||
|
||
# Estimator class for every model name accepted on the command line.
supported_models = {
    "decision_tree": tree.DecisionTreeRegressor,
    "svm": svm.SVR,
    "gaussian_process": gaussian_process.GaussianProcessRegressor,
}

# Hyperparameter grid explored for each model. The keys MUST be the same names
# used in ``supported_models`` because both dictionaries are indexed with the
# model name chosen by the user; mismatching keys raise a ``KeyError``.
# NOTE(review): the "mse"/"mae" criteria and the "auto" max_features value are
# only accepted by older scikit-learn releases -- check them against the
# installed version.
supported_parameters = {
    "decision_tree": {
        "criterion": ("mse", "friedman_mse", "mae"),
        "splitter": ("best", "random"),
        "max_features": ("auto", "sqrt", "log2"),
    },
    "svm": {
        "kernel": ("linear", "poly", "rbf", "sigmoid"),
        "gamma": ("scale", "auto"),
        "C": [1, 5, 10],
        "shrinking": (True, False),
    },
    "gaussian_process": {
        "kernel": [
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.RBF(10.0, (1e-3, 1e3)),
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.DotProduct(),
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.Matern(10.0, (1e-3, 1e3)),
            gp.ConstantKernel(1.0, (1e-1, 1e3)) * gp.RationalQuadratic(),
        ],
    },
}
|
||
|
||
def get_data(size: Optional[int]):
    """Return a random sample of fingerprints together with their labels.

    The dataset is read from ``path_data`` for the module-level ``properties``
    and ``scale_labels()`` is called on it. ``size`` entries are then drawn
    without replacement; when ``size`` is ``None`` every entry is used (in
    random order). ``size`` is converted with ``int`` so a numeric string is
    accepted too.
    """
    dataset = FingerprintsData(path_data, properties=properties, sanitize=False)
    dataset.scale_labels()

    total = len(dataset.fingerprints)
    if size is None:
        size = total
    else:
        size = int(size)

    # Draw the sample indices without repetition.
    selected = np.random.choice(np.arange(total), size=size, replace=False)
    return dataset.fingerprints[selected], dataset.labels[selected]
|
||
|
||
def search_for_hyperparameters(model_name: str, nsamples: Optional[int]):
    """Run a grid search over the hyperparameters of ``model_name``.

    A sample of ``nsamples`` entries is fitted with ``GridSearchCV`` using the
    "r2" score. The results, sorted by rank, are written to
    ``<model_name>_hyperparameters.csv`` and the first five rows are printed.
    """
    features, targets = get_data(nsamples)
    estimator = supported_models[model_name]()
    search = GridSearchCV(estimator, supported_parameters[model_name], scoring="r2")
    search.fit(features, targets.flatten())

    # Best ranked combinations first.
    report_columns = ['params', 'mean_test_score', 'rank_test_score']
    results = pd.DataFrame(search.cv_results_).sort_values('rank_test_score')
    results.to_csv(f"{model_name}_hyperparameters.csv", columns=report_columns, index=False)
    print(results[report_columns][:5])
|
||
|
||
def main():
    """Read the command line options and launch the hyperparameter search."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model",
        default="decision_tree",
        choices=["decision_tree", "svm", "gaussian_process"],
    )
    parser.add_argument(
        "-n", "--nsamples",
        default=None,
        help="Number of sample to use",
    )
    options = parser.parse_args()

    search_for_hyperparameters(options.model, options.nsamples)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters