-
Notifications
You must be signed in to change notification settings - Fork 0
/
examples.py
126 lines (93 loc) · 4.16 KB
/
examples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Example use cases for depablo_box
############################################################################
# Start to end use case
############################################################################
"""I want to calculate different chemical properties and perform machine learning on them"""
from depablo_box import PDBML, model

# initialize class
dx = PDBML()

# access depablo_box database as pandas dataframe
df = dx.df

# say we want to calculate these descriptors and add them to the dataframe
descriptor_list = ['BalabanJ', 'BertzCT', 'Ipc', 'HallKierAlpha', 'MolLogP', 'MolMR']
dx.add_descriptors(descriptor_list)

# Re-read the dataframe so the model below is guaranteed to see the new
# descriptor columns.
# NOTE(review): add_descriptors is not visible from this file; if it mutates
# dx.df in place this re-read is a harmless no-op, if it rebinds dx.df the
# earlier `df` would be stale.
df = dx.df

# see if there are any correlations between the above properties
dx.correlation_map(descriptor_list)

# define model arguments
input_properties = descriptor_list
output_property = "glass_transition_temperature"
na_strategy = "remove"  # options: 'mean', 'remove' - default is 'remove'

# initialize model
ml = model(df, input_properties, output_property, na_strategy=na_strategy)

# start training
algorithm = "Support Vector Regression"
ml.train(algorithm)

# view model results
print("YOUR MODEL R^2 SCORE: {}".format(ml.r_2))
ml.feature_importance()  # plot feature importances

# Predict on new data. new_data is a list of rows; each row lists one value
# per input property, in the same order as descriptor_list.
# Values are numeric (not strings) so they can be fed to a regressor directly.
new_data = [[10.5, 29.0, 102.1, 91.2, 1.1, 0.15]]
results = ml.predict(new_data)
print(results)

# Finally, export fitted model as pickle file
outpath = "Tg_prediction_model.pickle"
ml.export_fitted_model(outpath)
############################################################################
# Experiment use case
############################################################################
from depablo_box import PDBML, model

# initialize class
dx = PDBML()

# check if the number of valence electrons is already in the dataframe
df = dx.df
print(list(df))

# it seems like this property hasn't been added yet,
# so we add the number of valence electrons for each polymer to the dataframe
descriptor_list = ["NumValenceElectrons"]
dx.add_descriptors(descriptor_list)

# Re-read the dataframe so the model below is guaranteed to see the new column.
# NOTE(review): add_descriptors is not visible from this file; if it mutates
# dx.df in place this re-read is a harmless no-op, if it rebinds dx.df the
# earlier `df` would be stale.
df = dx.df

# plot number of valence electrons against solubility parameters as a scatterplot
dx.plot_properties(property_x="solubility_parameter", property_y=descriptor_list[0])

# get correlation between number of valence electrons and solubility parameters
dx.property_correlation("solubility_parameter", descriptor_list[0])

"""hmm... there doesn't seem to be much of a correlation.
I then want to know what the maximum predictive power is, so I use a
Lasso regression to fit a function that is maximally predictive"""
# define model arguments
input_properties = descriptor_list
output_property = "solubility_parameter"
na_strategy = "remove"  # options: 'mean', 'remove' - default is 'remove'

# initialize model
ml = model(df, input_properties, output_property, na_strategy=na_strategy)

# start training
algorithm = "Lasso Regression"
ml.train(algorithm)

# see the predictiveness of the model
print("YOUR MODEL R^2 SCORE: {}".format(ml.r_2))

"""It turns out there is a very good correlation between the two, so I now want
to save this model as a pickle file"""
# export fitted model
# (named after the predicted property so it does not overwrite the Tg model
# exported by the start-to-end example above)
outpath = "solubility_parameter_prediction_model.pickle"
ml.export_fitted_model(outpath)

# load fitted model
# NOTE: pickle.load can execute arbitrary code; only unpickle files you
# created yourself or otherwise trust.
import pickle
with open(outpath, "rb") as f:
    ml = pickle.load(f)  # ml = sklearn regressor class

# one row with a single input value (NumValenceElectrons)
new_data = [[1000]]
results = ml.predict(new_data)
print("YOUR RESULTS: {}".format(results))
############################################################################
# Generate Quantum Chemistry Files
############################################################################
"""I want to convert SMILES format into some other format (maybe input files for quantum chemistry codes)"""
from depablo_box import PDBML

# initialize class
dx = PDBML()

# list all supported conversions
print(dx.conversion_formats)

# the polymer can be identified either by its SMILES string or by its name
polymer_identifier = 'C=CC(=O)NC(C)C'
conversion_format = 'Gaussian 98/03 Input'

# placeholder output path - replace with a real location.
# .gjf is a conventional extension for Gaussian input files (.xyz would
# suggest a different file format than the one requested above).
outpath = '/file/path/your_polymer.gjf'

# writes the Gaussian input file to outpath
dx.create_input_file(polymer_identifier, conversion_format, outpath)