-
Notifications
You must be signed in to change notification settings - Fork 1
/
alpha_trends.py
143 lines (123 loc) · 4.86 KB
/
alpha_trends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
import os, pickle, random, time
# Force a fresh compile of the local GP module by deleting its stale
# bytecode cache (Python 2 writes the .pyc next to the source file).
# If no cached file exists, os.remove raises OSError and we move on.
try:
    os.remove('my_gp_module.pyc')
except OSError:
    pass
import scipy as sp
from scipy.linalg import eigh
from my_gp_module import GaussianProcess
import matplotlib.pyplot as plt
# --------------------------------------------
# WHAT IT DOES:
# Given Ntest configurations to keep track of,
# For increasing number Ntot of configurations:
# - Teach all of them and save their regression coefficients alpha
# - Do a second teaching not including the test configurations
# - Predict energy of test configurations using 2nd teaching and evaluate error
# Plot alpha vs. error // alpha STD vs. MAE error
# --------------------------------------------
# --------------------------------------------
# Parameters for the run
# --------------------------------------------
split = 1
N_models = 1
theta0 = 10.0
Ntest = 100
# --------------------------------------------
# Load all database
# --------------------------------------------
ttt = time.clock()
if not os.path.exists('qm7.pkl'): os.system('wget http://www.quantum-machine.org/data/qm7.pkl')
dataset = pickle.load(open('qm7.pkl','r'))
# --------------------------------------------
# Extract training data and test set
# --------------------------------------------
allP = dataset['P'][range(0,split)+range(split+1,5)].flatten()
print "TIMER load_data", time.clock() - ttt
nteach = sp.int32(sp.exp(sp.linspace(sp.log(2*Ntest), sp.log(allP.size), 25)))
# --------------------------------------------
# Loop over different training set sizes
# --------------------------------------------
alpha = []
alpha_std = []
mae_error = []
errors = []
for Nteach in nteach:
# --------------------------------------------
# First time include the test set to calculate their alpha
# --------------------------------------------
print "\n", "-"*60, "\n"
print "N teach = %d" % Nteach
# Select training data
P = allP[:Nteach]
X = dataset['X'][P]
T = dataset['T'][P]
# --------------------------------------------
# Extract feature(s) from training data and test set
# --------------------------------------------
# in this case, only sorted eigenvalues of Coulomb matrix
ttt = time.clock()
eigX = [(eigh(M, eigvals_only=True))[::-1] for M in X]
print "TIMER eval_features", time.clock() - ttt
# Observations
y = T.ravel()
# Setup a Gaussian Process model
ttt = time.clock()
gp = GaussianProcess(corr='absolute_exponential', theta0=sp.asarray([theta0]),
nugget=1e-3, verbose=True, normalise=True, do_features_projection=False, low_memory=False)
# Fit to data
gp.fit(eigX, y)
print "TIMER teach", time.clock() - ttt
local_alpha = gp.alpha[:Ntest]
print "alpha STD: %f" % sp.std(local_alpha)
print "alpha MAV: %f" % sp.mean(sp.absolute(local_alpha))
alpha.append(local_alpha.flatten())
alpha_std.append(sp.std(local_alpha))
# --------------------------------------------
# Second time don't include the test set and predict
# --------------------------------------------
# Extract feature(s) from training data and test set
# --------------------------------------------
eigt = eigX[:Ntest]
eigX = eigX[Ntest:]
# Observations
y = T.ravel()[Ntest:]
y_test = T.ravel()[:Ntest]
# Setup a Gaussian Process model
ttt = time.clock()
gp = GaussianProcess(corr='absolute_exponential', theta0=sp.asarray([theta0]),
nugget=1e-3, verbose=True, normalise=True, do_features_projection=False, low_memory=False)
# Fit to data
gp.fit(eigX, y)
print "TIMER teach", time.clock() - ttt
ttt = time.clock()
# Make the prediction on test set
y_pred, MSE = gp.predict(eigt, eval_MSE=True)
sigma = sp.sqrt(MSE)
mae_error.append(sp.absolute(y_pred-y_test).mean(axis=0))
errors.append(sp.absolute(y_pred-y_test))
print('\n test set:')
print('MAE: %5.2f kcal/mol' % sp.absolute(y_pred-y_test).mean(axis=0))
print('RMSE: %5.2f kcal/mol' % sp.square(y_pred-y_test).mean(axis=0)**.5)
print "TIMER predict", time.clock() - ttt
# Plot alpha STD vs. MAE error scatter (1 plot, dots, ~ 1 line)
# plt.plot(alpha_std, mae_error, 'o')
# plt.xlabel("regression coefficients STD")
# plt.ylabel("mean absolute error")
# plt.savefig('alphastd_vs_maeerror.png')
# Plot alpha vs. error scatter for selected test confs (1 plot, nplots <= Ntest lines)
nplots = 8
# Transpose so each row tracks one test configuration across training sizes.
alpha = sp.array(alpha).T
errors = sp.array(errors).T
# First figure: only the first nplots test configurations.
plt.figure()
for a, err in zip(alpha[:nplots], errors[:nplots]):
    plt.plot(a, err, 'o')
plt.xlabel("regression coefficient")
plt.ylabel("absolute error [kcal/mol]")
plt.savefig('alpha_vs_error.png')
# Second figure: all test configurations. Start a fresh figure so the
# first plot's points and labels are not drawn into this image as well.
plt.figure()
for a, err in zip(alpha, errors):
    plt.plot(a, err, 'o')
plt.xlabel("regression coefficient")
plt.ylabel("absolute error [kcal/mol]")
plt.savefig('alpha_vs_error_all.png')
# # sp.save("alphas.npy", sp.array(alpha))