softmax_regression.py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


class SoftmaxClassifier(object):
    '''
    Basic implementation of a softmax classifier.

    Parameters
    ----------
    random_state: int (default: 1337)
        Seeds numpy's random number generator for a particular class
        instance.
    n_classes: int (default: 2)
        Number of classes used for the classification.
    epochs: int (default: 500)
        Number of epochs to train the model. In real-world problems this
        should be tuned on a validation set to avoid overtraining or
        stopping too early.
    learning_rate: float (default: 0.1)
        Initial learning rate. Also has to be validated to achieve fast
        convergence. A learning rate decay schedule still needs to be
        implemented.
    '''
    def __init__(self,
                 random_state=None,
                 n_classes=2,
                 epochs=500,
                 learning_rate=.1):
        # Seed numpy's global RNG. The original RandomState objects were
        # created and immediately discarded, so they had no effect.
        np.random.seed(1337 if random_state is None else random_state)
        self._is_fitted = False
        self._W = None
        self._b = None
        self._n_classes = n_classes
        self.epochs = epochs
        self.learning_rate = learning_rate

    def fit(self, Xs, Ys, init_params=True):
        '''
        Fits the model for the given `Xs` and `Ys` and initializes all
        parameters if needed.

        Parameters
        ----------
        Xs: array of shape (n_points, n_features)
            Two dimensional array containing data points with all features.
        Ys: array of shape (n_points, n_classes)
            Two dimensional array containing the labels of all data points
            in a one-hot structure.
        init_params: bool
            If True the weights and biases get initialized, otherwise the
            class attributes `_W` and `_b` are used.
        '''
        self._fit(Xs=Xs, Ys=Ys, init_params=init_params)
        self._is_fitted = True
        return self

    def _fit(self, Xs, Ys, init_params):
        if init_params:
            n_classes = self._n_classes
            n_features = Xs.shape[1]
            self._W, self._b = self._init_params(n_features, n_classes)
            self.costs = []
        # `array.any() is None` is always False; compare the attributes
        # themselves against None instead.
        if self._W is None or self._b is None:
            raise AttributeError(
                'Initialize weights and biases before fitting.')
        # Implement batch use here! For the moment one batch = all data.
        for i in range(self.epochs):
            scores = self._scores(Xs, self._W, self._b)
            probs = self._softmax(scores)
            cost = self._cost(probs, Ys)
            self.costs.append(cost)
            dW, db = self._calc_gradient(Xs, Ys, probs)
            # The gradient can also be calculated numerically to check
            # the analytical calculation:
            # dW_, db_ = self._calc_numeric_gradient(Xs, Ys)
            self._W -= self.learning_rate * dW
            self._b -= self.learning_rate * db
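
    # The loop above still uses the full data set as a single batch. A
    # minimal mini-batch sketch (assuming a `batch_size` value this class
    # does not define yet) would shuffle and slice the data each epoch:
    #
    #   idx = np.random.permutation(Xs.shape[0])
    #   for start in range(0, Xs.shape[0], batch_size):
    #       batch = idx[start:start + batch_size]
    #       # ...gradient step on Xs[batch] and Ys[batch]...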

    def predict(self, Xs):
        '''
        Predict the class of the data points `Xs` given the current model.

        Parameters
        ----------
        Xs: array of shape (n_points, n_features)
            Two dimensional array containing data points with all features.

        Returns
        -------
        array of shape (n_points,)
            The returned array contains the predicted class label for every
            data point in `Xs`.
        '''
        if not self._is_fitted:
            raise AttributeError('Model is not fitted, yet!')
        return self._predict(Xs)

    def _predict(self, Xs):
        scores = self._scores(Xs, self._W, self._b)
        probs = self._softmax(scores)
        return self._to_classlabel(probs)
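
    # Derivation sketch for `_calc_gradient` below: with softmax
    # probabilities P and one-hot targets Y, the gradient of the mean
    # cross-entropy with respect to the scores is (P - Y) / N, so by the
    # chain rule dW = X^T (P - Y) / N and db sums (P - Y) / N over the
    # batch.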
    def _calc_gradient(self, Xs, Ys, probs):
        # (P - Y), averaged over the batch.
        diff = -(Ys - probs)
        diff /= diff.shape[0]
        dW = Xs.T.dot(diff)
        db = np.sum(diff, axis=0)
        return dW, db

    def _calc_numeric_gradient(self, Xs, Ys):
        # Central-difference estimate of the gradient, used as a sanity
        # check for the analytical version above.
        from copy import deepcopy as cp
        dW = np.zeros(self._W.shape)
        db = np.zeros(self._b.shape)
        h = self.learning_rate
        it = np.nditer(self._W, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            Wph = cp(self._W)
            Wph[ix] += h
            yp_pred = self._softmax(self._scores(Xs, Wph, self._b))
            cph = self._cost(yp_pred, Ys)
            Wmh = cp(self._W)
            Wmh[ix] -= h
            ym_pred = self._softmax(self._scores(Xs, Wmh, self._b))
            cmh = self._cost(ym_pred, Ys)
            dW[ix] = (cph - cmh) / (2 * h)
            it.iternext()
        it = np.nditer(self._b, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            bph = cp(self._b)
            bph[ix] += h
            yp_pred = self._softmax(self._scores(Xs, self._W, bph))
            cph = self._cost(yp_pred, Ys)
            bmh = cp(self._b)
            bmh[ix] -= h
            ym_pred = self._softmax(self._scores(Xs, self._W, bmh))
            cmh = self._cost(ym_pred, Ys)
            db[ix] = (cph - cmh) / (2 * h)
            it.iternext()
        return dW, db

    def _init_params(self, n_features, n_classes):
        weight_shape = (n_features, n_classes)
        bias_shape = (n_classes,)
        W = np.random.rand(n_features * n_classes) / \
            np.sqrt(n_features * n_classes)
        W = W.reshape(weight_shape)
        b = np.zeros(bias_shape)
        return W, b

    def _scores(self, xs, W, b):
        scores = np.matmul(xs, W) + b
        return scores

    def _softmax(self, scores):
        # Shift each row by its maximum for numerical stability; operate
        # on a copy so the caller's scores are not mutated in place.
        scores = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return probs
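
    # Worked example for `_softmax`: for a single row of scores
    # [1., 2., 3.], exp gives roughly [2.72, 7.39, 20.09] with sum
    # about 30.19, so the probabilities are roughly
    # [0.090, 0.245, 0.665] and sum to one.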

    def _to_classlabel(self, probs):
        return np.argmax(probs, axis=1)

    def _cross_entropy(self, y_pred, y_true):
        # Clip predictions away from zero so the log stays finite.
        y_pred = np.clip(y_pred, 1e-10, 1)
        cross_entropy = -np.sum(y_true * np.log(y_pred), axis=1)
        return cross_entropy

    def _cost(self, y_pred, y_true):
        '''
        This function calculates the actual cost, which makes it possible
        to add additional terms to it, e.g. L2 regularization.
        '''
        return np.mean(self._cross_entropy(y_pred, y_true))
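
    # The docstring above mentions L2 regularization as a possible
    # extension. A minimal sketch (assuming a regularization strength
    # `lam`, which is not a parameter of this class) would add
    # 0.5 * lam * np.sum(self._W ** 2) to the returned cost and the
    # matching lam * self._W term to dW in `_calc_gradient`.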

    def generate_easy_data_(self, N_points=50):
        Xs_0_mean = [2., 10.]
        Xs_0_cov = [[1., 0.], [0., 1.]]
        Xs_0 = np.random.multivariate_normal(Xs_0_mean,
                                             Xs_0_cov,
                                             size=N_points)
        Xs_1_mean = [5., 5.]
        Xs_1_cov = [[1., 0.], [0., 1.]]
        Xs_1 = np.random.multivariate_normal(Xs_1_mean,
                                             Xs_1_cov,
                                             size=N_points)
        Xs = np.append(Xs_0, Xs_1, axis=0)
        n_classes = 2
        # `np.int` is deprecated; use the builtin `int` instead.
        Ys_0 = np.zeros(N_points, dtype=int)
        Ys_1 = np.ones(N_points, dtype=int)
        Ys_ = np.append(Ys_0, Ys_1)
        Ys = np.eye(n_classes)[Ys_]
        return Xs, Ys

    def generate_spiral_data(self, N_points=50):
        n_classes = self._n_classes
        n_features = 2
        Xs = np.zeros((N_points * n_classes, n_features))
        Ys = np.zeros((N_points * n_classes), dtype=int)
        for i in range(n_classes):
            ix = range(N_points * i, N_points * (i + 1))
            r = np.linspace(0., 1., N_points)
            t = np.linspace(i * 4, (i + 1) * 4, N_points) + \
                np.random.randn(N_points) * 0.2
            Xs[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
            Ys[ix] = i
        Ys = np.eye(n_classes)[Ys]
        return Xs, Ys

    def generate_two_pop(self):
        P0 = np.load('data/P0.npy')
        P1 = np.load('data/P1.npy')
        Xs = np.append(P0.T, P1.T, axis=0)
        Ys = np.append(np.zeros(P0.shape[1], dtype=int),
                       np.ones(P1.shape[1], dtype=int))
        Ys = np.eye(2)[Ys]
        return Xs, Ys


def plot_decision_regions(Xs, Ys, classifier,
                          ax=None, res=.01, colors=None):
    if ax is None:
        ax = plt.gca()
    if colors is None:
        colors = sns.color_palette()
    x_min, x_max = np.min(Xs[:, 0]) - .1, np.max(Xs[:, 0]) + .1
    y_min, y_max = np.min(Xs[:, 1]) - .1, np.max(Xs[:, 1]) + .1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, res),
                         np.arange(y_min, y_max, res))
    Z = classifier.predict(np.array([xx.ravel(), yy.ravel()]).T)
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z,
                alpha=.3,
                levels=np.arange(Z.max() + 2) - .5,
                colors=colors)
    ax.axis(xmin=x_min, xmax=x_max, ymin=y_min, ymax=y_max)
    for ix in np.unique(np.argmax(Ys, axis=1)):
        ax.scatter(x=Xs[np.argmax(Ys, axis=1) == ix, 0],
                   y=Xs[np.argmax(Ys, axis=1) == ix, 1],
                   alpha=.8,
                   label='Class {}'.format(int(ix)),
                   # Wrap the single RGB tuple in a list so matplotlib
                   # does not mistake it for numeric color data.
                   c=[colors[ix]])
    ax.legend(loc='lower right', fancybox=True)


if __name__ == '__main__':
    sc = SoftmaxClassifier()
    Xs, Ys = sc.generate_two_pop()
    sc.fit(Xs, Ys)
    plot_decision_regions(Xs, Ys, sc)
    plt.savefig('2_pop.pdf')

    # Start a fresh figure so the spiral example does not draw on top of
    # the two-population plot.
    plt.figure()
    sc = SoftmaxClassifier(n_classes=3)
    Xs, Ys = sc.generate_spiral_data()
    sc.fit(Xs, Ys)
    plot_decision_regions(Xs, Ys, sc)
    plt.savefig('spiral_data.pdf')
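
    # Optional: visualize convergence. `costs` is filled by `_fit` with
    # one mean cross-entropy value per epoch; the output filename is an
    # illustrative choice, not from the original script.
    plt.figure()
    plt.plot(sc.costs)
    plt.xlabel('epoch')
    plt.ylabel('mean cross-entropy')
    plt.savefig('cost_curve.pdf')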