ENH sample_weight argument in discrete NB estimators
Also, renamed Naive Bayes' unique_y attribute to _classes; it was undocumented.
larsmans authored and ogrisel committed Nov 8, 2011
1 parent 2d934c4 commit c1f36ff
Showing 2 changed files with 30 additions and 14 deletions.
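The new argument lets individual training samples count more or less heavily, as if they were duplicated or fractional. A minimal usage sketch against this version of the API (the toy data is illustrative, not taken from the commit):

    import numpy as np
    from sklearn.naive_bayes import MultinomialNB

    X = np.array([[2, 1], [1, 3], [0, 4]])
    y = np.array([0, 0, 1])

    clf = MultinomialNB()
    # Giving the third sample weight 2. has the same effect as
    # repeating it twice in the training set.
    clf.fit(X, y, sample_weight=[1., 1., 2.])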
sklearn/naive_bayes.py (35 changes: 21 additions & 14 deletions)
@@ -61,7 +61,7 @@ def predict(self, X):
             Predicted target values for X
         """
         jll = self._joint_log_likelihood(X)
-        y_pred = self.unique_y[np.argmax(jll, axis=1)]
+        y_pred = self._classes[np.argmax(jll, axis=1)]
         return y_pred
 
     def predict_log_proba(self, X):
@@ -173,7 +173,7 @@ def fit(self, X, y):
         X = np.asarray(X)
         y = np.asarray(y)
 
-        self.unique_y = unique_y = np.unique(y)
+        self._classes = unique_y = np.unique(y)
         n_classes = unique_y.shape[0]
         _, n_features = X.shape
 
@@ -189,7 +189,7 @@
     def _joint_log_likelihood(self, X):
         X = array2d(X)
         joint_log_likelihood = []
-        for i in xrange(np.size(self.unique_y)):
+        for i in xrange(np.size(self._classes)):
            jointi = np.log(self.class_prior[i])
            n_ij = - 0.5 * np.sum(np.log(np.pi * self.sigma[i, :]))
            n_ij -= 0.5 * np.sum(((X - self.theta[i, :]) ** 2) / \
@@ -208,7 +208,7 @@ class BaseDiscreteNB(BaseNB):
         _joint_log_likelihood(X) as per BaseNB
     """
 
-    def fit(self, X, y, class_prior=None):
+    def fit(self, X, y, sample_weight=None, class_prior=None):
         """Fit Naive Bayes classifier according to X, y
 
         Parameters
@@ -220,6 +220,9 @@ def fit(self, X, y, class_prior=None):
         y : array-like, shape = [n_samples]
             Target values.
 
+        sample_weight : array-like, shape = [n_samples], optional
+            Weights applied to individual samples (1. for unweighted).
+
         class_prior : array, shape [n_classes]
             Custom prior probability per class.
             Overrides the fit_prior parameter.
@@ -239,28 +242,32 @@
                   "masks (use `indices=True` in CV)."
             raise ValueError(msg)
 
-        self.unique_y, inv_y_ind = unique(y, return_inverse=True)
-        n_classes = self.unique_y.size
+        self._classes, inv_y_ind = unique(y, return_inverse=True)
+        n_classes = self._classes.size
 
+        Y = LabelBinarizer().fit_transform(y)
+        if Y.shape[1] == 1:
+            Y = np.concatenate((1 - Y, Y), axis=1)
+
+        if sample_weight is not None:
+            Y *= array2d(sample_weight).T
+
         if class_prior:
             assert len(class_prior) == n_classes, \
                 'Number of priors must match number of classes'
             self.class_log_prior_ = np.log(class_prior)
         elif self.fit_prior:
-            y_count = np.bincount(inv_y_ind)
-            self.class_log_prior_ = np.log(y_count) - np.log(len(y))
+            # empirical prior, with sample_weight taken into account
+            y_freq = Y.sum(axis=0)
+            self.class_log_prior_ = np.log(y_freq) - np.log(y_freq.sum())
         else:
             self.class_log_prior_ = np.zeros(n_classes) - np.log(n_classes)
 
-        Y = LabelBinarizer().fit_transform(y)
-        if Y.shape[1] == 1:
-            Y = np.concatenate((1 - Y, Y), axis=1)
-
         N_c, N_c_i = self._count(X, Y)
 
         self.feature_log_prob_ = (np.log(N_c_i + self.alpha)
-                                  - np.log(N_c.reshape(-1, 1)
-                                  + self.alpha * X.shape[1]))
+                                  - np.log(N_c.reshape(-1, 1)
+                                           + self.alpha * X.shape[1]))
 
         return self

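The mechanism in the fit() hunk above is compact: LabelBinarizer yields an indicator matrix Y with one row per sample and one column per class, so scaling each row by its sample weight turns the column sums into weighted class counts, and those counts drive both the empirical prior and the per-class feature counts from _count(X, Y). A standalone numpy sketch of the prior computation, using the same weights as the test below:

    import numpy as np

    Y = np.array([[1, 0],   # sample 0 -> class 0
                  [1, 0],   # sample 1 -> class 0
                  [0, 1]])  # sample 2 -> class 1
    sample_weight = np.array([1., 1., 4.])

    Y = Y * sample_weight[:, np.newaxis]  # as in `Y *= array2d(sample_weight).T`
    y_freq = Y.sum(axis=0)                # weighted class counts: [2., 4.]
    class_log_prior = np.log(y_freq) - np.log(y_freq.sum())
    print(np.exp(class_log_prior))        # [1/3, 2/3]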
sklearn/tests/test_naive_bayes.py (9 changes: 9 additions & 0 deletions)
@@ -108,3 +108,12 @@ def test_discretenb_uniform_prior():
     clf.fit([[0], [0], [1]], [0, 0, 1])
     prior = np.exp(clf.class_log_prior_)
     assert_array_equal(prior, np.array([.5, .5]))
+
+
+def test_sample_weight():
+    clf = MultinomialNB()
+    clf.fit([[1, 2], [1, 2], [1, 0]],
+            [0, 0, 1],
+            sample_weight=[1, 1, 4])
+    assert_array_equal(clf.predict([1, 0]), [1])
+    assert_array_almost_equal(np.exp(clf.intercept_), [1 / 3., 2 / 3.])
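The expected numbers follow from the weighted counts, worked out by hand: class 0 gets total weight 1 + 1 = 2 and class 1 gets 4, so the fitted priors are 2/6 = 1/3 and 4/6 = 2/3, which is exactly what the exp(clf.intercept_) assertion checks. With the default alpha = 1 smoothing, the weighted feature counts likewise tilt the likelihood of [1, 0] toward class 1 (2/3 * 5/6 = 5/9 versus 1/3 * 3/8 = 1/8), hence the predicted label.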
