# ナイーブベイズクラス分類器

ナイーブベイズクラス分類器は線形モデルによく似た分類器。    
線形モデルより高速だが汎化性能が劣る。

- 特徴

```
線形モデルに似たクラス分類器．
訓練が線形モデルよりも高速．
速度の代償として，汎化機能ではLogisticRegression, LinearSVCよりも僅かに劣る．

高速で学習出来る理由として，クラスに対する統計値をここの特徴量ごとに集めてパラメータを学習するからである．
```

- 種類
    - GaussianNB: 任意の連続値
    - BernoulliNB: 二値データ
    - MultinomialNB: カウントデータ

In [2]:
import numpy as np

X = np.array([[0, 1, 0, 1],
                        [1, 0, 1, 1],
                        [0, 0, 0, 1],
                        [1, 0, 1, 0]])

y = np.array([0, 1, 0, 1])

ゼロと非ゼロの要素数のカウントのコーディング例

In [3]:
counts = {}
for label in np.unique(y):
    print("label", label)
    counts[label] = X[y == label].sum(axis=0)
print("Feature counts:\n{}".format(counts))

label 0
label 1
Feature counts:
{0: array([0, 1, 0, 2]), 1: array([2, 0, 2, 1])}


# GaussianNB

ユリデータ分類

In [2]:
import mglearn
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
from sklearn import datasets
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train, y_train)
gnb.score(X_test, y_test)

1.0

In [79]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn import metrics
# from sklearn.datasets import fetch_20newsgroups
# from sklearn.feature_extraction.text import CountVectorizer

# categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
# newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# vectorizer = CountVectorizer()
# vectors_test = vectorizer.transform(newsgroups_test.data)
# clf = MultinomialNB(alpha=.01)
# clf.fit(vectors, newsgroups_train.target)
# pred = clf.predict(vectors_test)
# metrics.f1_score(newsgroups_test.target, pred, average='macro')

In [80]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
newsgroups_train = fetch_20newsgroups(subset='train',
                                       remove=('headers', 'footers', 'quotes'),
                                       categories=categories)
vectors = vectorizer.fit_transform(newsgroups_train.data)
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.83519815555028587

### ナイーブベイズの利点, 欠点, パラメータ

- パラメータ

```
alpha: モデルの複雑さを制御する．
alphaが大きいとスムーズになり，モデルの複雑さが減少する．

GaussianNBは高次元データ，他の二つはテキストのようなカウントデータに対して用いられる．
MultinomialNBよりBernoulliNBの方が若干性能がいいが，非ゼロ特徴量(大きなドキュメント)にはMultinomialNBが有効
```

    
ナイーブベイズモデルの利点と欠点の多くは線形モデルと共通する。    
訓練も予測も非常に高速で、訓練の過程も理解しやすい。    
高次元の疎なデータに対してうまく機能するし、パラメータの設定に対しても頑健である。    
線形モデルですら時間のかかるような大規模データに対するベースラインモデルとして非常に有用。

<hr>

In [95]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
newsgroups_train = fetch_20newsgroups(subset='train',
                                       remove=('headers', 'footers', 'quotes'),
                                       categories=categories)
vectors = vectorizer.fit_transform(newsgroups_train.data)
print(vectors.shape)
print(vectors[:2,:])

# vectors_test = vectorizer.transform(newsgroups_test.data)

# print(vectors_test[:5])
# iris = datasets.load_iris()
# X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)

#http://www.kamishima.net/mlmpyja/nbayes1/fit1.html
#https://github.com/krzjoa/Bayes/blob/master/bayes/classifiers/cnb.py

(2034, 26879)
  (0, 21225)	1
  (0, 22504)	1
  (0, 10874)	1
  (0, 4030)	1
  (0, 5604)	1
  (0, 10651)	1
  (0, 10701)	2
  (0, 11962)	1
  (0, 21173)	1
  (0, 20041)	1
  (0, 2408)	1
  (0, 4326)	1
  (0, 14847)	1
  (0, 21177)	1
  (0, 24057)	2
  (0, 21409)	1
  (0, 2427)	1
  (0, 15444)	1
  (0, 21284)	1
  (0, 9935)	1
  (0, 17134)	1
  (0, 24082)	7
  (0, 12920)	3
  (0, 23103)	1
  (0, 17121)	1
  :	:
  (1, 23758)	1
  (1, 16783)	1
  (1, 24212)	1
  (1, 26270)	1
  (1, 10165)	1
  (1, 7972)	1
  (1, 3341)	1
  (1, 22163)	1
  (1, 26066)	1
  (1, 14330)	2
  (1, 6887)	1
  (1, 9744)	1
  (1, 4252)	1
  (1, 21696)	1
  (1, 10651)	1
  (1, 11962)	1
  (1, 4326)	1
  (1, 14847)	1
  (1, 24082)	1
  (1, 3254)	1
  (1, 13712)	1
  (1, 24380)	4
  (1, 3042)	1
  (1, 26405)	1
  (1, 24080)	1


In [19]:
# # alpha = 1
# # weight_normalized=False
# # complement_features_ = None
# # alpha_sum_ = None
# import numpy as np


# X = np.array([[1,1,1,1,1],[0,0,0,0,0], [0,0,0,0,0], [1,1,1,1,1]])
# y = [1,0,0,1]

# pY_ = None
# pXgY_ = None
# n_samples = X.shape[0]
# n_features = X.shape[1]
# n_classes = 2
# n_fvalues = 2

# print(n_features)

5


In [24]:
# def fit(X, y):
#     nY = np.zeros(n_classes, dtype=int)
#     for i in range(n_samples):
#         nY[y[i]] += 1

#     pY_ = np.empty(n_classes, dtype=float)
#     print(pY_)
#     for i in range(n_classes):
#         pY_[i] = nY[i] / n_samples

#     nXY = np.zeros((n_features, n_fvalues, n_classes), dtype=int)
#     for i in range(n_samples):
#         for j in range(n_features):
#             nXY[j, X[i, j], y[i]] += 1

#     pXgY_ = np.empty((n_features, n_fvalues, n_classes), dtype=float)
#     print(pXgY_)
#     for j in range(n_features):
#         for xi in range(n_fvalues):
#             for yi in range(n_classes):
#                 pXgY_[j, xi, yi] = nXY[j, xi, yi] / float(nY[yi])
    
# def predict(X):
#     print(pY_)
#     logpXY = np.log(pY_)
#     for j in range(n_features):
#         logpXY = logpXY + np.log(pXgY_[j, xi[j], :])
#     y[i] = np.argmax(logpXY)


In [26]:
# fit(X, y)

[  9.88131292e-324   9.88131292e-324]
[[[  9.88131292e-324   0.00000000e+000]
  [  0.00000000e+000   9.88131292e-324]]

 [[  9.88131292e-324   0.00000000e+000]
  [  0.00000000e+000   9.88131292e-324]]

 [[  9.88131292e-324   0.00000000e+000]
  [  0.00000000e+000   9.88131292e-324]]

 [[  9.88131292e-324   0.00000000e+000]
  [  0.00000000e+000   9.88131292e-324]]

 [[  9.88131292e-324   0.00000000e+000]
  [  0.00000000e+000   9.88131292e-324]]]


In [27]:
predict(X)

None


AttributeError: 'NoneType' object has no attribute 'log'

In [70]:
# cnb = ComplementNB()
# cnb.fit(X_train, y_train).score(X_test, y_test)


# alpha = 1
# weight_normalized=False
# complement_features_ = None
# alpha_sum_ = None

# pY_ = None
# pXgY_ = None
# n_samples = X.shape[0]
# n_features = X.shape[1]
# n_classes = 2
# n_fvalues = 2

# if n_samples != len(y):
#     raise ValueError('Mismatched number of samples.')

import numpy as np


# X = np.array([[1,1,1,1,1],[0,0,0,0,0], [0,0,0,0,0], [1,1,1,1,1]])
# y = [1,0,0,1]

class ComplementNB(object):
    def __init__(self):
        self.pY_ = None
        self.pXgY_ = None

    def fit(self, X, y):
        n_samples = X.shape[0]
        n_features = X.shape[1]
        if n_samples != len(y):
            raise ValueError('Mismatched number of samples.')

        
        nY = np.zeros(n_classes, dtype=int)
        print("nY.shape ", nY.shape)
        print("len(y) ", len(y))
        for i in range(n_samples):
            nY[y[i]] += 1

        self.pY_ = np.empty(n_classes, dtype=float)
        for i in range(n_classes):
            self.pY_[i] = nY[i] / n_samples

        nXY = np.zeros((n_features, n_fvalues, n_classes), dtype=int)
        for i in range(n_samples):
            for j in range(n_features):
                nXY[j, X[i, j], y[i]] += 1

        self.pXgY_ = np.empty((n_features, n_fvalues, n_classes),
                              dtype=float)
        for j in range(n_features):
            for xi in range(n_fvalues):
                for yi in range(n_classes):
                    self.pXgY_[j, xi, yi] = nXY[j, xi, yi] / float(nY[yi])

    def predict(self, X):
        logpXY = np.log(self.pY_)
#         for j in range(n_features):
#             logpXY = logpXY + np.log(self.pXgY_[j, xi[j], :])
        print(np.argmax(logpXY))



In [71]:
cnb = ComplementNB()
cnb.fit(X, y)

nY.shape  (2,)
len(y)  4


In [61]:
# np.array([1,1,1,1,1])
t0 = np.array([0,0,0,0,0])
t1 = np.array([1,1,1,1,1])

cnb.predict(t0)

0


In [43]:
cnb.predict(t1)

0


In [72]:
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)
# y_train = list(y_train)
cnb = ComplementNB()
cnb.fit(X_train, y_train)


nY.shape  (2,)
len(y)  112


IndexError: index 2 is out of bounds for axis 0 with size 2

In [45]:
X_train.shape

(112, 4)

In [46]:
y_train.shape

(112,)

In [51]:
y_train = list(y_train)


In [47]:
X = np.array([[1,1,1,1,1],[0,0,0,0,0], [0,0,0,0,0], [1,1,1,1,1]])
X.shape

(4, 5)

In [48]:
y = [1,0,0,1]
y.shape

AttributeError: 'list' object has no attribute 'shape'