Naive Bayes Classifier
======================
Fitting
-------
Bernoulli Naive Bayes (Taken from Murphy's book)
<img src="fit.png",width=600>
Prediction
----------
$$ p(y=c\,|\,\mathbf{x},\mathcal{D}) \propto \pi_c\prod_{j=1}^{D}(\theta_{jc})^{x_j}(1-\theta_{jc})^{1-x_j} $$

In [1]:
import random
import nltk
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
nltk.download('movie_reviews')

Preprocessing
-------------

In [2]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [3]:
def get_features(text):
    document_words = set(text)
    features = []
    for word in word_features:
        features.append(word in document_words)
    return features

In [4]:
#print(get_features(movie_reviews.words('pos/cv957_8737.txt'))) 
xData=[]
yData=[]
for (d,c) in documents:
    xData.append(get_features(d))
    if c=='pos':
        yData.append(True)
    else:
        yData.append(False)    

Classification
--------------

In [5]:
#clf = MultinomialNB()
clf = BernoulliNB()
print(cross_val_score(clf, xData, yData, cv=10))
#clf.fit(xData, yData)
#accuracy_score(yData, clf.predict(xData))
#f1_score(yData, clf.predict(xData), average='binary')  

[ 0.695  0.65   0.63   0.665  0.685  0.675  0.695  0.69   0.69   0.76 ]


<img src="XV.png",width=300>