In [1]:
import os.path as op
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter 

In [2]:
# Load data
print("Loading dataset")

from glob import glob
filenames_neg = sorted(glob(op.join('..', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('..', 'data', 'imdb1', 'pos', '*.txt')))

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

Loading dataset
2000 documents


In [3]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    vocabulary = Counter([])
    for text in texts:
        #pass
        list1 = text.split()
        dir1 = Counter(list1)
        vocabulary = vocabulary + dir1
    vocabulary = dict(vocabulary)
    n_features = len(vocabulary)

    counts = np.zeros(n_features)
    j=0
    for i in vocabulary:
        counts[j] = vocabulary[i]
        j +=1
    
    return vocabulary, counts

In [4]:
def CountDocsInClass(y,c):
    count = 0
    for i in y:
        if i == c:
            count = count + 1
    return count

def ConcatenateTextOfAllDocsInClass(D1, c):
    y = zip(*D1)[1]
    D = zip(*D1)[0]
    text_c = ''
    for i in np.arange(0, len(D)):
        if y[i] == c:
            text_c = text_c + D[i]
    return text_c

def ExtractTokensFromDoc(V, d):
    d = d.split()
    i = 0
    W = []
    for v in V:
        if v in d:
            W.append(i)
        i +=1
    return W  

def TrainMultinomialNB(C, D1):  
    y = zip(*D1)[1]
    D = zip(*D1)[0]
    V,counts = count_words(D)
    N = len(D)
    prior = np.zeros(len(C))
    condprob = np.zeros((len(V), len(C)))
    i = 0
    for c in C:
        Nc = CountDocsInClass(y,c)
        prior[i] = float(Nc)/N 
        text_c = ConcatenateTextOfAllDocsInClass(D1, c)
        
        V1 = text_c.split()
        T_ct = np.zeros(len(V))
        ii = 0
        for m in V:
            for l in V1:
                if l == m:
                    T_ct[ii] += 1
            ii += 1

        j = 0
        for t in V:
            condprob[j][i] = float(T_ct[j]+1)/(sum(T_ct+1)) 
            j += 1
        i += 1
    return V, prior, condprob

def ApplyMultinomialNB(C,V, prior, condprob, d):
    W = ExtractTokensFromDoc(V, d)
    score = np.zeros(len(C))
    i = 0
    for c in C:
        score[i] = np.log(prior[i])
        for j in W:
            score[i] += np.log(condprob[j][i])
        i = i + 1
    if score[0] > score[1] :
        return 0
    else:
        return 1

In [5]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass

    def fit(self, X, y):
        D1 = zip(X, y)
        self.C = [0, 1]
        self.V, self.prior, self.condprob = TrainMultinomialNB(self.C, D1)
        return

    def predict(self, X):
        d = X
        return ApplyMultinomialNB(self.C,self.V, self.prior, self.condprob, d)
        #return (np.random.randn(len(X)) > 0).astype(np.int)

    def score(self, X, y):
        correct = 0
        for i in np.arange(0, len(X)):
            if ApplyMultinomialNB(self.C,self.V, self.prior, self.condprob, X[i]) == y[i]:
                correct += 1
        return float(correct) / len(X)  
        #return np.mean(self.predict(X) == y)

In [6]:
nb = NB()
nb.fit(texts[::2], y[::2])

In [7]:
print "The score is " + str(nb.score(texts[1::2], y[1::2]))

The score is 0.836


In [8]:
#cross validation 
X_cv = texts[400:525] + texts[1400:1525]
y_cv = np.zeros(250)
y_cv[:125] = y[400:525]
y_cv[125:] = y[1400:1525]
#1
X_train = X_cv[50:]
y_train = y_cv[50:]
X_test = X_cv[:50]
y_test = y_cv[:50]
nb1 = NB()
nb1.fit(X_train, y_train)
score1 = nb1.score(X_test, y_test)

In [9]:
#2
X_train = X_cv[:50] + X_cv[100:]
y_train = np.zeros(200)
y_train[:50] = y_cv[:50]
y_train[50:] = y_cv[100:]
X_test = X_cv[50:100]
y_test = y_cv[50:100]
nb2 = NB()
nb2.fit(X_train, y_train)
score2 = nb2.score(X_test, y_test)

In [10]:
#3
X_train = X_cv[:100] + X_cv[150:]
y_train = np.zeros(200)
y_train[:100] = y_cv[:100]
y_train[100:] = y_cv[150:]
X_test = X_cv[100:150]
y_test = y_cv[100:150]
nb3 = NB()
nb3.fit(X_train, y_train)
score3 = nb3.score(X_test, y_test)

In [11]:
#4
X_train = X_cv[:150] + X_cv[120:]
y_train = np.zeros(200)
y_train[:150] = y_cv[:150]
y_train[150:] = y_cv[200:]
X_test = X_cv[150:200]
y_test = y_cv[150:200]
nb4 = NB()
nb4.fit(X_train, y_train)
score4 = nb4.score(X_test, y_test)

In [12]:
#5
X_train = X_cv[:200]
y_train = y_cv[:200]
X_test = X_cv[200:]
y_test = y_cv[200:]
nb5 = NB()
nb5.fit(X_train, y_train)
score5 = nb5.score(X_test, y_test)

In [13]:
print "The score of cross validation is " + str((score1+score2+score3+score4+score5)/5.0)

The score of cross validation is 0.584


In [14]:
#in order to ingore the words in english.stop
filenames_stop = sorted(glob(op.join('..', 'data', 'english.stop')))
texts_stop = open('english.stop').read() 

In [15]:
def count_words2(texts):
    vocabulary = Counter([])
    for text in texts:
        #pass
        list1 = list(word for word in text.split() if word not in texts_stop)
        dir1 = Counter(list1)
        vocabulary = vocabulary + dir1
    vocabulary = dict(vocabulary)
    n_features = len(vocabulary)

    counts = np.zeros(n_features)
    j=0
    for i in vocabulary:
        counts[j] = vocabulary[i]
        j +=1
    
    return vocabulary, counts

In [16]:
def TrainMultinomialNB2(C, D1):  
    y = zip(*D1)[1]
    D = zip(*D1)[0]
    V,counts = count_words2(D)
    N = len(D)
    prior = np.zeros(len(C))
    condprob = np.zeros((len(V), len(C)))
    i = 0
    for c in C:
        Nc = CountDocsInClass(y,c)
        prior[i] = float(Nc)/N 
        text_c = ConcatenateTextOfAllDocsInClass(D1, c)
        
        V1 = text_c.split()
        T_ct = np.zeros(len(V))
        ii = 0
        for m in V:
            for l in V1:
                if l == m:
                    T_ct[ii] += 1
            ii += 1

        j = 0
        for t in V:
            condprob[j][i] = float(T_ct[j]+1)/(sum(T_ct+1))
            j += 1
        i += 1
    return V, prior, condprob

In [17]:
class NB2(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass

    def fit(self, X, y):
        D1 = zip(X, y)
        self.C = [0, 1]
        self.V, self.prior, self.condprob = TrainMultinomialNB2(self.C, D1)
        return

    def predict(self, X):
        d = X
        return ApplyMultinomialNB(self.C,self.V, self.prior, self.condprob, d)
        #return (np.random.randn(len(X)) > 0).astype(np.int)

    def score(self, X, y):
        correct = 0
        for i in np.arange(0, len(X)):
            if ApplyMultinomialNB(self.C, self.V, self.prior, self.condprob, X[i]) == y[i]:
                correct += 1
        return float(correct) / len(X)  
        #return np.mean(self.predict(X) == y)

In [18]:
nb = NB2()
nb.fit(texts[::2], y[::2])

In [19]:
print "The score after ingoring the “stop words” in the file is " + str(nb.score(texts[1::2], y[1::2]))

The score after ingoring the “stop words” in the file is 0.832
