In [195]:
import os.path as op
import numpy as np
import string

from sklearn.base import BaseEstimator, ClassifierMixin

In [196]:
print("Loading dataset")

# Loading datasets
from glob import glob
filenames_neg = sorted(glob(op.join('..', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('..', 'data', 'imdb1', 'pos', '*.txt')))
filenames_stop_words = sorted(glob(op.join('..','data','english.stop')))

# Get the list of stop words

########################################
#### CHANGE HERE FOR THE STOP_WORDS ####
#stop_words = [open(f).read() for f in filenames_stop_words][0].split() #question 5
stop_words = []
########################################

print("Done!")

Loading dataset
Done!


In [197]:
# Define labeled texts
texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

2000 documents


In [198]:
def remove_punctionation_and_stopwords(text_list) :
    """Process text : Remove punctiation and stop words from a list of words"""
    exclude = set(string.punctuation)
    new_text = [word for word in text_list if word not in exclude and word not in stop_words]
    return new_text

In [199]:
def get_vocabulary(texts):
    """Return a set of all the word in a corpus of pre-processed texts"""
    vocabulary = set()
    for text in texts:
        text = remove_punctionation_and_stopwords(text.split())
        for word in text :
            vocabulary.add(word)
    return vocabulary


In [200]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    print("Processing corpus of texts")
    vocabulary_set = list(get_vocabulary(texts))
    counts = np.zeros((len(texts),len(vocabulary_set)))
    vocabulary = {}
    for i, elem in enumerate(vocabulary_set) :
        vocabulary[elem] = i
    for i in range(len(texts)) :
        text = remove_punctionation_and_stopwords(texts[i].split())
        for word in text :
            counts[i,vocabulary[word]] = counts[i,vocabulary[word]] + 1
    return vocabulary, counts

In [203]:
def countDocsInClass(y,c) :
    """ return the number of docs that are labeled c """
    return np.sum((y==c).astype(np.int32))

In [204]:
def concatenateTextOfAllDocsInClass(X,y,c) :
    """ return the number of time each word was found in the corpus X of texts that are labeled c 
    Parameters
    ----------
    X : count_words(texts)[1]
    y : array of labels
    c : label ("class") {0,1}
    Returns
    -------"""
    assert X.shape[0] == len(y)
    concatenated = np.zeros(X.shape[1])
    for i in range(X.shape[0]) :
        if int(y[i]) == c :
            concatenated += X[i,:]
    return concatenated

In [205]:
def countTokensOfTerm(hist_of_words_c,word_index) :
    """ return the number of time a token appeared in a corpus of texts that are labeled c"""
    return hist_of_words_c[word_index]

def extractTokensFromDoc(doc) :
    """ return all tokens that appear at least once in the doc"""
    return np.nonzero(doc)

In [101]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self, allClasses = [0,1]):
        self.nDocs = len(texts)
        self.prior = [] # P(class)
        self.condProb = np.zeros((len(vocabulary),len(allClasses))) # initialize P(word|class)

    def fit(self, X, y, allClasses = [0,1]):
        print("Training model ...")
        for c in allClasses :
            Nc = countDocsInClass(y,c) # Number of docs in class c
            self.prior.append(Nc/self.nDocs) # prior[c] = Nc/nDocs
            hist_of_words_c = concatenateTextOfAllDocsInClass(X,y,c)
            vocabulary_size = len(hist_of_words_c[hist_of_words_c > 0])
            l = len(vocabulary)
            for word in vocabulary :
                word_index = vocabulary[word] #return the index of the word in the vocabulary
                nTokens_c = countTokensOfTerm(hist_of_words_c,word_index) # return how many times the word appears in the class c docs
                self.condProb[word_index,c] = (nTokens_c+1)/(np.sum(hist_of_words_c)+vocabulary_size) # update P(word|class) according to Laplace smoothing
        return self.prior, self.condProb

    def predict(self, X, allClasses = [0,1]):
        print("Predicting  ...")
        array_score = np.zeros(X.shape[0])
        last = 0 
        for num_doc in range(X.shape[0]) :
            if last != int(num_doc/X.shape[0]*100) :
                last = int(num_doc/X.shape[0]*100)
                #print(last, " % done", end = '\r')
            score = [0,0]
            word_indexes = extractTokensFromDoc(X[num_doc,:])
            for c in allClasses :
                score[c] = np.log(self.prior[c]) # 0.5 for each
                for word_index in list(word_indexes)[0] :
                    score[c] += np.log(self.condProb[word_index,c]) # adding log of probabilities
            array_score[num_doc] = np.argmax(np.array(score))
        return array_score

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

In [207]:
if __name__ == '__main__' :
    vocabulary, X = count_words(texts)
    print(X.shape)
    # Try to fit, predict and score
    # We only split the dataset in two halves, future works will test a p-folds version.
    nb = NB()
    nb.fit(X[::2], y[::2])
    print("accuracy = {acc} %".format(acc = 100 * nb.score(X[1::2], y[1::2])))

Processing corpus of texts
(2000, 50894)
Training model ...
Predicting  ...
accuracy = 81.0 %


In [83]:
def split_data(X,y,n) :
    train_size = int(X.shape[0]*(1-1/n))
    indices = np.random.permutation(X.shape[0])
    training_idx, test_idx = indices[:train_size], indices[train_size:]
    X_train, X_test, y_train, y_test = X[training_idx,:], X[test_idx,:], y[training_idx], y[test_idx]
    return X_train, X_test, y_train, y_test
    

In [112]:
def cross_validation(X,y,n) :
    nb = NB()
    scores = []
    for i in range(n):
        print("Processing fold %d out of %d"%(i+1,n))
        X_train, X_test, y_train, y_test = split_data(X,y,n)
        nb.fit(X_train,y_train)
        acc = nb.score(X_test,y_test)
        scores.append(acc)
        print("accuracy = {n} % \n".format(n = 100*acc))
    return np.mean(np.array(scores))
    

In [114]:
accuracy = cross_validation(X,y,5)
print("mean accuracy = {n} % \n".format(n = 100*accuracy))

Processing fold 1 out of 5
Training model ...
Predicting  ...
accuracy = 81.25 % 

Processing fold 2 out of 5
Training model ...
Predicting  ...
accuracy = 81.0 % 

Processing fold 3 out of 5
Training model ...
Predicting  ...
accuracy = 82.25 % 

Processing fold 4 out of 5
Training model ...
Predicting  ...
accuracy = 85.0 % 

Processing fold 5 out of 5
Training model ...
Predicting  ...
accuracy = 81.5 % 

mean accuracy = 82.2 % 



#### Finding the word with the highest conditional probability  

In [300]:
def max_proba_stop_words(stop_words,vocabulary,model) :
    indexes = [vocabulary[word] for word in stop_words if word in vocabulary]
    max_proba = np.max(abs(model.condProb[indexes,1]-model.condProb[indexes,0]))
    return indexes, max_proba

stop_words = []
nb = NB()
nb.fit(X[::], y[::])

Training model ...


([0.5, 0.5], array([[4.65155944e-06, 1.38731275e-06],
        [1.55051981e-06, 6.93656374e-06],
        [1.55051981e-06, 2.77462549e-06],
        ...,
        [1.55051981e-06, 2.77462549e-06],
        [6.20207925e-06, 2.77462549e-06],
        [3.10103962e-06, 2.77462549e-06]]))

In [301]:
indexes, max_proba = max_proba_stop_words([open(f).read() for f in filenames_stop_words][0].split(),vocabulary,nb)

In [302]:
max_proba

0.0032954539444079722

In [306]:
for index in indexes :
    p = nb.condProb[index,:]
    if np.max(p) > 0.05 :
        print(p)
        print(index)
        
for word in vocabulary : 
    p = nb.condProb[vocabulary[word],:]
    if np.max(p) > 0.05 :
        print(p)
        print(vocabulary[word])
        
# the word nb 30941 has the highest conditional probability, and it is a stop_word.

[0.05406973 0.05730711]
30941
[0.05406973 0.05730711]
30941


In [307]:
for word in vocabulary.keys() :
    if vocabulary[word] == 30941 :
        print(word)

the
