In [1]:
from concurrent import futures
import nltk
from nltk.corpus import reuters, stopwords
from scipy import sparse
import string
from collections import defaultdict, Counter
import numpy as np
import os
import sys

from nltk.tokenize import word_tokenize
import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.util import ngrams
import re
NUM_THREADS = 24s

nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')

reuters_freqThreshold = 50000
shakespeare_freqThreshold = 100000

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Reuters Data Scrubbing

In [2]:
all_files = reuters.fileids()
file_count = len(all_files)
print(len(reuters.fileids()))
punctuation = string.punctuation.replace("'", "")
stopset = set(stopwords.words("english"))

def thread_task(rawWords):
    freqMap = defaultdict(int)
    #convert to lowercase
    lower = [word.lower() for word in rawWords]
    
    #remove punctuation from tokens
    punc_filter = str.maketrans('', '', punctuation)
    stripped = [word.translate(punc_filter) for word in lower]
    
    #remove remaining alphanumerics
    words = [word for word in stripped if word.isalpha()]
    
    #filter out basic stopwords
    cleaned_word_bank = [word for word in words if word not in stopset]
    
    for word in cleaned_word_bank:
        freqMap[word] += 1
    return freqMap

def thread_exec(WORDS):
    with futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as ex:
        results = list(ex.map(thread_task, WORDS))
    return results


10788


In [3]:
freqMap = defaultdict(int)
REUTERS_WORDS = [[str(word) for word in reuters.words(file)] for file in reuters.fileids()]
t = thread_exec(REUTERS_WORDS)
for ifreqMap in t:
    for k in ifreqMap.keys():    
        freqMap[k] += ifreqMap[k]    
        
freqTuples = list(freqMap.items())
sorted_freq = freqTuples.sort(key= lambda x: x[1], reverse=True)
topTuples = freqTuples[:reuters_freqThreshold]
filtered_vocab = [tup[0] for tup in topTuples]
reuters_vocab_np = np.asarray(filtered_vocab, dtype='str')
print(reuters_vocab_np.shape)

(29027,)


## Shakespeare Data Scrubbing

In [4]:
folder = './works'
sub = "[^a-zA-Z' ]+"
all_files = [file for t,y, file in os.walk(folder)][0]
stopset = set(stopwords.words("english"))

def thread_task(rawWords):
    freqMap = defaultdict(int)
    words = [re.sub(sub, '', word) for word in rawWords]
    lower = [word.lower() for word in words]
    words = [word for word in lower if word.isalpha()]
    cleaned_word_bank = [word for word in words if word not in stopset]    
    
    for word in cleaned_word_bank:
        freqMap[word] += 1
    return freqMap

def thread_exec(WORDS):
    with futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as ex:
        results = list(ex.map(thread_task, WORDS))
    return results
    

In [5]:
freqMap = defaultdict(int)
SHAKESPEARE_WORDS = [[str(word) for word in open(folder+'/'+file).read().split()] for file in all_files]
t = thread_exec(SHAKESPEARE_WORDS)
for ifreqMap in t:
    for k in ifreqMap.keys():    
        freqMap[k] += ifreqMap[k]   
        
freqTuples = list(freqMap.items())
sorted_freq = freqTuples.sort(key= lambda x: x[1], reverse=True)
topTuples = freqTuples[:shakespeare_freqThreshold]
filtered_vocab2 = [tup[0] for tup in topTuples]
shakespeare_vocab_np = np.asarray(filtered_vocab, dtype='str')
print(shakespeare_vocab_np.shape)

(29027,)


# PreProcessing

In [8]:
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display

vocab = reuters_vocab_np
for word in shakespeare_vocab_np:
    if(word not in vocab):
        vocab.append(word)
print(vocab.shape)


shakespeare_data = []
i=0
punc_filter = str.maketrans('', '', punctuation)
progress = IntProgress(min=0, max=len(SHAKESPEARE_WORDS))
label = HTML()
box = VBox(children=[label, progress])
display(box)
for doc in SHAKESPEARE_WORDS:
    shakespeare_data.append(np.zeros(vocab.shape))
    #print(doc)
    for word in doc:
        word = re.sub(sub,'',word).lower()
        shakespeare_data[i][vocab==word] = 1.0
    i+=1
    progress.value += 1
    label.value = u'{name}: {index} / {size}'.format(
                        name="Docs",
                        index=i,
                        size=len(SHAKESPEARE_WORDS)
                    )
shakespeare_data = np.array(shakespeare_data)   


reuters_data = []
i=0
punc_filter = str.maketrans('', '', punctuation)
progress = IntProgress(min=0, max=len(REUTERS_WORDS))
label = HTML()
box = VBox(children=[label, progress])
display(box)
for doc in REUTERS_WORDS:
    reuters_data.append(np.zeros(vocab.shape))
    for word in doc:
        word = word.lower()
        word = word.translate(punc_filter)
        reuters_data[i][vocab==word] = 1.0
    i+=1   
    progress.value += 1
    label.value = u'{name}: {index} / {size}'.format(
                        name="Docs",
                        index=i,
                        size=len(REUTERS_WORDS)
                    )
reuters_data = np.array(reuters_data)


['said' 'mln' 'vs' ... 'genecor' 'additivies' 'krn']


VBox(children=(HTML(value=''), IntProgress(value=0, max=42)))

(42, 29027)
['said' 'mln' 'vs' ... 'genecor' 'additivies' 'krn']


VBox(children=(HTML(value=''), IntProgress(value=0, max=10788)))

(10788, 29027)


In [9]:
X = shakespeare_data
y = np.zeros(X.shape[0])

X = np.concatenate((X, reuters_data), axis=0)
y = np.concatenate((y, np.zeros(reuters_data.shape[0])), axis=0)

print(X.shape)
print(y.shape)

X.dump('X')
y.dump('y')


(10830, 29027)
(10830,)


# Training

In [10]:
X = np.load('X')
y = np.load('y')

In [11]:
def difference(l1,l2):
    precisionError = len([b for a,b in zip(l1,l2) if b != a and b == 0 and a == 1])
    recallError = len([b for a,b in zip(l1,l2) if b != a and b == 1 and a == 0])
    print("Precision Error is " + str(precisionError))
    print("Recall Error is " + str(recallError))

    return (precisionError) + (recallError)

def runModels(Xtr, Ytr):
    kf = KFold(n_splits=5)    
    differencesB = []
    differencesG = []

    for train_index, test_index in kf.split(Xtr):
        x_train, x_test = [Xtr[i] for i in train_index], [Xtr[i] for i in test_index]
        y_train, y_test = [Ytr[i] for i in train_index], [Ytr[i] for i in test_index]
        
        clf = BernoulliNB()
        clf.fit(x_train,y_train)
        
        print(clf.predict(x_test))
        differencesB.append(difference(clf.predict(x_test), y_test) / len(y_test))

        clfb = GaussianNB()
        clfb.fit(x_train,y_train)
        
        print(clfb.predict(x_test))
        differencesG.append(difference(clfb.predict(x_test), y_test) / len(y_test))

    return differencesB, differencesG

In [12]:
print(runModels(X, y))

  self.class_log_prior_ = (np.log(self.class_count_) -


[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
([0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0])


# Validation