In [21]:
from concurrent import futures
import nltk
from nltk.corpus import reuters, stopwords
from scipy import sparse
import string
from collections import defaultdict, Counter
import numpy as np
import os
import sys

from nltk.tokenize import word_tokenize
import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.util import ngrams
import re
# Size of the thread pool used by thread_exec when counting words per document.
NUM_THREADS = 24

# Fetch the NLTK resources this notebook downloads up front (no-op when already present).
for nltk_resource in ('reuters', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Number of most-frequent words kept from each corpus when building its vocabulary.
reuters_freqThreshold = 5000
shakespeare_freqThreshold = 5000

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\bgalk\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bgalk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bgalk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Reuters Data Scrubbing

In [22]:
# Reuters document ids and how many there are.
all_files = reuters.fileids()
file_count = len(all_files)
print(file_count)

# Characters stripped from tokens: all ASCII punctuation except the apostrophe.
punctuation = ''.join(ch for ch in string.punctuation if ch != "'")
# English stopwords, as a set for fast membership tests in thread_task.
stopset = set(stopwords.words("english"))

def thread_task(rawWords):
    """Count the cleaned words of one document.

    Each token is lowercased and stripped of the characters listed in the
    module-level `punctuation`; tokens that are then not purely alphabetic,
    or that appear in the module-level `stopset`, are discarded.

    rawWords: iterable of string tokens for a single document.
    Returns a defaultdict(int) mapping each surviving word to its count.
    """
    strip_table = str.maketrans('', '', punctuation)
    freqMap = defaultdict(int)
    for raw in rawWords:
        token = raw.lower().translate(strip_table)
        # isalpha() also rejects the empty string left by punctuation-only tokens.
        if token.isalpha() and token not in stopset:
            freqMap[token] += 1
    return freqMap

def thread_exec(WORDS):
    """Run thread_task on every document using a pool of NUM_THREADS threads.

    WORDS: iterable of documents, each a list of tokens.
    Returns the per-document results as a list, in the same order as WORDS.
    """
    pool = futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
    try:
        return list(pool.map(thread_task, WORDS))
    finally:
        # Mirrors leaving a `with` block: wait for the workers, then release them.
        pool.shutdown(wait=True)


10788


In [24]:
# Build the Reuters vocabulary: count cleaned words per document in worker
# threads, merge the per-document counts, and keep the most frequent words.
freqMap = defaultdict(int)
REUTERS_WORDS = [[str(word) for word in reuters.words(file)] for file in reuters.fileids()]
t = thread_exec(REUTERS_WORDS)
for ifreqMap in t:
    for k, count in ifreqMap.items():
        freqMap[k] += count

# sorted() returns the ordered list (list.sort() returns None), so sorted_freq
# is a real list of (word, count) pairs, most frequent first, rather than None.
freqTuples = sorted(freqMap.items(), key=lambda x: x[1], reverse=True)
sorted_freq = freqTuples
topTuples_r = freqTuples[:reuters_freqThreshold]
filtered_vocab = [tup[0] for tup in topTuples_r]
reuters_vocab_np = np.asarray(filtered_vocab, dtype='str')
print(reuters_vocab_np.shape)

(5000,)


## Shakespeare Data Scrubbing

In [25]:
# Folder holding the Shakespeare text files.
folder = './works'
# Characters removed from tokens: anything that is not an ASCII letter,
# an apostrophe or a space.
sub = "[^a-zA-Z' ]+"
# Fail early with a clear message instead of an IndexError from an empty walk.
if not os.path.isdir(folder):
    raise FileNotFoundError(f"Shakespeare corpus folder not found: {folder!r}")
# Only the files directly inside `folder` are used, so take the first
# os.walk entry instead of walking the whole tree and indexing [0].
all_files = next(os.walk(folder))[2]
stopset = set(stopwords.words("english"))

def thread_task(rawWords):
    """Count the cleaned words of one Shakespeare document.

    Characters matching the module-level regex `sub` are removed from each
    token, the token is lowercased, and it is kept only if it is purely
    alphabetic and not in the module-level `stopset`.

    rawWords: iterable of string tokens for a single document.
    Returns a defaultdict(int) mapping each surviving word to its count.
    """
    disallowed = re.compile(sub)
    freqMap = defaultdict(int)
    for raw in rawWords:
        token = disallowed.sub('', raw).lower()
        # Tokens that still contain an apostrophe or became empty fail isalpha().
        if not token.isalpha():
            continue
        if token in stopset:
            continue
        freqMap[token] += 1
    return freqMap

def thread_exec(WORDS):
    """Apply thread_task to each document on a pool of NUM_THREADS threads.

    WORDS: iterable of documents, each a list of tokens.
    Returns a list with one thread_task result per document, in input order.
    """
    with futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as pool:
        # Submit everything first, then collect in submission order.
        pending = [pool.submit(thread_task, document) for document in WORDS]
        per_document = [job.result() for job in pending]
    return per_document
    

In [26]:
# Build the Shakespeare vocabulary: read every file as whitespace-separated
# tokens, count cleaned words per file in worker threads, merge the counts,
# and keep the most frequent words.
freqMap = defaultdict(int)
SHAKESPEARE_WORDS = []
for file in all_files:
    # The context manager closes each handle as soon as the file is read.
    # NOTE(review): no encoding is given, so the platform default is used --
    # confirm it matches the corpus files.
    with open(os.path.join(folder, file)) as works_file:
        SHAKESPEARE_WORDS.append(works_file.read().split())
t = thread_exec(SHAKESPEARE_WORDS)
for ifreqMap in t:
    for k, count in ifreqMap.items():
        freqMap[k] += count

# sorted() returns the ordered list (list.sort() returns None), so sorted_freq
# is a real list of (word, count) pairs, most frequent first, rather than None.
freqTuples = sorted(freqMap.items(), key=lambda x: x[1], reverse=True)
sorted_freq = freqTuples
topTuples_s = freqTuples[:shakespeare_freqThreshold]
filtered_vocab = [tup[0] for tup in topTuples_s]
shakespeare_vocab_np = np.asarray(filtered_vocab, dtype='str')
print(shakespeare_vocab_np.shape)

(5000,)


## Outputting top words/frequencies to a file

In [27]:
# Write one "word, count" record per line for each corpus. Without the
# trailing newline all records run together on a single line. The `with`
# blocks close the files even if a write fails.
with open("reuters_frequencies.txt", "w+") as reuters_freq_file:
    for tup in topTuples_r:
        reuters_freq_file.write(f"{tup[0]}, {tup[1]}\n")

with open("shakes_frequencies.txt", "w+") as shakes_freq_file:
    for tup in topTuples_s:
        shakes_freq_file.write(f"{tup[0]}, {tup[1]}\n")

# PreProcessing

In [6]:
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display

# Merged vocabulary: every Reuters word first, followed by the Shakespeare
# words that the Reuters vocabulary does not already contain.
vocab = reuters_vocab_np
known_words = set(reuters_vocab_np)
tmp = [word for word in shakespeare_vocab_np if word not in known_words]
vocab = np.append(vocab, tmp)
print(vocab.shape)


# Binary bag-of-words matrix for Shakespeare: one row per document, one column
# per vocabulary word, 1.0 where the cleaned word occurs in the document.
# A word -> column dict makes each token a constant-time lookup instead of a
# comparison against the whole vocabulary array, and the matrix is allocated
# once instead of being grown as a list of rows and copied at the end.
# Assumes vocabulary entries are unique (they come from dict keys above).
vocab_index = {str(word): col for col, word in enumerate(vocab)}
shakespeare_data = np.zeros((len(SHAKESPEARE_WORDS), vocab.shape[0]))
i=0
progress = IntProgress(min=0, max=len(SHAKESPEARE_WORDS))
label = HTML()
box = VBox(children=[label, progress])
display(box)
for doc_row, doc in zip(shakespeare_data, SHAKESPEARE_WORDS):
    # doc_row is a view into shakespeare_data, so writes land in the matrix.
    for word in doc:
        # Same cleaning as the Shakespeare thread_task: strip, then lowercase.
        col = vocab_index.get(re.sub(sub,'',word).lower())
        if col is not None:
            doc_row[col] = 1.0
    i+=1
    progress.value += 1
    label.value = u'{name}: {index} / {size}'.format(
                        name="Docs",
                        index=i,
                        size=len(SHAKESPEARE_WORDS)
                    )


# Binary bag-of-words matrix for Reuters: one row per document, one column per
# vocabulary word, 1.0 where the cleaned word occurs in the document.
# A word -> column dict makes each token a constant-time lookup instead of a
# comparison against the whole vocabulary array. Allocating the matrix once
# avoids holding a list of rows plus the np.array copy at the same time.
# Assumes vocabulary entries are unique (they come from dict keys above).
vocab_index = {str(word): col for col, word in enumerate(vocab)}
reuters_data = np.zeros((len(REUTERS_WORDS), vocab.shape[0]))
i=0
punc_filter = str.maketrans('', '', punctuation)
progress = IntProgress(min=0, max=len(REUTERS_WORDS))
label = HTML()
box = VBox(children=[label, progress])
display(box)
for doc_row, doc in zip(reuters_data, REUTERS_WORDS):
    # doc_row is a view into reuters_data, so writes land in the matrix.
    for word in doc:
        # Same cleaning as the Reuters thread_task: lowercase, strip punctuation.
        col = vocab_index.get(word.lower().translate(punc_filter))
        if col is not None:
            doc_row[col] = 1.0
    i+=1
    progress.value += 1
    label.value = u'{name}: {index} / {size}'.format(
                        name="Docs",
                        index=i,
                        size=len(REUTERS_WORDS)
                    )


(8586,)


VBox(children=(HTML(value=''), IntProgress(value=0, max=42)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=10788)))

In [7]:
# Stack both corpora into one design matrix with matching labels:
# 0 = Shakespeare document, 1 = Reuters document.
shakespeare_labels = np.zeros(shakespeare_data.shape[0])
reuters_labels = np.ones(reuters_data.shape[0])

X = np.concatenate((shakespeare_data, reuters_data), axis=0)
y = np.concatenate((shakespeare_labels, reuters_labels), axis=0)

print(X.shape)
print(y.shape)

# X.dump('X')
# y.dump('y')


(10830, 8586)
(10830,)


# Training

In [8]:
# Reload the design matrix and labels from the 'X' / 'y' dump files when both
# exist; otherwise keep the X and y already in memory, since the X.dump /
# y.dump calls that would create the files are commented out.
# ndarray.dump writes a pickle, so loading it requires allow_pickle=True.
# SECURITY: unpickling can execute arbitrary code -- only load dumps you created.
# NOTE(review): an existing dump from an older run silently replaces the
# freshly built arrays; delete stale files after changing the vocabulary.
if os.path.exists('X') and os.path.exists('y'):
    X = np.load('X', allow_pickle=True)
    y = np.load('y', allow_pickle=True)

In [9]:
def difference(l1,l2):
    """Count and report the positions where two label sequences disagree.

    The sequences are compared pairwise (stopping at the shorter one).
    A "precision error" is a pair with l1 == 1 and l2 == 0; a "recall error"
    is a pair with l1 == 0 and l2 == 1. Callers in this notebook pass the
    predictions as l1 and the true labels as l2.

    Prints both counts and returns their sum.
    """
    precisionError = 0
    recallError = 0
    for predicted, actual in zip(l1, l2):
        if predicted == 1 and actual == 0:
            precisionError += 1
        elif predicted == 0 and actual == 1:
            recallError += 1

    print("Precision Error is " + str(precisionError))
    print("Recall Error is " + str(recallError))

    return precisionError + recallError

def runModels(Xtr, Ytr, n_splits=5, random_state=0):
    """Cross-validate BernoulliNB and GaussianNB and return their error rates.

    The data is ordered by class (all of one corpus, then all of the other),
    so plain unshuffled KFold can produce folds whose training or test part
    contains a single class. Stratified, shuffled folds keep both classes in
    every split; `random_state` makes the shuffle reproducible.

    Xtr: 2-D array-like of features, one row per document.
    Ytr: 1-D array-like of class labels aligned with Xtr.
    n_splits: number of cross-validation folds.
    random_state: seed for the fold shuffle.

    For every fold and model the predictions are printed and `difference`
    prints its error counts.

    Returns (differencesB, differencesG): per-fold misclassification rates
    for BernoulliNB and GaussianNB respectively.
    Raises ValueError (from StratifiedKFold) when a class has fewer than
    n_splits samples.
    """
    # Local import so this cell does not depend on editing the import cell.
    from sklearn.model_selection import StratifiedKFold

    # Arrays allow whole-fold fancy indexing instead of per-row Python lists.
    Xtr = np.asarray(Xtr)
    Ytr = np.asarray(Ytr)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    differencesB = []
    differencesG = []

    for train_index, test_index in kf.split(Xtr, Ytr):
        x_train, x_test = Xtr[train_index], Xtr[test_index]
        y_train, y_test = Ytr[train_index], Ytr[test_index]

        # Bernoulli first, then Gaussian, to keep the printed output order.
        for model, error_rates in ((BernoulliNB(), differencesB),
                                   (GaussianNB(), differencesG)):
            model.fit(x_train, y_train)
            # Predict once and reuse for both the printout and the error count.
            predictions = model.predict(x_test)
            print(predictions)
            error_rates.append(difference(predictions, y_test) / len(y_test))

    return differencesB, differencesG

In [10]:
# Cross-validate both Naive Bayes models on the combined data set and print the
# pair of per-fold error-rate lists (BernoulliNB, GaussianNB) that runModels returns.
print(runModels(X, y))

  self.class_log_prior_ = (np.log(self.class_count_) -


[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0


  self.class_log_prior_ = (np.log(self.class_count_) -


[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0


  self.class_log_prior_ = (np.log(self.class_count_) -


[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0


  self.class_log_prior_ = (np.log(self.class_count_) -


[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0


  self.class_log_prior_ = (np.log(self.class_count_) -


[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
[0. 0. 0. ... 0. 0. 0.]
Precision Error is 0
Recall Error is 0
([0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0])


# Validation

In [13]:
# custom k-fold cross validation code (commented out; sklearn's KFold is used below instead)

# def k_fold(X, Y1, Y2,folds)
#     split_size = X.shape[0]//folds
#     splits = []
#     for i in range(folds):
#         start = split_size*i
#         end = (split_size*(i+1))
#         validation = (X[start:end,], Y[start:end])
#         train_x = np.append(X[:start,], X[end:,])
#         train_y = np.append(Y[:start],Y[end:])
#         training = (np.atleast_2d(train_x).T, train_y)
#         splits.append((training, validation))
#     return splits

fold_error = []
print("seperating start")
X_s = shakespeare_data
X_r = reuters_data

# Label convention: 0 = Shakespeare, 1 = Reuters.
Y_s = np.zeros(X_s.shape[0])
Y_r = np.ones(X_r.shape[0])

print("seperated docs")

# Each corpus is split into 5 folds on its own, and fold i of both corpora is
# combined below, so every fold draws documents from both corpora.
folds = KFold(n_splits=5, random_state=None, shuffle=False)
splits = []
differencesB = []


def _split_corpus(kfold, features, labels):
    """Split one corpus with `kfold`.

    Returns four lists with one entry per fold: training features, training
    labels, validation features and validation labels.
    """
    train_x, train_y, valid_x, valid_y = [], [], [], []
    for train_index, test_index in kfold.split(features, labels):
        train_x.append(features[train_index])
        train_y.append(labels[train_index])
        valid_x.append(features[test_index])
        valid_y.append(labels[test_index])
    return train_x, train_y, valid_x, valid_y


print("splitting shakespere")
training_sx, training_sy, validation_sx, validation_sy = _split_corpus(folds, X_s, Y_s)
print("split shakespere done")

training_rx, training_ry, validation_rx, validation_ry = _split_corpus(folds, X_r, Y_r)
# This message follows the Reuters split, so it names Reuters.
print("split reuters done")

print("training")

print(training_rx[0])

for i in range(len(training_sx)):
    training_global_x = np.concatenate((training_sx[i], training_rx[i]), axis=0)
    training_global_y = np.concatenate((training_sy[i], training_ry[i]), axis=0)
    validation_global_x = np.concatenate((validation_sx[i], validation_rx[i]), axis=0)
    validation_global_y = np.concatenate((validation_sy[i], validation_ry[i]), axis=0)

    clf = BernoulliNB()
    clf.fit(training_global_x, training_global_y)

    pred = clf.predict(validation_global_x)
    print(clf.score(validation_global_x, validation_global_y))

    if (i == 3):
        # Debug aid: indices of the misclassified validation rows of the fourth fold.
        print(np.where(pred != validation_global_y))
    differencesB.append(difference(pred, validation_global_y) / len(validation_global_y))


# Per-fold misclassification rates of BernoulliNB (displayed as the cell output).
differencesB
# fold_error.append(cross_validation_error(5, splits))



# Prediction

### Preparing test documents to classify

In [None]:
## creating test documents to run classification on
all_files = reuters.fileids()

# Lowercased token lists: the first Reuters document and the first
# Shakespeare document.
test_vectorR = [w.lower() for w in reuters.words(all_files[0])]
test_vectorS = [w.lower() for w in SHAKESPEARE_WORDS[0]]

# "Merged" document: tokens of Shakespeare documents 1 and 0 pooled together,
# then sampled with replacement down to the length of document 1.
test_mergedS = [w.lower() for w in SHAKESPEARE_WORDS[1]]
test_mergedS.extend(test_vectorS)
# A seeded generator makes the sampled document identical on every run.
test_mergedS = np.random.RandomState(0).choice(test_mergedS, len(SHAKESPEARE_WORDS[1]))

# The training matrices were built from cleaned tokens (punctuation stripped
# for Reuters, non-letter characters stripped for Shakespeare). The test
# tokens get the same cleaning so that e.g. "love," maps to the column "love".
_vocab_index = {str(word): col for col, word in enumerate(vocab)}
_reuters_punc_filter = str.maketrans('', '', punctuation)


def _presence_vector(tokens, clean):
    """Return a 0/1 vector over `vocab` marking which cleaned tokens occur."""
    vector = np.zeros(vocab.shape)
    for token in tokens:
        col = _vocab_index.get(clean(str(token)))
        if col is not None:
            vector[col] = 1
    return vector


# vectorizing array of tokens
sparse_R = _presence_vector(test_vectorR, lambda w: w.translate(_reuters_punc_filter))
sparse_S = _presence_vector(test_vectorS, lambda w: re.sub(sub, '', w))
sparse_M = _presence_vector(test_mergedS, lambda w: re.sub(sub, '', w))

# sanity check
print(sparse_M.shape)
print(sparse_M)

### Classifying the prepared test documents

In [None]:
# Train on every document of both corpora (0 = Shakespeare, 1 = Reuters) and
# classify the merged test document prepared above.
X_s, X_r = shakespeare_data, reuters_data
Y_s, Y_r = np.zeros(X_s.shape[0]), np.ones(X_r.shape[0])

X = np.concatenate((X_s, X_r), axis=0)
Y = np.concatenate((Y_s, Y_r), axis=0)

clf = BernoulliNB().fit(X, Y)

# predict expects a 2-D input, hence the single-row list.
pred = clf.predict([sparse_M])
print(pred)

# Rare Words

# Visualization

## Error Rate vs Threshold [work in progress]

In [None]:
# Top-N vocabulary sizes per corpus to sweep in the error-rate-vs-threshold experiment.
threshold_values = [100, 1000, 2000, 3000, 5000]

def vectorize(test_vector,vocab):
    """Build a 0/1 presence vector over `vocab` for one tokenized document.

    test_vector: iterable of tokens (the tokenized document).
    vocab: 1-D NumPy array of vocabulary words (the merged vocabulary).

    Returns a float array of shape `vocab.shape` holding 1 at every position
    whose vocabulary word equals some token, and 0 elsewhere.
    """
    presence = np.zeros(vocab.shape)
    for token in test_vector:
        # Positions of this token in the vocabulary (empty when it is absent).
        (hits,) = np.where(vocab == token)
        presence[hits] = 1
    return presence
        
        
for threshold in threshold_values:
    # The `threshold` most frequent words of each corpus.
    top_s= shakespeare_vocab_np[:threshold]
    top_r = reuters_vocab_np[:threshold]

    # Per-threshold vocabulary: words of either top list that are NOT shared
    # by both. It is kept under its own name so the global merged `vocab`
    # (whose size the fitted classifiers depend on) is not overwritten.
    intersect = np.intersect1d(top_s, top_r)
    shared_words = set(intersect)
    threshold_vocab = np.concatenate((top_s, top_r), axis=0)
    threshold_vocab = np.array([v for v in threshold_vocab if v not in shared_words])

    for row in shakespeare_data:
        # TODO: work in progress -- the per-document step was never written.
        # `pass` keeps the cell syntactically valid until it is.
        pass
        

In [20]:
from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer



ModuleNotFoundError: No module named 'yellowbrick'