In [1]:
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
        
import warnings
warnings.filterwarnings("ignore")

RUN_KNN = False
RUN_NB = True

# all_topics = ['Coffee', 'Arduino', 'Anime']
all_topics = ['Anime', 'Arduino', 'Astronomy', 'Biology', 'Chess', 'Coffee', 'Cooking', 'Law', 
              'Space', 'Windows_Phone', 'Wood_Working']


In [2]:
text = "Bangladesh, officially the People's Republic of Bangladesh, is a country in South Asia. It is the eighth-most populous country in the world, with a population exceeding 162 million people. It is not other countries. It is a better place to live in."
def preprocess_text(text, all_type=True, debug=True):
    if debug: print("===Raw Text:===\n", text)

    #1.Lowercase the text
    text = text.lower()
    if debug: print("\n===After Lowercase:===\n", text)

    if all_type:
        #Number Removal
        text = re.sub(r'[-+]?\d+', '', text)
        if debug: print("\n===After Removing Numbers:===\n", text)

    #2.Remove punctuations
    text=text.translate((str.maketrans('','',string.punctuation)))
    if debug: print("\n===After Removing Punctuations:===\n", text)

    #4.Tokenize
    text = word_tokenize(text)
    if debug: print("\n===After Tokenizing:===\n", text)

    #3.Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if not word in stop_words]
    if debug: print("\n===After Stopword Removal:===\n", text)

    #6.Lemmatize tokens
    lemmatizer=WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    if debug: print("\n===After Lemmatization:===\n", text)

    #5.Stemming tokens
    stemmer= PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    if debug: print("\n===After Stemming:===\n", text)
    
    return text

text = preprocess_text(text, all_type=True, debug=False)
print(text)

['bangladesh', 'offici', 'peopl', 'republ', 'bangladesh', 'countri', 'south', 'asia', 'eighthmost', 'popul', 'countri', 'world', 'popul', 'exceed', 'million', 'peopl', 'countri', 'better', 'place', 'live']


In [3]:
#Following operation is required if you run this cell for the first time
#!pip3 install bs4
from bs4 import BeautifulSoup as bs
with open('Sample.xml','r',encoding='utf-8') as file:
    content = file.read()
    soup = bs(content)
    for items in soup.findAll("item"):
        print(items)
        
# https://stackoverflow.com/questions/3351485/how-to-remove-all-html-tags-from-downloaded-page
def func_remove_html_tags(document):
#     for __tag__ in ['p', 'em', 'i', 'b']:
#         document = document.replace('<'+__tag__+'>', '').replace('</'+__tag__+'>', '')
#     for __tag__ in ['<p>', '</p>', '<em>', '</em>', '<i>',  '</i>',  '<b>',  '</b>']:
#         document = document.replace(__tag__, '')
#     print('\n', i, ':', document)
    p = re.compile(r'<.*?>')
    document = p.sub('', document)
    return document

<item name="item1">This is the first line of text.</item>
<item name="item2">This is the second line.</item>


In [4]:
from bs4 import BeautifulSoup as bs
# topic_name = 'Anime'
# remove_html_tags = True
# train_cnt, val_cnt, test_cnt = 500, 200, 500

def preprocess_from_xml(topic_name, remove_html_tags=True, train_cnt=500, val_cnt=200, test_cnt=500, debug=False):
    with open('Data/Training/{}.xml'.format(topic_name),'r',encoding='utf-8') as file:
        content = file.read()
        soup = bs(content)
        rows = soup.findAll("row")
        train_, val_, text_ = [], [], []
        all_words = []
        tracker_idx = 0
        for i, items in enumerate(rows):
            document = items['body'].replace('\n', ' ')
            if debug: print('\n{}: before>'.format(i), document)
            if remove_html_tags:
                document = func_remove_html_tags(document=document)
            if debug: print('after>', document)
            document = preprocess_text(text=document, all_type=True, debug=False)
            if document == []:
                continue
            if tracker_idx == train_cnt:
                break
            train_ += [[document, topic_name]]
            all_words += document
            tracker_idx += 1
        
        cnt_temp = tracker_idx
        for i, items in enumerate(rows[cnt_temp:]):
            document = items['body'].replace('\n', ' ')
            if remove_html_tags:
                document = func_remove_html_tags(document=document)
            document = preprocess_text(text=document, all_type=True, debug=False)
            if document == []:
                continue
            if tracker_idx == cnt_temp+val_cnt:
                break
            val_ += [[document, topic_name]]
            tracker_idx += 1
            
        cnt_temp = tracker_idx
        for i, items in enumerate(rows[cnt_temp:]):
            document = items['body'].replace('\n', ' ')
            if remove_html_tags:
                document = func_remove_html_tags(document=document)
            document = preprocess_text(text=document, all_type=True, debug=False)
            if document == []:
                continue
            if tracker_idx == cnt_temp+test_cnt:
                break
            text_ += [[document, topic_name]]
            tracker_idx += 1
        
        print('train_, val_, text_, all_words', train_.__len__(), val_.__len__(), text_.__len__(), all_words.__len__())
        return train_, val_, text_, all_words

train_docs, val_docs, test_docs = [], [], []
all_words_in__all_topics = []
topic_wise_word_count = {}
for topic_name in all_topics:
    train_, val_, text_, all_words_ = preprocess_from_xml(topic_name=topic_name, 
                                                         remove_html_tags=True, 
                                                         train_cnt=500, 
                                                         val_cnt=200, 
                                                         test_cnt=500, 
                                                         debug=False)
    train_docs += train_
    val_docs += val_
    test_docs += text_
    all_words_in__all_topics += all_words_
    topic_wise_word_count[topic_name] = len(all_words_)


train_, val_, text_, all_words 500 200 500 28399
train_, val_, text_, all_words 500 200 500 38320
train_, val_, text_, all_words 500 200 500 33379
train_, val_, text_, all_words 500 200 500 33928
train_, val_, text_, all_words 500 200 500 26268
train_, val_, text_, all_words 500 200 500 36516
train_, val_, text_, all_words 500 200 500 17819
train_, val_, text_, all_words 500 200 500 40604
train_, val_, text_, all_words 500 200 500 34790
train_, val_, text_, all_words 500 200 500 19907
train_, val_, text_, all_words 500 200 500 31722


In [5]:
print(len(train_docs), len(val_docs), len(test_docs), len(all_words_in__all_topics))
def create_dictionary_from_training_data(words):
    vocab = {}
    for w in words:
        if w in vocab:
            vocab[w] += 1
        else:
            vocab[w] = 1
    return vocab
vocab = create_dictionary_from_training_data(words=all_words_in__all_topics)
len(vocab), topic_wise_word_count

5500 2200 5500 341652


(20703,
 {'Anime': 28399,
  'Arduino': 38320,
  'Astronomy': 33379,
  'Biology': 33928,
  'Chess': 26268,
  'Coffee': 36516,
  'Cooking': 17819,
  'Law': 40604,
  'Space': 34790,
  'Windows_Phone': 19907,
  'Wood_Working': 31722})

In [6]:
# print(train_docs[1])
# print(all_words_in__all_topics[:10])
word_position_in_vocab = {}
i = 0
for w in vocab:
    word_position_in_vocab[w] = i
    i += 1
    
VOCAB_SIZE = len(word_position_in_vocab)
# VOCAB_SIZE
# vocab
# word_position_in_vocab

total_train_cnt = len(train_docs)
def get_IDF(train_docs, word_position_in_vocab, VOCAB_SIZE, alpha=0, beta=1e-5, beta2=1e-5):
    D = len(train_docs)
#     print('>', word_position_in_vocab)
    C_w = np.zeros(VOCAB_SIZE)
    for w in word_position_in_vocab:
        for document in train_docs:
            if w in document[0]:
                C_w[word_position_in_vocab[w]] += 1.
#                 print(w, document)
        C_w[word_position_in_vocab[w]] += 1
#         if C_w[word_position_in_vocab[w]] == D:
#             C_w[word_position_in_vocab[w]] += beta
#     print('C_w', C_w)
    IDF = np.log10((D + alpha) / (C_w))
    IDF = np.where(IDF<=0, beta2, IDF)
#     print(D)
    return IDF

In [7]:
dummy_doc = 'got got local coffe got'.split(' ')
def create_hamming_vect(word_position_in_vocab, VOCAB_SIZE, document):
    vect = np.zeros([VOCAB_SIZE])
    for w in document:
        if w in word_position_in_vocab:
            vect[word_position_in_vocab[w]] = 1.
    return vect

# vect = create_hamming_vect(word_position_in_vocab=word_position_in_vocab, VOCAB_SIZE=VOCAB_SIZE, document=dummy_doc)  
# print(sum(vect), len(dummy_doc), len(vect))

def create_euclidean_vect(word_position_in_vocab, VOCAB_SIZE, document):
    vect = np.zeros([VOCAB_SIZE])
    for w in document:
        if w in word_position_in_vocab:
            vect[word_position_in_vocab[w]] += 1.
    return vect

# vect = create_euclidean_vect(word_position_in_vocab=word_position_in_vocab, VOCAB_SIZE=VOCAB_SIZE, document=dummy_doc)  
# print(sum(vect), len(dummy_doc), len(vect))

def create_tfIdf_vect(word_position_in_vocab, VOCAB_SIZE, document, IDF):
    TF = np.zeros([VOCAB_SIZE])
    total_ = 0.
    if document == []:
        print('No words in doc. Skipping...')
        return TF
    for w in document:
        if w in word_position_in_vocab:
#             print(w)
            total_ += 1. # jokhon unknown word totally consider korbo na. total word count er moddheo na
            TF[word_position_in_vocab[w]] += 1.
#     print('TF.sum():', TF.sum())
    TF /= total_ # TF(d, w) = N(d, w) / W(d)
#     print(IDF.shape)
#     print('TF:', TF)
    TF_IDF = TF * IDF
#     print(TF_IDF.shape)
    return TF_IDF

# def create_tfIdf_vect2(word_position_in_vocab, VOCAB_SIZE, document, IDF):
#     TF = {}
#     total_ = 0.
#     for w in document:
#         if w not in word_position_in_vocab: # During TF calculation, You can simply omit the word when the word is new in test/validation set 
#             continue
#         total_ += 1.
#         if w in TF:
#             TF[w] += 1.
#         else:
#             TF[w] = 1.
#     for w in TF:
#         TF[w] /= total_ # TF(d, w) = N(d, w) / W(d)
#         TF[w] *= IDF[w]
#     return TF # tf_idf

# vect = create_tfIdf_vect(word_position_in_vocab=word_position_in_vocab, VOCAB_SIZE=VOCAB_SIZE, document=dummy_doc, IDF=IDF)  
# print(sum(vect), len(dummy_doc), len(vect))


# def create_hamming_vect(vocab, document):
#     vect = []
#     for w in vocab:
#         vect += [int(w in document)]
#     return vect

# vect = create_hamming_vect(vocab=vocab, document=train_docs[0][0])  
# sum(vect), len(train_docs[0][0]), len(vect)

In [8]:
# # np.log10(4/3)
# # np.save('IDF.npy', IDF)
# # debug..
# dummy_docs = [['play cricket play football'.split(), 'sports'],
#               ['play music'.split(), 'music'],
#               ['like singing'.split(), 'music'],
#               ['cricket very small insect'.split(), 'biology'],
#               ['want play cricket'.split(), None]
#              ]
# dummy_vocab_pos = {'play':0,'cricket':1,'football':2,'music':3,'singing':4,'very':5,'small':6,'insect':7,'like':8}
# IDF = get_IDF(train_docs=dummy_docs[:-1], 
#               word_position_in_vocab=dummy_vocab_pos,
#               VOCAB_SIZE=len(dummy_vocab_pos), 
#               alpha=0, 
#               beta=1e-5)
# # print(IDF.max(), IDF.min(), IDF.mean())
# print(IDF)

# vect = create_tfIdf_vect(word_position_in_vocab=dummy_vocab_pos, 
#                          VOCAB_SIZE=len(dummy_vocab_pos), 
#                          document=dummy_docs[-1][0], 
#                          IDF=IDF)  
# print(vect*2/3)


In [9]:
if RUN_KNN:
    IDF = get_IDF(train_docs=train_docs, 
                  word_position_in_vocab=word_position_in_vocab,
                  VOCAB_SIZE=VOCAB_SIZE, 
                  alpha=0, 
                  beta=1e-5)
    IDF.min(), IDF.max(), IDF.mean()

In [10]:
def euclidean_distance2(instance1, instance2): # provided code
    distance = 0.0
    for i in range(len(instance1)):
        distance += (instance1[i] - instance2[i])**2
    return np.sqrt(distance)

def euclidean_distance(instance1, instance2):
    distance = instance1 - instance2
    return np.sqrt(np.dot(distance, distance))

# a, b = np.array([1,2,3,6]),  np.array([5,6,7,8])
# print(euclidean_distance2(a,b), euclidean_distance(a,b))

def hamming_distance(bin_vect1, bin_vect2): ##??
    bin_vect1, bin_vect2 = bin_vect1.astype(int), bin_vect2.astype(int)
    return np.sum(np.array(bin_vect1)^np.array(bin_vect2))

# a, b = np.array([0,0,0,1,1]), np.array([1,1,1,1,1])
# print(hamming_distance(a,b))

def cosine_similarity(instance1, instance2):
    return np.dot(instance1, instance2) / (np.linalg.norm(instance1) * np.linalg.norm(instance2))


a, b = np.array([1,1,0,0]),  np.array([1,0,0,0])
print(cosine_similarity(a,b))

0.7071067811865475


In [11]:
# train_docs[0]

In [12]:
#Make prediction of the test points using training points

class KNN_Text_Classifier:
    def __init__(self, train_docs):
        self.train_hamming_vect = []
        self.train_euclid_vect = []
        self.train_cosine_vect = []
        self.train_labels = []
        self.train_docs = train_docs
        
        for i, doc in enumerate(train_docs):
            self.train_labels += [doc[1]]
            if doc[0] == []:
                print(i)
                continue
            self.train_hamming_vect += [create_hamming_vect(word_position_in_vocab=word_position_in_vocab, 
                                                       VOCAB_SIZE=VOCAB_SIZE, document=doc[0])]
            self.train_euclid_vect += [create_euclidean_vect(word_position_in_vocab=word_position_in_vocab, 
                                                        VOCAB_SIZE=VOCAB_SIZE, document=doc[0])]
            self.train_cosine_vect += [create_tfIdf_vect(word_position_in_vocab=word_position_in_vocab, 
                                                    VOCAB_SIZE=VOCAB_SIZE, document=doc[0], IDF=IDF)]
        self.unique_labels = list(set(self.train_labels))
        self.unique_labels_cnt = len(self.unique_labels)
        self.label_positions_in_unique_labels = {}
        for i, label in enumerate(self.unique_labels):
            self.label_positions_in_unique_labels[label] = i
            
#         self.voteCount = np.zeros([self.unique_labels_cnt])

    def knn_prediction_single_sample(self, doc_test, dist_type, n_neighbors): #based on given code
        if dist_type == 0:
            dist_func = hamming_distance
            X_train = self.train_hamming_vect
            testInput = create_hamming_vect(word_position_in_vocab=word_position_in_vocab, 
                                          VOCAB_SIZE=VOCAB_SIZE, document=doc_test)
        if dist_type == 1:
            dist_func = euclidean_distance
            X_train = self.train_euclid_vect
            testInput = create_euclidean_vect(word_position_in_vocab=word_position_in_vocab, 
                                          VOCAB_SIZE=VOCAB_SIZE, document=doc_test)
        if dist_type == 2:
            dist_func = cosine_similarity
            X_train = self.train_cosine_vect
            testInput = create_tfIdf_vect(word_position_in_vocab=word_position_in_vocab, 
                                        VOCAB_SIZE=VOCAB_SIZE, document=doc_test, IDF=IDF)

        #calculate for earch test data points
        allDistances = []
        for i, (trainInput, trainActualOutput) in enumerate(zip(X_train, self.train_labels)):
            distance = dist_func(testInput, trainInput)
            allDistances.append((i, trainActualOutput, distance))
        #Sort (in ascending for 0 and 1, descending for 2) the training data points based on distances from the test point
        allDistances.sort(key=lambda x: x[2], reverse = dist_type==2)
#         print(len(allDistances))
#         if dist_type==2:
#             print(allDistances[:10], allDistances[-10:])

        #Assuming output labels are from 0 to self.unique_labels_cnt-1
        voteCount = np.zeros(self.unique_labels_cnt)
        neighbor_indices = []
        for n in range(n_neighbors):
            neighbor_indices.append(allDistances[n][0])
            voteCount[self.label_positions_in_unique_labels[allDistances[n][1]]] += 1

        #Determine the Majority Voting (Equal weight considered)
#         print('voteCount:', voteCount)
        predictedOutput = np.argmax(voteCount)
        predictedOutput = self.unique_labels[predictedOutput]

        return predictedOutput, neighbor_indices


    def performanceEvaluation(self, test_docs, dist_type, n_neighbors):
        totalCount = 0
        correctCount = 0

        for i, doc_test_and_label in enumerate(test_docs):
            if doc_test_and_label[0] == []:
                print(i)
                continue
            predictedOutput, _ = self.knn_prediction_single_sample(doc_test_and_label[0], dist_type, n_neighbors)

            if predictedOutput == doc_test_and_label[1]:
                correctCount += 1
            totalCount += 1
#             print(predictedOutput, doc_test_and_label[1])

        print("Total Correct Count: ",correctCount," Total Wrong Count: ",totalCount-correctCount," Accuracy: ",(correctCount*100)/(totalCount))


In [13]:
if RUN_KNN:
    knn = KNN_Text_Classifier(train_docs=train_docs)
    knn

In [14]:
if RUN_KNN:
    print('VAL::')
    print('hamming')
    knn.performanceEvaluation(val_docs[:], dist_type=0, n_neighbors=1)
    knn.performanceEvaluation(val_docs[:], dist_type=0, n_neighbors=3)
    knn.performanceEvaluation(val_docs[:], dist_type=0, n_neighbors=5)
    print('\neuclidean')
    knn.performanceEvaluation(val_docs[:], dist_type=1, n_neighbors=1)
    knn.performanceEvaluation(val_docs[:], dist_type=1, n_neighbors=3)
    knn.performanceEvaluation(val_docs[:], dist_type=1, n_neighbors=5)
    print('\ncosine')
    knn.performanceEvaluation(val_docs[:], dist_type=2, n_neighbors=1)
    knn.performanceEvaluation(val_docs[:], dist_type=2, n_neighbors=3)
    knn.performanceEvaluation(val_docs[:], dist_type=2, n_neighbors=5)

In [15]:
if RUN_KNN:
    print('TEST::')
    print('hamming')
    knn.performanceEvaluation(test_docs[:], dist_type=0, n_neighbors=1)
    knn.performanceEvaluation(test_docs[:], dist_type=0, n_neighbors=3)
    knn.performanceEvaluation(test_docs[:], dist_type=0, n_neighbors=5)
    print('\neuclidean')
    knn.performanceEvaluation(test_docs[:], dist_type=1, n_neighbors=1)
    knn.performanceEvaluation(test_docs[:], dist_type=1, n_neighbors=3)
    knn.performanceEvaluation(test_docs[:], dist_type=1, n_neighbors=5)
    print('\ncosine')
    knn.performanceEvaluation(test_docs[:], dist_type=2, n_neighbors=1)
    knn.performanceEvaluation(test_docs[:], dist_type=2, n_neighbors=3)
    knn.performanceEvaluation(test_docs[:], dist_type=2, n_neighbors=5)

In [16]:
from copy import deepcopy
class NB_Text_Classifier:
    def __init__(self, train_docs, topic_wise_word_count, smoothing_factor, consider_oov):
        self.total_samples = len(train_docs)
        self.consider_oov = consider_oov
        self.smoothing_factor = smoothing_factor
        self.probab_topic = {}
        self.probab_word_given_topic = {}
        self.topic_wise_word_count = topic_wise_word_count
#         self.min_probab = smoothing_factor / smoothing_factor*VOCAB_SIZE # for cases when 𝑁_(𝑤_𝑖,𝐶_𝑘 )=0  ⇒𝑝(𝑤_𝑖│𝐶_𝑘 )=0 ⇒𝑝(𝐶_𝑘 |𝐷_𝑡 )=0 
        dummy_topic_map_for_words = {}
        for topic in topic_wise_word_count:
            dummy_topic_map_for_words[topic] = smoothing_factor
        
        for i, doc in enumerate(train_docs):
            if doc[0] == []:
                print(i)
                continue
            if doc[1] in self.probab_topic:
                self.probab_topic[doc[1]] += 1.
            else:
                self.probab_topic[doc[1]] = 1.
            
            for w in doc[0]:
                if w not in self.probab_word_given_topic:
                    self.probab_word_given_topic[w] = deepcopy(dummy_topic_map_for_words)
                self.probab_word_given_topic[w][doc[1]] += 1.
        
        for w in self.probab_word_given_topic:
            for topic in topic_wise_word_count:
                self.probab_word_given_topic[w][topic] /= (topic_wise_word_count[topic] + smoothing_factor * VOCAB_SIZE)
        
        for topic in topic_wise_word_count:
            self.probab_topic[topic] /= self.total_samples
            
    def nb_prediction_single_sample(self, doc_test): #based on given code
        prediction = [0.0, None] # probab, predicted_topic
        for topic in self.probab_topic:
            temp_probab = self.predict_probability(topic=topic, doc=doc_test)
            if temp_probab >= prediction[0]:
                prediction = [temp_probab, topic]
        return prediction
            
    def predict_probability(self, topic, doc): # P(C|x1,x2,...) = P(C) * P(x1|C) * P(x2|C) * ... / CONSTANT
        probab = self.probab_topic[topic] # P(C)
        for w in doc:
            if w not in vocab:
                if self.consider_oov:
                    probab *= self.smoothing_factor / (self.topic_wise_word_count[topic] + self.smoothing_factor * VOCAB_SIZE)
                continue
            probab *= self.probab_word_given_topic[w][topic]
        return probab
    
    def performanceEvaluation(self, test_docs):
        totalCount = 0
        correctCount = 0

        for i, doc_test_and_label in enumerate(test_docs):
            if doc_test_and_label[0] == []:
                print(i)
                continue
            predictedOutput = self.nb_prediction_single_sample(doc_test_and_label[0])

            if predictedOutput[1] == doc_test_and_label[1]:
                correctCount += 1
#             else:
#                 print(predictedOutput[1], doc_test_and_label[1])
            totalCount += 1
#             print(predictedOutput, doc_test_and_label[1])

        print("Total Correct Count: ",correctCount," Total Wrong Count: ",totalCount-correctCount," Accuracy: ",(correctCount*100)/(totalCount))    
        return (correctCount*100)/(totalCount)

In [17]:
if RUN_NB:
    nb = NB_Text_Classifier(train_docs=train_docs, 
                            topic_wise_word_count=topic_wise_word_count, 
                            smoothing_factor=1., 
                            consider_oov=False) #α: smoothing factor. If α = 1, then we call it Laplace Smoothing Factor. If α <1, then we call it Lidstone Smoothing Factor.

In [18]:
if RUN_NB:
    # nb.probab_topic, nb.probab_word_given_topic
    # nb.predict_probability(list(topic_wise_word_count)[0], test_docs[0][0])
    GRANULARITY = 10
    best_smoothing_factor = [-1, -1]
    for x in range(1, GRANULARITY+1):
        smoothing_factor = x/GRANULARITY
        print('smoothing_factor:', smoothing_factor)
        nb = NB_Text_Classifier(train_docs=train_docs, 
                                topic_wise_word_count=topic_wise_word_count, 
                                smoothing_factor=smoothing_factor,
                                consider_oov=False) # where to consider out of vocab words during computation
        val_acc = nb.performanceEvaluation(test_docs=val_docs)
        test_acc = nb.performanceEvaluation(test_docs=test_docs)
        if val_acc > best_smoothing_factor[1]:
            best_smoothing_factor = [smoothing_factor, val_acc]
    print(best_smoothing_factor)

smoothing_factor: 0.1
Total Correct Count:  1656  Total Wrong Count:  544  Accuracy:  75.27272727272727
Total Correct Count:  4190  Total Wrong Count:  1310  Accuracy:  76.18181818181819
smoothing_factor: 0.2
Total Correct Count:  1651  Total Wrong Count:  549  Accuracy:  75.04545454545455
Total Correct Count:  4178  Total Wrong Count:  1322  Accuracy:  75.96363636363637
smoothing_factor: 0.3
Total Correct Count:  1650  Total Wrong Count:  550  Accuracy:  75.0
Total Correct Count:  4171  Total Wrong Count:  1329  Accuracy:  75.83636363636364
smoothing_factor: 0.4
Total Correct Count:  1644  Total Wrong Count:  556  Accuracy:  74.72727272727273
Total Correct Count:  4168  Total Wrong Count:  1332  Accuracy:  75.78181818181818
smoothing_factor: 0.5
Total Correct Count:  1641  Total Wrong Count:  559  Accuracy:  74.5909090909091
Total Correct Count:  4156  Total Wrong Count:  1344  Accuracy:  75.56363636363636
smoothing_factor: 0.6
Total Correct Count:  1632  Total Wrong Count:  568  Accu

In [19]:
# 1 / np.array([2,2,2])
# word_position_in_vocab
# [1, 2, 3, 4, 1, 4, 1].count(4)

In [20]:
# cnt=0
# for document in train_docs:
#     if 'got' in document[0]:
#         cnt+=1
# cnt, C_w[0], D, D/C_w[0], IDF[0]

In [21]:
# np.sum(np.array([1.,1.,0.,1.]).astype(int)^np.array([0.,0.,0.,1.]).astype(int))

In [22]:
# a = 1
# b = a==1
# b