In [0]:
import os 
import re
import string
import math
import pandas as pd
import numpy as np


target_names = ['conan','jane']


In [0]:
#here we just read the text files into a dataframe

jane = pd.read_fwf('jane.txt')
conan = pd.read_fwf('conan.txt')

In [4]:
conan.shape

(10385, 1)

In [0]:
#here we create labels for the authors, 0 -> Arthur Coanan, 1-> Jane Austen
zero =  pd.DataFrame(np.zeros(conan.shape[0]))
ones = pd.DataFrame(np.ones(jane.shape[0]))

In [0]:
#here we just add the labels to the data

In [0]:
conan=pd.concat([conan,zero],axis=1)


In [0]:
jane = pd.concat([jane,ones],axis=1)

In [0]:
jane.columns=['text','label']
conan.columns = ['text','label']


In [0]:
#now we concatenate botht the files for easier parsing

In [0]:
merge = pd.concat([jane,conan])

In [0]:
from sklearn.utils import shuffle
merge = shuffle(merge)

In [11]:
merge

Unnamed: 0,text,label
15184,"stamp, for he had easy manners, excellent spir...",1.0
10305,A reverie succeeded this conviction--and when ...,1.0
9214,"""There are four umbrellas up already. How I ha...",1.0
39830,must return it by him.,1.0
33897,if you saw it. I dare not let my mother know h...,1.0
66896,hear from him again. I earnestly pressed his c...,1.0
7029,most fleeting glance of the front of the house...,0.0
44634,for the uncertain and unequal Amusements of th...,1.0
43787,"were concerning Philippa and her Husband, the ...",1.0
50657,de Bourgh she derived this comfort for Miss Bi...,1.0


In [36]:
merge.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.0,10385,10308,"""Yes.""",12
1.0,67876,67289,Churchhill.,17


In [0]:
text = merge['text']
target = merge['label']

In [0]:
class naivebayes(object):
    
    def fit(self, X, Y):

      self.num_messages = {}
      self.log_class_priors = {}
      self.word_counts = {}
      self.vocab = set()
 
      n = len(X)
      self.num_messages['jane'] = sum(1 for label in Y if label == 1)
      self.num_messages['conan'] = sum(1 for label in Y if label == 0)
      #here the log priors are calculated 
      self.log_class_priors['jane'] = math.log(self.num_messages['jane'] / n)
      self.log_class_priors['conan'] = math.log(self.num_messages['conan'] / n)
      self.word_counts['jane'] = {}
      self.word_counts['conan'] = {}

      for x, y in zip(X, Y):
          c = 'jane' if y == 1 else 'conan'
          counts = self.get_word_counts(self.tokenize(x))
          for word, count in counts.items():
              if word not in self.vocab:
                  self.vocab.add(word)
              if word not in self.word_counts[c]:
                  self.word_counts[c][word] = 0.0

              self.word_counts[c][word] += count
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)
 
    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text) #we tokenize the lines in the text
 
    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts #we get the word counts of all the words present in the lines
    
    def predict(self, X):
      result = []
      for x in X:
          counts = self.get_word_counts(self.tokenize(x))
          j_score = 0
          c_score = 0
          for word, _ in counts.items():
              if word not in self.vocab: continue

              # here we add laplace smoothing which will prevent divison by 0 and also add
              # log to help with the computation of the values
              log_w_given_j = math.log( (self.word_counts['jane'].get(word, 0.0) + 1) / (self.num_messages['jane'] + len(self.vocab)) )
              log_w_given_c = math.log( (self.word_counts['conan'].get(word, 0.0) + 1) / (self.num_messages['conan'] + len(self.vocab)) )

              j_score += log_w_given_j
              c_score += log_w_given_c

          j_score += self.log_class_priors['jane']
          c_score += self.log_class_priors['conan']

          if j_score > c_score: #here we check if proba of jane is > prob of conan we append the results  appropriately
              result.append(1)
          else:
              result.append(0)
      return result


In [0]:
Mnn= naivebayes()


In [0]:
naive_bayes = naivebayes()
naive_bayes.fit(text[100:], target[100:]) #we call the fit function on all the lines except the first 
pred_Values = naive_bayes.predict(text[:1000]) #we predict for the the first 1000 lines
true_values = target[:1000]
    

In [64]:
from collections import Counter
Counter(pred_Values)

Counter({0: 12, 1: 988})

In [66]:
Counter(true_values)

Counter({0.0: 138, 1.0: 862})

In [67]:
np.shape(pred_Values)

(1000,)

In [68]:
np.shape(true_values)

(1000,)

In [0]:
from sklearn.metrics import accuracy_score

In [70]:
print("Accuracy is:",accuracy_score(true_values,pred_Values)*100,'%')

Accuracy is: 87.4 %
