In [1]:
from sklearn.datasets import fetch_20newsgroups
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB

In [2]:
newsgroups=fetch_20newsgroups() 

stops=set(stopwords.words('english'))

punctuations=list(string.punctuation)

stops.update(punctuations)

newsgroups.keys()


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
more_stops=np.loadtxt("datasets/stopwords.txt", dtype=str, delimiter=" ")
stops.update(more_stops)
len(stops)

517

In [5]:
len(newsgroups.data)

11314

In [6]:
all_documents=newsgroups.data

all_categories=newsgroups.target

all_documents_modified=[word_tokenize(doc) for doc in all_documents]

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
x_train, x_test, y_train, y_test=train_test_split(all_documents_modified, all_categories, random_state=1)

In [9]:
all_words=[]

for doc in x_train:
    for word in doc:
        if (word.lower() not in stops) and len(word)!=1 and len(word)!=2 and word[0]!="'" and word!="n't" and word[0]!=".":
            all_words.append(word)

In [10]:
len(all_words)

1179675

In [11]:
def freq_dict(all_words):
    dic=dict()
    for word in all_words:
        if word in dic.keys():
            dic[word]+=1
        else:
            dic[word]=1
    return dic

dic=freq_dict(all_words)

In [12]:
import numpy as np
freq=np.array([i for i in dic.values()])
words=np.array([i for i in dic.keys()])

In [13]:
words=words[np.argsort(freq)][::-1]
freq=np.sort(freq)[::-1]

In [14]:
features=words[20:4000]
#features variable contains all the top words which are most frequently used in all our documents. 
freq[20], freq[3999]

(1242, 47)

In [15]:
def data_modifier(x_data, features):
    modified_data=np.zeros((len(x_data), len(features)))
    
    for i in range(len(x_data)):
        
        current_doc=x_data[i]
        
        for word in current_doc:
           
            if word in features:
               
                for j in range(len(features)):
                    
                    if features[j]==word:
                        modified_data[i][j]+=1
    
    return modified_data 

In [16]:
x_train_modified = data_modifier(x_train, features)

In [17]:
x_test_modified= data_modifier(x_test, features)

## Trying out the inbuilt Multinomial Naive Bayes classifier

In [18]:
clf=MultinomialNB()
clf.fit(x_train_modified, y_train)
clf.score(x_test_modified, y_test)


0.8193708024036762

## Writing our own Naive Bayes Classifier

In [19]:
def fit(x_train, y_train):
    count=dict()
    for i in range(20):
        needed_docs=x_train[y_train==i]
    
        count[i]=dict()
        
        count[i]['total']=0
        
        for j in range(len(features)):
            count[i][features[j]]=needed_docs[:, j].sum()
            
            count[i]['total']+=count[i][features[j]]
    return count


def probability(dictionary, x, current_class):
    probas_for_each_word=[]

    for i in range(len(x)):
        
        if x[i]!=0:
            
            numerator=dictionary[current_class][features[i]]
            
            denominator=dictionary[current_class]['total']
            
            proba=np.log((numerator+1)/(denominator+len(x)))
            
            probas_for_each_word.append(proba)
            
    return sum(probas_for_each_word)


def predict_single(dic, x):
    classes = dictionary.keys()
    
    best_p = -1000
    best_class = -1
    
    first_run = True
   
    for current_class in classes:
        
        p_current_class = probability(dic, x, current_class)
        if (first_run or p_current_class > best_p):
            
            best_p = p_current_class
            best_class = current_class
            
        first_run = False
    
    return best_class


def predict(x_test, dic):
    y_pred=[]
    
    for doc in x_test:
        
        y_pred.append(predict_single(dic, doc))
    return y_pred

In [20]:
dictionary=fit(x_train_modified, y_train)

In [21]:
y_predicted=predict(x_test_modified, dictionary)

In [22]:
from sklearn.metrics import confusion_matrix
#printing the confusion matrix for our own naive bayes classifier.
#here i am manually printing the confusion matrix for a more clear view.
for i in confusion_matrix(y_true=y_test, y_pred=y_predicted):
    for j in i:
        print(j, end="    ")
    print()

91    0    0    0    0    0    0    0    0    0    0    0    0    1    0    8    1    0    1    12    
0    118    4    11    5    7    2    0    0    0    0    1    1    1    0    1    0    0    0    1    
0    11    99    17    2    9    0    1    0    0    0    0    0    0    0    0    0    0    0    0    
0    9    3    122    7    2    4    2    0    0    0    1    1    0    0    1    0    0    0    0    
1    4    0    16    108    0    5    2    1    0    0    0    0    1    0    0    0    0    0    0    
0    25    2    6    3    113    0    1    2    0    0    0    1    0    0    0    0    0    0    0    
0    7    0    8    0    1    112    5    2    3    1    1    4    1    0    1    0    0    1    0    
0    2    0    2    0    0    6    115    6    1    0    0    4    1    0    0    0    0    0    0    
0    1    0    0    0    0    4    11    111    0    0    0    1    0    0    0    3    0    0    0    
0    1    0    2    0    0    0    0    0    127    1    0    1    0

In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_predicted))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83       114
           1       0.53      0.78      0.63       152
           2       0.90      0.71      0.80       139
           3       0.58      0.80      0.68       152
           4       0.78      0.78      0.78       138
           5       0.84      0.74      0.79       153
           6       0.75      0.76      0.75       147
           7       0.77      0.84      0.80       137
           8       0.88      0.85      0.86       131
           9       0.86      0.94      0.90       135
          10       0.98      0.88      0.93       136
          11       0.93      0.95      0.94       145
          12       0.86      0.62      0.72       157
          13       0.93      0.91      0.92       151
          14       0.98      0.83      0.90       155
          15       0.85      0.91      0.88       159
          16       0.81      0.89      0.84       140
          17       0.97    