In [14]:
import numpy as np                                                  # to preform numerical operations
from sklearn import datasets                                        # to import required dataset
from sklearn import model_selection                                 # to split data into training and testing data
from nltk.corpus import stopwords                                   # for removing stopwords
import itertools                                                    # for slicing vocabulary dictionary into required length
import  pandas as pd                                                # for dataset manipulation
from sklearn.metrics import classification_report, confusion_matrix # for getting info of result precision and accuracy
from nltk.tokenize import word_tokenize                             # for tokenizing  
import math as ma

In [15]:
stop_words=set(stopwords.words('english'))                                            # get the stopwords 
news=datasets.fetch_20newsgroups()                                                    # load data into news 
x_train,x_test,y_train,y_test=model_selection.train_test_split(news.data,news.target,test_size=0.1) # split data into test and train    

In [16]:
print(stop_words)                                                   # Just checking what are the stopwords

{'should', 'such', 'which', 'in', "hasn't", 'of', 'his', 'them', 'the', 'll', "you've", 'too', 'me', 'above', 'was', "doesn't", 'being', 'by', 'myself', 'their', 'couldn', 'against', 'won', 'out', 'into', "you'd", 'same', 'have', "it's", 'both', 'most', 'hasn', 'so', 'mightn', 'shan', 'i', 'haven', 'shouldn', 'himself', 'own', 'hers', 'but', 't', 'for', 'all', 'any', 'is', 'wouldn', "needn't", 'where', 'each', 'to', 'off', 'doesn', 'didn', 'theirs', "should've", 'these', 'yourselves', 'he', 'an', 'from', 'it', 'who', 'with', 'on', 'under', 'very', 'why', 'does', 'there', 'while', "shan't", 'doing', 'him', "don't", 'ma', 'hadn', 'ourselves', 'about', 's', 'than', 'we', "weren't", "mightn't", 'this', 'did', 'be', 'mustn', "that'll", 'ours', 'ain', 'now', 'needn', 'how', 'her', 'other', 'over', 'will', 'those', 'themselves', 'its', 'down', 'your', 'are', 'what', 'before', 'a', 'after', 'then', 'more', 'they', 'or', "haven't", "shouldn't", 'd', 'no', "wouldn't", 'just', 'am', 'were', "hadn

In [17]:
#import nltk
#nltk.download('punkt')                                             # needed to be done first time
d={}                                                                # d is going to be our vocabulary dictionary
for i in range(len(x_train)):                                       # this function filters stopwords and creates vocabulary
    data=x_train[i]                                                 # taken one row at a time of training data
    words=word_tokenize(data)                                       # tokenize the data
    filtered_sentence=[ele for ele in words if ele not in stop_words] # get filtered sentence
    for ele in filtered_sentence:                                   # create vocabulary from filtered sentence    
        d[ele]=d.get(ele,0)+1                                         
new_dict={}                                                         # this will be our final vocabulary dictionary
for key,value in sorted(d.items(), key=lambda kv: kv[1],reverse=True): # put the key,value pair from d in decreasing order of value
    new_dict[key]=value
new_dict=dict(itertools.islice(new_dict.items(),6000))             # Take the top 6000 elements into final vocabulary
features=[]
for i in new_dict:
    features.append(i)

In [18]:
print(new_dict)                                                     # just checking final created vocabulary dictionary
print(features.shape)



AttributeError: 'list' object has no attribute 'shape'

In [20]:
def updatedRow(words,features,i):                                   
    lst=np.array([0]*6000)                                         # Initialise the lst to be np.zeros with dimensions 1 X 6000                                                           # i will help us iterate our lst 
    for ele in words:
        if ele in features:
            lst[features.index(ele)]=words.count(ele)
    return lst

def updateXTrain(x_train,features):
    x_train_updated=pd.DataFrame(np.zeros((len(x_train),6000)))     # Intialise the np.zeros 2-D array to x_train_updated
    x_train_updated.columns=features                                # Add columns to the x_train_updated
    for i in range(len(x_train)):                                   # Now operate on each row
        curr_x=x_train[i]                                           # curr_x contains the row i content
        words=word_tokenize(curr_x)                                 # extract all words of curr_x into words
        x_train_updated.iloc[i,:]=updatedRow(words,features,i)      # replace row i with the updateRow() function's list
    return x_train_updated                                          # return x_train_updated

x_train_updated=updateXTrain(x_train,features)                      # get updated x_train_updated
x_train_updated.columns                                             # just checking the columns

Index(['>', ',', '.', '--', ':', ')', '(', ''AX', '@', 'I',
       ...
       'laid', 'prize', 'interview', 'mono', '508', 'Ross', 'throttle',
       'Title', 'Sea', 'silver'],
      dtype='object', length=6000)

In [21]:
def fit(X_train, Y_train):                                          # This function is used to fit training data into our model
    result = {}                                                     # This dictionary is going to be useful in later calculations
    class_values = set(Y_train) 
    for current_class in class_values:                              # We create keys for all the possible classes
        result[current_class] = {}                                  # We create a dictionary as value for each key itself
        result["total"] = len(Y_train)                              # It gives total element present in our dictionary
        current_class_rows = (Y_train == current_class)             # Obtain rows for the current class
        X_train_current = X_train[current_class_rows]               # Filter the x_train for current class
        Y_train_current = Y_train[current_class_rows]               # Filter the y_train for current class
        num_features = X_train.shape[1]                             
        result[current_class]["count"] = len(Y_train_current)       # It gives total number of features of our data
        a=0                                                         # To get total number of a particular feature 
        for j in range(num_features):                               # traverse each feature
            result[current_class][j]=X_train_current[:,j].sum()     # Get total number of current feature
            a+=result[current_class][j]                             # Increment a by total number of current feature
        result[current_class]['total']=a                            # Assign a, will be used in later calculations
    return result                                                   # Return result


x_train_updated=np.array(x_train_updated)                           # Change pandas dataframe to numpy array
d=fit(x_train_updated,y_train)                                      # Fit ur trainig data 

In [22]:
def probability(dictionary, x, current_class): 
    output = ma.log(dictionary[current_class]["total"]) - ma.log(dictionary["total"])
    num_features = len(dictionary[current_class].keys()) - 2
    for j in range( num_features ):
        if x[j]==0:
            continue
        count_current_class_with_value_j = dictionary[current_class][j]+1
        count_current_class = dictionary[current_class]['total']+num_features
        current_j_probablity = ma.log(count_current_class_with_value_j) - ma.log(count_current_class)
        output = output + current_j_probablity
    return output

def doSinglePrediction(x,dictionary):                                  # Function to predict class for a single row
    classes = dictionary.keys()                                        # Get all possible classes
    best_p = -100                                                      # Initialise best probablity & class to some -ve number
    best_class = -100 
    first_run = True                                                   # Running for first time = True
    for current_class in classes:                                      # Iterate over each possible class
        if (current_class == "total"):                                 # Ignore 'total' key
            continue
        p_current_class = probability(dictionary, x, current_class)    # Get probablity for x belonging to current class
        if (first_run or p_current_class > best_p):                    # If this is greatest probablity till now, change the
            best_p = p_current_class                                    # value of greatest probability & predicted class
            best_class = current_class
        first_run = False                                              # First run complete
    return best_class                                                  # Return the predicted class out of all classes

def y_predict(x_test,d,features):                                       # Function to predict the output values for test data
    y_pred=np.zeros(len(x_test))                                        # We will use this numpy array to store predictions
    x_test_updated=updateXTrain(x_test,features)                        # Update x_test so that our model can operate on it
    x_test_updated=np.array(x_test_updated)                             # Change pandas dataframe to numpy array
    for i in range(len(x_test)):                                                                                                        # Extract one-one row from data
        curr_x=x_test_updated[i,:]                                      # Extract one-one row 
        y_pred[i]=doSinglePrediction(curr_x,d)                          # Predict class for current row, store the prediction
    return y_pred                                                       # Return predictions


In [23]:
y_pred=y_predict(x_test,d,features)                                     # Call function to predict classes for x_test
print(classification_report(y_test,y_pred))                             # Print classification report
print(confusion_matrix(y_test,y_pred))                                  # Print confusion matrix


              precision    recall  f1-score   support

           0       0.93      0.88      0.90        42
           1       0.50      0.69      0.58        54
           2       1.00      0.06      0.11        54
           3       0.56      0.61      0.58        66
           4       0.53      0.84      0.65        56
           5       0.92      0.70      0.79        66
           6       0.71      0.88      0.79        50
           7       0.75      0.85      0.80        61
           8       0.86      0.85      0.86        60
           9       0.91      0.89      0.90        66
          10       0.96      0.87      0.91        55
          11       0.91      0.96      0.94        54
          12       0.71      0.83      0.77        60
          13       0.90      0.83      0.86        52
          14       0.86      0.92      0.89        60
          15       0.89      0.93      0.91        60
          16       0.96      0.81      0.88        62
          17       0.97    

In [24]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.7941696113074205


In [25]:
print('\n\n\t\tCOMPARING WITH LIBRARY IMPLEMENTATION')      # Compare built model with the sklearn's inbuilt model

from sklearn.naive_bayes import MultinomialNB               # Import MultinomialDB from sklearn
clf = MultinomialNB()                                       # Create object of MultinomialDB
clf.fit(x_train_updated, y_train)                           # Fit the training data
x_test_updated=updateXTrain(x_test,features)                # Update x_test so that model can operate on it
x_test_updated=np.array(x_test_updated)                     # Convert to np array
y_pred = clf.predict(x_test_updated)                        # Do predictions
print(classification_report(y_test,y_pred))                 # Print classification report
print(confusion_matrix(y_test,y_pred))                      # Print confusion matrix
print(accuracy_score(y_test,y_pred))



		COMPARING WITH LIBRARY IMPLEMENTATION
              precision    recall  f1-score   support

           0       0.90      0.88      0.89        42
           1       0.47      0.65      0.55        54
           2       0.60      0.06      0.10        54
           3       0.60      0.65      0.62        66
           4       0.59      0.82      0.69        56
           5       0.64      0.67      0.65        66
           6       0.65      0.88      0.75        50
           7       0.75      0.74      0.74        61
           8       0.75      0.90      0.82        60
           9       0.89      0.88      0.89        66
          10       0.96      0.87      0.91        55
          11       0.94      0.89      0.91        54
          12       0.73      0.78      0.76        60
          13       0.93      0.83      0.88        52
          14       0.91      0.85      0.88        60
          15       0.95      0.92      0.93        60
          16       0.84      0.76      

On comparing above results we can see that our implementation is performing better than the library implementation of Naive Bayes.