# Project: Text Classification

## Neccessary Imports

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import enchant
import re

In [2]:
# Fetching the data on which Naive Bayes is to be used

newsgroup = fetch_20newsgroups()

In [3]:
# Splitting the data between the test and train

x_train, x_test, y_train, y_test=train_test_split(newsgroup.data, newsgroup.target, random_state=1)

In [4]:
# 1. Making a list of all the stopwords present in the English language 
# 2. I have written all the stopwords in a text file and then loading it here to make the document
# look clean

stops = np.loadtxt("data/stop_words_english.txt", dtype=str, delimiter=" ")
stopwords = list(stops)

In [5]:
# 1. 'find_valid_words_in_text_list' is function which return a dictionary with all words 
# used in the training dataset with its frequency
# 2. I have lowercase all the words in the training data set to have a consistent dictionary
# 3. I have imported 'enchant' to check if the word in the document is valid english words or not
# 4. I have excluded the digits in the data because they give very little info about the classification
# 5. I have excluded words with length less that 3 to have more informative words in dictionary

def find_valid_words_in_text_list(txt_list, stopwords) :
    wordsList = []
    for i in range(len(txt_list)) :
        res = re.findall(r'\w+', txt_list[i])
        wordsList.extend(res)
        
    d = enchant.Dict('en-US')

    wordsFreq = {}
    for word in wordsList :
        if (word.lower() in wordsFreq) or ((word.lower() not in stopwords) 
                                       and (word.isdigit() is False) 
                                       and (d.check(word)) 
                                       and len(word)!= 1 and len(word)!=2) :
            wordsFreq[word.lower()] = wordsFreq.get(word.lower(), 0) + 1
            
    return wordsFreq

In [6]:
wordsFreq = find_valid_words_in_text_list(x_test, stopwords)

# reversing the dictionary according to the values to get most frequent words at starting

desc_wordsFreq = sorted(wordsFreq.items(), key = lambda x : x[1], reverse=True)
desc_wordsFreq = dict(desc_wordsFreq)

In [7]:
# 'words' is a list containing all the words used in training data set
# 'freq' is a list conataining the frequency of those words

words=np.array([i for i in desc_wordsFreq.keys()])
freq=np.array([i for i in desc_wordsFreq.values()])

In [8]:
# 'gen_data_table' function return the 2d array where columns represent 'features' or 'words'
# that are selected from the training data set text documents and rows represent the number of
# times those 'features' or 'words' appear in each document 

def gen_data_table(x_data, features):
    
    word_feature_map = np.zeros((len(x_data), len(features)))
    
    for i in range(len(x_data)):
        current_words = re.findall(r'\w+', x_data[i])
        tmpdct = {}
        for word in current_words :
            tmpdct[word] = tmpdct.get(word, 0) + 1
            
        for j in range(len(features)) :
            if features[j] in tmpdct:
                word_feature_map[i][j] = tmpdct[features[j]]
                
    return word_feature_map 

In [9]:
# selecting 4000 features for the classification

features = words[0:4000]

x_train_2d = gen_data_table(x_train, features)

x_test_2d = gen_data_table(x_test, features)

In [10]:
# Initializing scikit learn's Naive Bayes' algorithm
# MultinomialNB is used because multiple features are present in the training set

clf=MultinomialNB()
clf.fit(x_train_2d, y_train)
clf.score(x_test_2d, y_test)
y_pred_func = clf.predict(x_test_2d)

## Own Implementation Of Naive Bayes

In [11]:
# 1. The fit function takes the training data as input and creates a dictionary of dictionries 
# with name 'result'
# 2. The base dictionary contains all the possible classes in which to classify our text doc
# 3. The second level dictionary contains how many times a particular feature is coming
# corresponding to a particular class
# 4. A 'total' key in included to store the total number of words that occurs in a particular class

def fit(x_train, y_train):
    result = {}
    all_classes = set(y_train)
    for current_class in all_classes:
        
        result[current_class] = {}
        x_train_current = x_train[y_train == current_class]

        result[current_class]['total']=0

        for j in range(len(features)):
            result[current_class][features[j]]=x_train_current[:, j].sum()
            
            result[current_class]['total'] += result[current_class][features[j]]

    return result

In [12]:
# 1.'probaility' function first stores all the probabilities for each word in features which is 
# available in the current document and then return the logarithmic sum of all these probabilities
# for that particular document.
# 2. skipping those features whose value corresponding to the current document is 0, 
# that is, they are not present
# 3. 'numerator' - it denotes that how many times that ith feature is occuring in "current_class"
#this is the numerator of our NON LOGARITHMIC PROBABILITY
# 4. 'denominator' - it denotes that what is the total number of words in the current class.
# this is the denominator of our NON LOGARITHMIC PROBABILITY
# 5. 'proba' - stores the logarithmic probability which also includes the laplace crrection.
# here "1" with the numerator and len(x) with denominator denotes the laplace correction.

def probability(dictionary, x, current_class):
    probas_for_each_word=[]

    for i in range(len(x)):
        if x[i]!=0:

            numerator=dictionary[current_class][features[i]]
            
            denominator=dictionary[current_class]['total']
            
            proba=np.log((numerator+1)/(denominator+len(x)))

            probas_for_each_word.append(proba)

    return sum(probas_for_each_word)

In [13]:
# 1. 'Predict_single' function returns the best class to which a particular text document best
# belongs to according to the maximum probability score
# 2. the loop inside this function finds out the probability of a document corresponding to 
# each class if the probability of current class is better than the best probability then 
# it will update the best probability and best_class accordingly

def predict_single(dictionary, x):
    classes = dictionary.keys()

    best_p = -1000
    best_class = -1
    
    first_run = True

    for current_class in classes:
        #iterating through each and every class in all possible classes.
        p_current_class = probability(dictionary, x, current_class)
        #p_current_class denotes the probability of current class.
        if (first_run or p_current_class > best_p):
    
            best_p = p_current_class
            best_class = current_class

        first_run = False

    return best_class

In [14]:
# 'predict' function return a list with the predicted classes or all testing data set

def predict(x_test, dictionary):
    y_pred=[]
    
    for doc in x_test:
        
        y_pred.append(predict_single(dictionary, doc))
        
    return y_pred

In [15]:
# Fitting training data

dictionary=fit(x_train_2d, y_train)

In [16]:
y_predicted=predict(x_test_2d, dictionary)

## Comparing MultinomialNB with our own Implementation of Naive Bayes

In [17]:
#printing the classification report for inbuilt naive bayes classifier.

print(classification_report(y_true=y_test, y_pred=y_pred_func))

              precision    recall  f1-score   support

           0       0.78      0.79      0.79       114
           1       0.66      0.72      0.69       152
           2       0.67      0.72      0.69       139
           3       0.64      0.68      0.66       152
           4       0.67      0.67      0.67       138
           5       0.78      0.75      0.76       153
           6       0.75      0.78      0.76       147
           7       0.74      0.80      0.77       137
           8       0.79      0.85      0.82       131
           9       0.84      0.87      0.85       135
          10       0.88      0.88      0.88       136
          11       0.94      0.90      0.92       145
          12       0.77      0.65      0.71       157
          13       0.90      0.91      0.90       151
          14       0.91      0.83      0.87       155
          15       0.80      0.84      0.82       159
          16       0.81      0.86      0.83       140
          17       0.84    

In [18]:
# printing the classification report for our own naive bayes classifier.

print(classification_report(y_true=y_test, y_pred=y_predicted))

              precision    recall  f1-score   support

           0       0.83      0.75      0.79       114
           1       0.58      0.72      0.64       152
           2       0.65      0.75      0.70       139
           3       0.58      0.70      0.64       152
           4       0.60      0.63      0.62       138
           5       0.78      0.67      0.72       153
           6       0.76      0.69      0.73       147
           7       0.72      0.80      0.76       137
           8       0.86      0.79      0.83       131
           9       0.85      0.87      0.86       135
          10       0.90      0.90      0.90       136
          11       0.91      0.94      0.93       145
          12       0.81      0.56      0.66       157
          13       0.93      0.90      0.92       151
          14       0.95      0.81      0.87       155
          15       0.76      0.89      0.82       159
          16       0.83      0.84      0.84       140
          17       0.85    