# Naive Bayes with Not-Predetermined Vocabulary

Reference code: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

# Pre-processing

Imports and loading dataset

In [None]:
import pandas as pd
import nltk
import seaborn as sns
import re
import numpy as np
import math
import warnings
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix
warnings.filterwarnings("ignore")

In [None]:
# Fetch the NLTK data packages this notebook relies on.
# 'stopwords' is the corpus behind nltk.corpus.stopwords (imported at the top).
nltk.download('stopwords')
# 'punkt' is presumably the tokenizer model behind nltk.word_tokenize, which
# preprocess() calls — TODO confirm the package name for the installed NLTK version.
nltk.download('punkt')

In [None]:
# Load the labelled comments; later cells read the 'comment_text' and 'label' columns.
df = pd.read_csv('train1_full.csv') 
# Preview the first 10 rows as the cell output.
df.head(10)

In [None]:
def preprocess(df):
    """Add sentence-level and token-level views of each comment to *df* in place.

    The ``comment_text`` column is lower-cased, then split on '.' into the new
    ``document_sentences`` column. ``tokenized_sentences`` holds, per document,
    the ``nltk.word_tokenize`` output of each sentence, leaving out sentences
    that produce no tokens. Nothing is returned; *df* itself is modified.
    """
    lowered = df['comment_text'].str.lower()
    df['comment_text'] = lowered
    df['document_sentences'] = df['comment_text'].str.split('.')

    tokenized_docs = []
    for doc_sentences in df['document_sentences']:
        token_lists = [nltk.word_tokenize(piece) for piece in doc_sentences]
        # Empty pieces (e.g. after a trailing '.') tokenize to [] and are dropped.
        tokenized_docs.append([tokens for tokens in token_lists if tokens])
    df['tokenized_sentences'] = tokenized_docs

# Run the preprocessing on the full dataset; it adds its columns to df in place
# and returns nothing.
preprocess(df)

Split data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# notebook runs; without it each run evaluates on a different test set.
train, test, y_train, y_test = train_test_split(
    df.drop(columns='label'), df['label'], test_size=.2, random_state=42
)

In [None]:
def remove_items(test_list, item):
    """Remove every occurrence of *item* from *test_list* in place.

    Args:
        test_list: List to filter. It is modified in place.
        item: Value to remove; elements equal to it are dropped.

    Returns:
        The same list object that was passed in, now without *item*.
    """
    # Rebuild the contents via slice assignment instead of calling remove()
    # while iterating: removing during iteration shifts the remaining elements
    # and skips the one right after each removal, so adjacent duplicates
    # (e.g. ['a', 'a']) were left behind.
    test_list[:] = [element for element in test_list if element != item]
    return test_list

In [None]:
# Stop words dropped by preprocess_text(). Entries are written without
# apostrophes because punctuation is stripped before matching. Built once as a
# frozenset so each membership test is O(1).
_STOP_WORDS = frozenset([
    'u', 'ur', 'im', 'can', 'cant', 'i', 'me', 'my', 'myself', 'we', 'go',
    'our', 'ours', 'ourselves', 'you', "youre", "youve", "youll", "youd",
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "thatll", 'these',
    'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
    'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
    'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just', 'don',
    "don't", 'should', "shouldve", 'now', 'd', 'll', 'm', 'o', 're', 'r',
    've', 'y', 'ain', 'aren', "arent", 'couldn', "couldnt", 'didn', "didnt",
    'doesn', "doesnt", 'hadn', "hadnt", 'hasn', "hasnt", 'haven', "havent",
    'isn', "isnt", 'ma', 'mightn', "mightnt", 'mustn', "mustnt", 'needn',
    "neednt", 'shan', "shant", 'shouldn', "shouldnt", 'wasn', "wasnt",
    'weren', "werent", 'won', "wont", 'wouldn', "wouldnt",
])


def preprocess_text(sen):
    """Normalise one comment into a space-separated string of content words.

    Steps: lower-case, replace every character that is not an ASCII letter
    with a space, drop single letters that stand between whitespace, then
    remove every stop word.

    Args:
        sen: Raw comment text.

    Returns:
        The cleaned text with words separated by single spaces and no leading
        or trailing whitespace; an empty string if nothing is left.
    """
    sentence = sen.lower()

    # Punctuation and digits become spaces, so "don't" turns into "don t".
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Drop a lone letter surrounded by whitespace (e.g. the "t" of "don t").
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # split() already collapses whitespace runs and trims both ends, so no
    # extra space-normalising passes are needed before or after joining.
    tokens = sentence.split()

    # Filter in a single pass so that *every* occurrence of a stop word is
    # removed, including immediately repeated ones such as "the the".
    return " ".join(token for token in tokens if token not in _STOP_WORDS)

Get a list of all messages and labels

In [None]:
# Clean every training comment, in order, with preprocess_text().
sentences = list(train["comment_text"])
X_train = [preprocess_text(sen) for sen in sentences]

# Labels as a NumPy array for the classifier.
y_train = np.asarray(y_train)

In [None]:
# Clean every test comment, in order, with preprocess_text().
sentences1 = list(test["comment_text"])
X_test = [preprocess_text(sen) for sen in sentences1]

# Labels as a NumPy array for the evaluation metrics.
y_test = np.asarray(y_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training comments only, then reuse the
# fitted vectorizer to transform the test comments, so the test set never
# influences the learned features.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_matrix = vectorizer.fit_transform(X_train)
X_test_matrix = vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier, with its default settings, on the
# vectorized training comments and their labels.
NBClf = MultinomialNB()
NBClf.fit(X_train_matrix, y_train)

In [None]:
# Predict a label for every vectorized test comment.
predictions = NBClf.predict(X_test_matrix)

In [None]:
from sklearn import metrics
# Overall accuracy of the predictions against the true test labels.
print("Accuracy:",metrics.accuracy_score(y_test, predictions))

In [None]:
# Confusion matrix; normalize='true' normalizes over the true labels (rows).
print(confusion_matrix(y_test,predictions,normalize='true'))
from sklearn.metrics import classification_report
# Text report of per-class metrics for the same predictions.
print(classification_report(y_test,predictions))