In [478]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB


# Creating NLP Pipeline

In [427]:
###Tokenize
token = RegexpTokenizer('[a-zA-Z]+')

In [428]:
###USEFUL WORDS
def usefulWords(words):
    st = list(stopwords.words('english'))
    useful_word = []
    for word in words:
        if word not in st:
            useful_word.append(word)
    return useful_word
    

In [429]:
### stemming
def stem(words):
    sb = SnowballStemmer('english')
    for i in range(len(words)):
        words[i] = sb.stem(words[i])
    return words

In [430]:
def cleaned_doc(review):
    review = review.lower()
    review = token.tokenize(review)
    review = usefulWords(review)
    review = stem(review)
    return review

In [495]:
df = pd.read_csv('Train.csv')
X = df.iloc[:,0].values
Y = df.iloc[:,1].values
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.4)


# Using custom Function to calculate probability

In [496]:
def prior_probab(Y,Y_val):
    num = np.sum(Y==Y_val)
    denom = len(Y)
    return num/float(denom)

In [497]:
def cond_probab(X_train,Y_train,X_feature,Y_val):
    X_filter = X_train[Y_train==Y_val]
    num = np.sum(X_filter[:X_feature])
    denom = np.sum(X_filter)
    return num/float(denom)

In [498]:
def probab(X_train,Y_train,x_test):
    x_test = x_test.reshape((-1,))
    pred = []
    predictions = np.unique(Y_train)
    for val in predictions:
        cond=1.0
        for i in range(x_test.shape[0]):
            if x_test[i]!=0:
                cond1 =cond_probab(X_train,Y_train,i,val)
                cond = cond*cond1
        cond = cond*prior_probab(Y_train,val)
        pred.append(cond)   
    ans = np.argmax(pred)
    if(ans==0):
        return "neg"
    else:
        return "pos"

In [499]:
cv = CountVectorizer(tokenizer=cleaned_doc)
vec = cv.fit_transform(X_train).toarray()


# Using MultiNomial Naive Bayes Classifier 

In [None]:
mnb = MultinomialNB()
mnb.fit(vec,Y_train)

In [463]:
def prediction(X_test):
    Y_pred = []
    for i in range(len(X_test)):
        vec_ = cv.transform(np.array([X_test[i]]))
        ans = mnb.predict(vec_)
        Y_pred.append(ans)
    return Y_pred

In [468]:
Y_pred = prediction(X_test)

In [469]:
cnf_matrix = confusion_matrix(Y_test,Y_pred)

In [470]:
cnf_matrix

array([[165,  33],
       [ 58, 144]], dtype=int64)

# Using MultiVariate Burnoulli Naive Bayes'

In [489]:
bn = BernoulliNB()

In [490]:
bn.fit(vec,Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [491]:
def prediction_(X_test):
    Y_pred = []
    for i in range(len(X_test)):
        vec_ = cv.transform(np.array([X_test[i]]))
        ans = bn.predict(vec_)
        Y_pred.append(ans)
    return Y_pred

In [492]:
Y_pred_ = prediction_(X_test)

In [493]:
cnf_matrix = confusion_matrix(Y_test,Y_pred)

In [494]:
cnf_matrix

array([[165,  33],
       [ 58, 144]], dtype=int64)