In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer, HashingVectorizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from collections import defaultdict
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE
import re


In [2]:

data=pd.read_csv('data/train.csv')
X = data['text']
y = data['target']


# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    
    return X,y



# ML methods
NB1 = BernoulliNB(alpha=0.0)
NB2 = BernoulliNB(alpha=0.1)
NB3 = BernoulliNB(alpha=0.2)
NB4 = BernoulliNB(alpha=0.3)
NB5 = BernoulliNB(alpha=0.4)
NB6 = BernoulliNB(alpha=0.5)
NB7 = BernoulliNB(alpha=0.6)
NB8 = BernoulliNB(alpha=0.7)
NB9 = BernoulliNB(alpha=0.8)
NB10 = BernoulliNB(alpha=0.9)
NB11 = BernoulliNB(alpha=1.0)
          
list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}

list_of_models = {'Bernoulli Naive Bayes classifier with 0 Laplace Smoothing':NB1,
                  'Bernoulli Naive Bayes classifier with 0.1 Laplace Smoothing':NB2, 
                  'Bernoulli Naive Bayes classifier with 0.2 Laplace Smoothing':NB3, 
                  'Bernoulli Naive Bayes classifier with 0.3 Laplace Smoothing':NB4, 
                  'Bernoulli Naive Bayes classifier with 0.4 Laplace Smoothing':NB5,
                  'Bernoulli Naive Bayes classifier with 0.5 Laplace Smoothing':NB6,
                  'Bernoulli Naive Bayes classifier with 0.6 Laplace Smoothing':NB7, 
                  'Bernoulli Naive Bayes classifier with 0.7 Laplace Smoothing':NB8, 
                  'Bernoulli Naive Bayes classifier with 0.8 Laplace Smoothing':NB9, 
                  'Bernoulli Naive Bayes classifier with 0.9 Laplace Smoothing':NB10,
                  'Bernoulli Naive Bayes classifier with 1 Laplace Smoothing':NB11}


models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def get_Model_results(models,vectorizers,X,y):
    
    for Vectorizer_name, vectorizer in vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)

        X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=42)
        
        for model_name, model in models.items():
            start_time =  time.time()%60
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_build_the_model='%.2f' %(end_time-start_time)
            else:
                time_to_build_the_model='%.2f' %(start_time-end_time)


            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_predict='%.2f' %(end_time-start_time)
            else:
                time_to_predict='%.2f' %(start_time-end_time)


            print('\n-----------------\nModel: {}'.format(model_name))
            print(confusion_matrix(y_test,predicted))
            print(classification_report(y_test,predicted))
            print('\nAccuracy on Training:\n{00:.2f} %'.format(text_clf.score(X_train,y_train)*100))

            print('\nAccuracy on Testing:\n{00:.2f} %'.format(accuracy_score(y_test,predicted)*100))

            print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
            print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



get_Model_results(list_of_models,list_of_Vectorizers,X,y)




...........Results for Hashing Vectorizer..........





-----------------
Model: Bernoulli Naive Bayes classifier with 0 Laplace Smoothing
[[1083  228]
 [ 304  991]]
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      1311
           1       0.81      0.77      0.79      1295

    accuracy                           0.80      2606
   macro avg       0.80      0.80      0.80      2606
weighted avg       0.80      0.80      0.80      2606


Accuracy on Training:
95.38 %

Accuracy on Testing:
79.59 %

Time taken to build the model is 0.09 Seconds

Time taken for prediction is 0.12 Seconds

-----------------
Model: Bernoulli Naive Bayes classifier with 0.1 Laplace Smoothing
[[1109  202]
 [ 305  990]]
              precision    recall  f1-score   support

           0       0.78      0.85      0.81      1311
           1       0.83      0.76      0.80      1295

    accuracy                           0.81      2606
   macro avg       0.81      0.81      0.81      2606
weighted avg       0.81   




-----------------
Model: Bernoulli Naive Bayes classifier with 0 Laplace Smoothing
[[1071  240]
 [ 338  957]]
              precision    recall  f1-score   support

           0       0.76      0.82      0.79      1311
           1       0.80      0.74      0.77      1295

    accuracy                           0.78      2606
   macro avg       0.78      0.78      0.78      2606
weighted avg       0.78      0.78      0.78      2606


Accuracy on Training:
97.25 %

Accuracy on Testing:
77.82 %

Time taken to build the model is 0.01 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: Bernoulli Naive Bayes classifier with 0.1 Laplace Smoothing
[[1145  166]
 [ 322  973]]
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      1311
           1       0.85      0.75      0.80      1295

    accuracy                           0.81      2606
   macro avg       0.82      0.81      0.81      2606
weighted avg       0.82   




-----------------
Model: Bernoulli Naive Bayes classifier with 0 Laplace Smoothing
[[1072  239]
 [ 242 1053]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1311
           1       0.82      0.81      0.81      1295

    accuracy                           0.82      2606
   macro avg       0.82      0.82      0.82      2606
weighted avg       0.82      0.82      0.82      2606


Accuracy on Training:
98.68 %

Accuracy on Testing:
81.54 %

Time taken to build the model is 0.01 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: Bernoulli Naive Bayes classifier with 0.1 Laplace Smoothing
[[1137  174]
 [ 256 1039]]
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1311
           1       0.86      0.80      0.83      1295

    accuracy                           0.83      2606
   macro avg       0.84      0.83      0.83      2606
weighted avg       0.84   