In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer, HashingVectorizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from collections import defaultdict
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE
import re


In [2]:

data=pd.read_csv('data/train.csv')
X = data['text']
y = data['target']


# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    
    return X,y



# ML methods
SVM1 = SVC()
SVM2 = SVC(kernel='poly', degree=3)
SVM3 = SVC(kernel='poly', degree=4)
SVM4 = SVC(kernel='poly', degree=5)
SVM5 = SVC(kernel='rbf')
SVM6 = SVC(kernel='sigmoid')

          
list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}

# list of thr models as pipes
list_of_models = {'SVM Classifier with Linear Kernel':SVM1,
                  'SVM Classifier with 3rd order Polynomial Kernel':SVM2, 
                  'SVM Classifier with 4th order Polynomial Kernel':SVM3, 
                  'SVM Classifier with 5th order Polynomial Kernel':SVM4, 
                  'SVM Classifier with RBF Kernel':SVM5, 
                  'SVM Classifier with Sigmoid Kernel':SVM6}


models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def get_Model_results(models,vectorizers,X,y):
    
    for Vectorizer_name, vectorizer in vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)

        X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=42)
        
        for model_name, model in models.items():
            start_time =  time.time()%60
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_build_the_model='%.2f' %(end_time-start_time)
            else:
                time_to_build_the_model='%.2f' %(start_time-end_time)


            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_predict='%.2f' %(end_time-start_time)
            else:
                time_to_predict='%.2f' %(start_time-end_time)


            print('\n-----------------\nModel: {}'.format(model_name))
            print(confusion_matrix(y_test,predicted))
            print(classification_report(y_test,predicted))
            print('\nAccuracy on Training:\n{00:.2f} %'.format(text_clf.score(X_train,y_train)*100))

            print('\nAccuracy on Testing:\n{00:.2f} %'.format(accuracy_score(y_test,predicted)*100))

            print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
            print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



get_Model_results(list_of_models,list_of_Vectorizers,X,y)




...........Results for Hashing Vectorizer..........

-----------------
Model: SVM Classifier with Linear Kernel
[[1153  158]
 [ 307  988]]
              precision    recall  f1-score   support

           0       0.79      0.88      0.83      1311
           1       0.86      0.76      0.81      1295

    accuracy                           0.82      2606
   macro avg       0.83      0.82      0.82      2606
weighted avg       0.83      0.82      0.82      2606


Accuracy on Training:
95.94 %

Accuracy on Testing:
82.16 %

Time taken to build the model is 7.86 Seconds

Time taken for prediction is 2.37 Seconds

-----------------
Model: SVM Classifier with 3rd order Polynomial Kernel
[[1205  106]
 [ 429  866]]
              precision    recall  f1-score   support

           0       0.74      0.92      0.82      1311
           1       0.89      0.67      0.76      1295

    accuracy                           0.79      2606
   macro avg       0.81      0.79      0.79      2606
weighted 


Accuracy on Training:
98.47 %

Accuracy on Testing:
71.76 %

Time taken to build the model is 7.60 Seconds

Time taken for prediction is 2.62 Seconds

-----------------
Model: SVM Classifier with 5th order Polynomial Kernel
[[1290   21]
 [ 780  515]]
              precision    recall  f1-score   support

           0       0.62      0.98      0.76      1311
           1       0.96      0.40      0.56      1295

    accuracy                           0.69      2606
   macro avg       0.79      0.69      0.66      2606
weighted avg       0.79      0.69      0.66      2606


Accuracy on Training:
97.65 %

Accuracy on Testing:
69.26 %

Time taken to build the model is 10.11 Seconds

Time taken for prediction is 3.72 Seconds

-----------------
Model: SVM Classifier with RBF Kernel
[[1188  123]
 [ 286 1009]]
              precision    recall  f1-score   support

           0       0.81      0.91      0.85      1311
           1       0.89      0.78      0.83      1295

    accuracy         