In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer, HashingVectorizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from collections import defaultdict
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE
import re


In [2]:

data=pd.read_csv('data/train.csv')
X = data['text']
y = data['target']


# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    
    return X,y



# ML methods
SGD1 = SGDClassifier(loss='log')
SGD2 = SGDClassifier(loss='modified_huber')
SGD3 = SGDClassifier(loss='perceptron')
SGD4 = SGDClassifier()
          
list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}

list_of_models = {'SGD Classifier with Hinge Loss':SGD4,
                  'SGD Classifier with Log Loss':SGD1, 
                  'SGD Classifier with Modified Huber Loss':SGD2, 
                  'SGD Classifier with perceptron Loss':SGD3}


models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def get_Model_results(models,vectorizers,X,y):
    
    for Vectorizer_name, vectorizer in vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)

        X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=42)
        
        for model_name, model in models.items():
            start_time =  time.time()%60
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_build_the_model='%.2f' %(end_time-start_time)
            else:
                time_to_build_the_model='%.2f' %(start_time-end_time)


            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_predict='%.2f' %(end_time-start_time)
            else:
                time_to_predict='%.2f' %(start_time-end_time)


            print('\n-----------------\nModel: {}'.format(model_name))
            print(confusion_matrix(y_test,predicted))
            print(classification_report(y_test,predicted))
            print('\nAccuracy on Training:\n{00:.2f} %'.format(text_clf.score(X_train,y_train)*100))

            print('\nAccuracy on Testing:\n{00:.2f} %'.format(accuracy_score(y_test,predicted)*100))

            print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
            print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



get_Model_results(list_of_models,list_of_Vectorizers,X,y)




...........Results for Hashing Vectorizer..........

-----------------
Model: SGD Classifier with Hinge Loss
[[1127  184]
 [ 270 1025]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1311
           1       0.85      0.79      0.82      1295

    accuracy                           0.83      2606
   macro avg       0.83      0.83      0.83      2606
weighted avg       0.83      0.83      0.83      2606


Accuracy on Training:
93.55 %

Accuracy on Testing:
82.58 %

Time taken to build the model is 0.26 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: SGD Classifier with Log Loss
[[1104  207]
 [ 286 1009]]
              precision    recall  f1-score   support

           0       0.79      0.84      0.82      1311
           1       0.83      0.78      0.80      1295

    accuracy                           0.81      2606
   macro avg       0.81      0.81      0.81      2606
weighted avg       0.81      0.