In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer, HashingVectorizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from collections import defaultdict
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE
import re


In [2]:

data=pd.read_csv('data/train.csv')
X = data['text']
y = data['target']


# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    
    return X,y



# ML methods
LR1 = LogisticRegression()
LR2 = LogisticRegression(solver='newton-cg')
LR3 = LogisticRegression(solver='liblinear')
LR4 = LogisticRegression(solver='sag')
LR5 = LogisticRegression(solver='saga')
LR6 = LogisticRegression(penalty='l1',solver='liblinear')
LR7 = LogisticRegression(penalty='l1',solver='saga')
          
list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}

list_of_models = {'Logistic Regression with LBFGS solver and L2 penalty':LR1,
                  'Logistic Regression with Newton-CG solver and L2 penalty':LR2, 
                  'Logistic Regression with lib linear solver and L2 penalty':LR3, 
                  'Logistic Regression with SAG solver and L2 penalty':LR4, 
                  'Logistic Regression with SAGA solver and L2 penalty':LR5, 
                  'Logistic Regression with lib linear solver and L1 penalty':LR6, 
                  'Logistic Regression with SAG solver and L1 penalty':LR7 }


models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def get_Model_results(models,vectorizers,X,y):
    
    for Vectorizer_name, vectorizer in vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)

        X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=42)
        
        for model_name, model in models.items():
            start_time =  time.time()%60
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_build_the_model='%.2f' %(end_time-start_time)
            else:
                time_to_build_the_model='%.2f' %(start_time-end_time)


            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_predict='%.2f' %(end_time-start_time)
            else:
                time_to_predict='%.2f' %(start_time-end_time)


            print('\n-----------------\nModel: {}'.format(model_name))
            print(confusion_matrix(y_test,predicted))
            print(classification_report(y_test,predicted))
            print('\nAccuracy on Training:\n{00:.2f} %'.format(text_clf.score(X_train,y_train)*100))

            print('\nAccuracy on Testing:\n{00:.2f} %'.format(accuracy_score(y_test,predicted)*100))

            print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
            print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



get_Model_results(list_of_models,list_of_Vectorizers,X,y)




...........Results for Hashing Vectorizer..........

-----------------
Model: Logistic Regression with LBFGS solver and L2 penalty
[[1073  238]
 [ 288 1007]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      1311
           1       0.81      0.78      0.79      1295

    accuracy                           0.80      2606
   macro avg       0.80      0.80      0.80      2606
weighted avg       0.80      0.80      0.80      2606


Accuracy on Training:
86.61 %

Accuracy on Testing:
79.82 %

Time taken to build the model is 11.35 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: Logistic Regression with Newton-CG solver and L2 penalty
[[1073  238]
 [ 288 1007]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      1311
           1       0.81      0.78      0.79      1295

    accuracy                           0.80      2606
   macro avg       0.80      0.80 




-----------------
Model: Logistic Regression with SAG solver and L1 penalty
[[1072  239]
 [ 316  979]]
              precision    recall  f1-score   support

           0       0.77      0.82      0.79      1311
           1       0.80      0.76      0.78      1295

    accuracy                           0.79      2606
   macro avg       0.79      0.79      0.79      2606
weighted avg       0.79      0.79      0.79      2606


Accuracy on Training:
81.23 %

Accuracy on Testing:
78.70 %

Time taken to build the model is 3.77 Seconds

Time taken for prediction is 0.00 Seconds

...........Results for Count Vectorizer..........


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



-----------------
Model: Logistic Regression with LBFGS solver and L2 penalty
[[1057  254]
 [ 254 1041]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      1311
           1       0.80      0.80      0.80      1295

    accuracy                           0.81      2606
   macro avg       0.81      0.81      0.81      2606
weighted avg       0.81      0.81      0.81      2606


Accuracy on Training:
96.56 %

Accuracy on Testing:
80.51 %

Time taken to build the model is 0.66 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: Logistic Regression with Newton-CG solver and L2 penalty
[[1057  254]
 [ 255 1040]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      1311
           1       0.80      0.80      0.80      1295

    accuracy                           0.80      2606
   macro avg       0.80      0.80      0.80      2606
weighted avg       0.80      0.80 




-----------------
Model: Logistic Regression with SAG solver and L2 penalty
[[1059  252]
 [ 254 1041]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      1311
           1       0.81      0.80      0.80      1295

    accuracy                           0.81      2606
   macro avg       0.81      0.81      0.81      2606
weighted avg       0.81      0.81      0.81      2606


Accuracy on Training:
96.40 %

Accuracy on Testing:
80.58 %

Time taken to build the model is 59.58 Seconds

Time taken for prediction is 0.00 Seconds





-----------------
Model: Logistic Regression with SAGA solver and L2 penalty
[[1061  250]
 [ 254 1041]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      1311
           1       0.81      0.80      0.81      1295

    accuracy                           0.81      2606
   macro avg       0.81      0.81      0.81      2606
weighted avg       0.81      0.81      0.81      2606


Accuracy on Training:
96.10 %

Accuracy on Testing:
80.66 %

Time taken to build the model is 0.39 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: Logistic Regression with lib linear solver and L1 penalty
[[1064  247]
 [ 313  982]]
              precision    recall  f1-score   support

           0       0.77      0.81      0.79      1311
           1       0.80      0.76      0.78      1295

    accuracy                           0.79      2606
   macro avg       0.79      0.78      0.78      2606
weighted avg       0.79      0.79 




-----------------
Model: Logistic Regression with SAG solver and L1 penalty
[[1074  237]
 [ 308  987]]
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      1311
           1       0.81      0.76      0.78      1295

    accuracy                           0.79      2606
   macro avg       0.79      0.79      0.79      2606
weighted avg       0.79      0.79      0.79      2606


Accuracy on Training:
86.59 %

Accuracy on Testing:
79.09 %

Time taken to build the model is 4.70 Seconds

Time taken for prediction is 0.00 Seconds

...........Results for TFIDF Vectorizer..........

-----------------
Model: Logistic Regression with LBFGS solver and L2 penalty
[[1109  202]
 [ 273 1022]]
              precision    recall  f1-score   support

           0       0.80      0.85      0.82      1311
           1       0.83      0.79      0.81      1295

    accuracy                           0.82      2606
   macro avg       0.82      0.82      0.82

