In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from sklearn.linear_model import SGDClassifier

In [0]:

# Seven logistic-regression configurations (solver / penalty variants) evaluated
# on a single 70/30 hold-out split, i.e. without cross-validation.
df=pd.read_csv('/data/workspace_files/train.csv')
X = df['text']
y = df['target']


# A fixed random_state keeps the train/test split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# One pipeline per configuration: token counts -> TF-IDF weighting -> classifier.
# Only the classifier's solver/penalty differs between pipelines.
pipe1 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression()),])

pipe2 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='newton-cg')),])

pipe3 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='liblinear')),])

pipe4 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='sag')),])

pipe5 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='saga')),])

pipe6 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(penalty='l1',solver='liblinear')),])

pipe7 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(penalty='l1',solver='saga')),])


# Display name -> pipeline. Despite its name this is a dict; insertion order is
# the order in which the models are evaluated and reported.
# The last label names SAGA because pipe7 is built with solver='saga'.
list_of_models = {'Logistic Regression with LBFGS solver and L2 penalty':pipe1,
                  'Logistic Regression with Newton-CG solver and L2 penalty':pipe2,
                  'Logistic Regression with lib linear solver and L2 penalty':pipe3,
                  'Logistic Regression with SAG solver and L2 penalty':pipe4,
                  'Logistic Regression with SAGA solver and L2 penalty':pipe5,
                  'Logistic Regression with lib linear solver and L1 penalty':pipe6,
                  'Logistic Regression with SAGA solver and L1 penalty':pipe7 }


def get_Model_results(models,X_train,y_train,X_test,y_test):
    """Fit each model on the training split and print its hold-out evaluation.

    For every ``(name, estimator)`` pair in ``models`` the estimator is fitted
    on the training data and used to predict the test data. The confusion
    matrix, the classification report, the training and testing accuracy (as
    percentages) and the elapsed fit / predict times are printed.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator providing ``fit``, ``predict``
        and ``score``.
    X_train, y_train
        Training inputs and labels, passed to ``fit`` and ``score``.
    X_test, y_test
        Held-out inputs and labels used for prediction and the metrics.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Iterate the mapping that was passed in, not the module-level global, so
    # the function evaluates exactly what the caller asked for.
    for model_name, model in models.items():

        # perf_counter() is a monotonic, high-resolution clock; subtracting two
        # readings gives the true elapsed time even when the measurement spans
        # a minute boundary or lasts longer than 60 seconds.
        fit_started = time.perf_counter()
        text_clf = model.fit(X_train, y_train)
        time_to_build_the_model = time.perf_counter() - fit_started

        predict_started = time.perf_counter()
        predicted = text_clf.predict(X_test)
        time_to_predict = time.perf_counter() - predict_started

        print('\n................................................\nModel {}'.format(model_name))
        print(confusion_matrix(y_test,predicted))
        print(classification_report(y_test,predicted))
        print('\nAccuracy on Training:\n{:.2f}'.format(text_clf.score(X_train,y_train)*100))

        print('\nAccuracy on Testing:\n{:.2f}'.format(accuracy_score(y_test,predicted)*100))

        print('\nTime taken to build the model is {:.2f} Seconds'.format(time_to_build_the_model))
        print('\nTime taken for prediction is {:.2f} Seconds'.format(time_to_predict))



# Evaluate every configured pipeline on the single hold-out split.
get_Model_results(
    models=list_of_models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



................................................
Model Logistic Regression with LBFGS solver and L2 penalty
[[1157  161]
 [ 280  686]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      1318
           1       0.81      0.71      0.76       966

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.80      2284
weighted avg       0.81      0.81      0.80      2284


Accuracy on Training:
88.87

Accuracy on Testing:
80.69

Time taken to build the model is 1.07 Seconds

Time taken for prediction is 0.19 Seconds

................................................
Model Logistic Regression with Newton-CG solver and L2 penalty
[[1157  161]
 [ 280  686]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      1318
           1       0.81      0.71      0.76       966

    accuracy                           0.81      2284
   macro avg       0.81      0.7



In [0]:

from collections import defaultdict
# The same seven logistic-regression configurations, this time evaluated with
# 10-fold cross-validation, including the time needed to fit and to predict.

df=pd.read_csv('/data/workspace_files/train.csv')
X = df['text']
y = df['target']



# One pipeline per configuration: token counts -> TF-IDF weighting -> classifier.
# Only the classifier's solver/penalty differs between pipelines.
pipe1 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression()),])

pipe2 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='newton-cg')),])

pipe3 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='liblinear')),])

pipe4 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='sag')),])

pipe5 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(solver='saga')),])

pipe6 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(penalty='l1',solver='liblinear')),])

pipe7 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(penalty='l1',solver='saga')),])



# Display name -> pipeline. Despite its name this is a dict; insertion order is
# the order in which the models are evaluated and reported.
# The last label names SAGA because pipe7 is built with solver='saga'.
list_of_models = {'Logistic Regression with LBFGS solver and L2 penalty':pipe1,
                  'Logistic Regression with Newton-CG solver and L2 penalty':pipe2,
                  'Logistic Regression with lib linear solver and L2 penalty':pipe3,
                  'Logistic Regression with SAG solver and L2 penalty':pipe4,
                  'Logistic Regression with SAGA solver and L2 penalty':pipe5,
                  'Logistic Regression with lib linear solver and L1 penalty':pipe6,
                  'Logistic Regression with SAGA solver and L1 penalty':pipe7 }


# Module-level result containers. A defaultdict created without a default
# factory behaves like a plain dict (a missing key still raises KeyError).
models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def _select_rows(data, positions):
    """Return the entries of ``data`` at the given integer positions.

    Containers exposing ``.iloc`` (pandas Series / DataFrame) are indexed
    positionally, so the result does not depend on the index labels. Anything
    else (e.g. a NumPy array) is indexed directly.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def CV_model_score(models,X,y):
    """Evaluate each model with shuffled 10-fold cross-validation and print a summary.

    For every ``(name, estimator)`` pair in ``models`` the estimator is fitted
    and scored on each fold. The mean training accuracy, mean testing accuracy
    (percentages) and mean fit / predict times are printed per model.

    Side effects: the per-fold values are stored in the module-level containers
    ``models_accuracy`` (testing accuracy, rounded to two decimals),
    ``models_built_time`` and ``models_prediction_time`` (seconds), each keyed
    by the model name.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator providing ``fit``, ``predict``
        and ``score``.
    X, y
        Full inputs and labels; rows are selected by the positional indices
        produced by ``KFold.split``.

    Returns
    -------
    None
        Results are printed and stored in the module-level containers.
    """
    kf = KFold(n_splits=10,random_state=42,shuffle=True)
    # Iterate the mapping that was passed in rather than the module-level global.
    for model_name, model in models.items():
        fold_test_scores=[]
        fold_train_scores=[]
        fold_build_times=[]
        fold_predict_times=[]
        for train_index, test_index in kf.split(X,y):

            # KFold yields positions, not labels, so select rows positionally.
            # Slicing happens before the timer starts so that only the fit
            # itself is measured.
            X_train, X_test = _select_rows(X, train_index), _select_rows(X, test_index)
            y_train, y_test = _select_rows(y, train_index), _select_rows(y, test_index)

            # perf_counter() is monotonic, so the difference is the true elapsed
            # time even across a minute boundary or beyond 60 seconds.
            fit_started = time.perf_counter()
            text_clf = model.fit(X_train, y_train)
            fold_build_times.append(time.perf_counter() - fit_started)

            predict_started = time.perf_counter()
            predicted = text_clf.predict(X_test)
            fold_predict_times.append(time.perf_counter() - predict_started)

            fold_test_scores.append(round(accuracy_score(y_test,predicted)*100, 2))
            fold_train_scores.append(round(text_clf.score(X_train,y_train)*100, 2))

        # Store per-fold values in the module-level containers. These are item
        # assignments, so the names are not rebound locally and the containers
        # created at module level are actually filled.
        models_accuracy[model_name]=fold_test_scores
        models_built_time[model_name]=fold_build_times
        models_prediction_time[model_name]=fold_predict_times

        testing_accuracy=statistics.mean(fold_test_scores)
        training_accuracy=statistics.mean(fold_train_scores)
        mean_build_time=statistics.mean(fold_build_times)
        mean_predict_time=statistics.mean(fold_predict_times)

        print('\n--------------------------------------\nModel {}'.format(model_name))
        print('Accuracy on Training\n')
        print('%.2f' %training_accuracy)
        print('\nAccuracy on Testing\n')
        print('%.2f' %testing_accuracy)
        print('\nTime taken to build the model is {:.2f} Seconds'.format(mean_build_time))
        print('\nTime taken to predict is {:.2f} Seconds'.format(mean_predict_time))
       



# Run the 10-fold cross-validation over every configured pipeline.
CV_model_score(
    models=list_of_models,
    X=X,
    y=y,
)


--------------------------------------
Model Logistic Regression with LBFGS solver and L2 penalty
Accuracy on Training

88.72

Accuracy on Testing

80.26

Time taken to build the model is 7.00 Seconds

Time taken to predict is 0.10 Seconds


In [0]:
# Method to plot the result

def resutls_visulization(list_of_resutls, title=None):
    """Show one box plot per model from its per-fold accuracy scores.

    Parameters
    ----------
    list_of_resutls : dict
        Maps a model display name to a sequence of accuracy values. Each entry
        becomes one box trace with all individual points drawn.
    title : str, optional
        Figure title. When omitted, a title is generated that states how many
        models are in ``list_of_resutls``, so the heading always matches the
        number of boxes drawn.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    if title is None:
        title = 'Performance of {} ML Models Using 10-Fold Cross-Validation'.format(
            len(list_of_resutls))

    fig = go.Figure()
    for model, result in list_of_resutls.items():
        fig.add_trace(go.Box(
            y=result,
            name=model,
            boxpoints='all',
            jitter=0.8,
            whiskerwidth=0.9,
            marker_size=5,
            line_width=2)
        )

    fig.update_layout(
        title=title,
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        showlegend=True)
    fig.update_yaxes(title_text="<b>Accuracy % </b>")
    fig.update_xaxes(title_text="<b>ML Model</b>")
    fig.show()

In [0]:
# Plot the per-fold testing accuracy collected during cross-validation.
resutls_visulization(list_of_resutls=models_accuracy)

Unsupported