In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from sklearn.linear_model import SGDClassifier

In [0]:

# 4 ML models without Cross-Validation
df=pd.read_csv('/data/workspace_files/train.csv')
X = df['text']
y = df['target']



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# pipe for each model
pipe1 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC()),])

pipe2 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='poly', degree=3)),])

pipe3 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='poly', degree=4)),])

pipe4 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='poly', degree=5)),])

pipe5 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='rbf')),])

pipe6 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='sigmoid')),])


# list of thr models as pipes
list_of_models = {'SVM Classifier with Linear Kernel':pipe1,
                  'SVM Classifier with 3rd order Polynomial Kernel':pipe2, 
                  'SVM Classifier with 4th order Polynomial Kernel':pipe3, 
                  'SVM Classifier with 5th order Polynomial Kernel':pipe4, 
                  'SVM Classifier with RBF Kernel':pipe5, 
                  'SVM Classifier with Sigmoid Kernel':pipe6}


def get_Model_results(models,X_train,y_train,X_test,y_test):
  
    for model_name, model in list_of_models.items():
        
        start_time =  time.time()%60
        text_clf = model.fit(X_train, y_train)
        end_time= time.time()%60
        
        if(end_time>start_time):
            time_to_build_the_model='%.2f' %(end_time-start_time)
        else:
            time_to_build_the_model='%.2f' %(start_time-end_time)
                
        
        start_time = time.time()%60
        predicted = text_clf.predict(X_test)
        end_time= time.time()%60
        
        if(end_time>start_time):
            time_to_predict='%.2f' %(end_time-start_time)
        else:
            time_to_predict='%.2f' %(start_time-end_time)
                
        
        print('\n................................................\nModel {}'.format(model_name))
        print(confusion_matrix(y_test,predicted))
        print(classification_report(y_test,predicted))
        print('\nAccuracy on Training:\n{00:.2f}'.format(text_clf.score(X_train,y_train)*100))
    
        print('\nAccuracy on Testing:\n{00:.2f}'.format(accuracy_score(y_test,predicted)*100))
       
        print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
        print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



get_Model_results(list_of_models,X_train,y_train,X_test,y_test)



................................................
Model SVM Classifier with Linear Kernel
[[1185  133]
 [ 294  672]]
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1318
           1       0.83      0.70      0.76       966

    accuracy                           0.81      2284
   macro avg       0.82      0.80      0.80      2284
weighted avg       0.82      0.81      0.81      2284


Accuracy on Training:
97.30

Accuracy on Testing:
81.30

Time taken to build the model is 5.26 Seconds

Time taken for prediction is 1.71 Seconds

................................................
Model SVM Classifier with 3rd order Polynomial Kernel
[[1295   23]
 [ 654  312]]
              precision    recall  f1-score   support

           0       0.66      0.98      0.79      1318
           1       0.93      0.32      0.48       966

    accuracy                           0.70      2284
   macro avg       0.80      0.65      0.64      2284
weight

In [0]:

from collections import defaultdict
# Same 4 models, but with 10-Fold Cross Validation including building and testing time of the model 

df=pd.read_csv('/data/workspace_files/train.csv')
X = df['text']
y = df['target']



# pipe for each model
pipe1 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC()),])

pipe2 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='poly', degree=3)),])

pipe3 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='poly', degree=4)),])

pipe4 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='poly', degree=5)),])

pipe5 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='rbf')),])

pipe6 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SVC(kernel='sigmoid')),])


# list of thr models as pipes
list_of_models = {'SVM Classifier with Linear Kernel':pipe1,
                  'SVM Classifier with 3rd order Polynomial Kernel':pipe2, 
                  'SVM Classifier with 4th order Polynomial Kernel':pipe3, 
                  'SVM Classifier with 5th order Polynomial Kernel':pipe4, 
                  'SVM Classifier with RBF Kernel':pipe5, 
                  'SVM Classifier with Sigmoid Kernel':pipe6}

models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def CV_model_score(models,X,y):
    kf = KFold(n_splits=10,random_state=42,shuffle=True)
    for model_name, model in list_of_models.items():
        single_modele_scores=[]
        single_modele_build_time=[]
        single_modele_predict_time=[]
        single_modele_scores_for_training=[]
        for train_index, test_index in kf.split(X,y):
         
            
            start_time =  time.time()%60
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60
            if(end_time>start_time):
                 time_to_build_the_model=float('%.2f' %(end_time-start_time))
            else:
                time_to_build_the_model=float('%.2f' %(start_time-end_time))
            
            single_modele_build_time.append(time_to_build_the_model)
            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60
            if(end_time>start_time):
                time_to_predict=float('%.2f' %(end_time-start_time))
            else:
                time_to_predict=float('%.2f' %(start_time-end_time))
            single_modele_predict_time.append(time_to_predict)
            
            testing_accuracy=float('%.2f' %(accuracy_score(y_test,predicted)*100))
            single_modele_scores.append(testing_accuracy)
            
            training_accuracy=float('%.2f' %(text_clf.score(X_train,y_train)*100))
            single_modele_scores_for_training.append(training_accuracy)
        
        models_accuracy[model_name]=single_modele_scores
        testing_accuracy=statistics.mean(single_modele_scores)
        training_accuracy=statistics.mean(single_modele_scores_for_training)
        models_built_time=statistics.mean(single_modele_build_time)
        models_prediction_time=statistics.mean(single_modele_predict_time)
        
        print('\n--------------------------------------\nModel {}'.format(model_name))
        print('Accuracy on Training\n')
        print('%.2f' %training_accuracy)
        print('\nAccuracy on Testing\n')
        print('%.2f' %testing_accuracy)
        print('\nTime taken to build the model is {00:.2f} Seconds'.format(models_built_time))
        print('\nTime taken to predict is {00:.2f} Seconds'.format(models_prediction_time))
       



CV_model_score(list_of_models,X,y)


--------------------------------------
Model SVM Classifier with Linear Kernel
Accuracy on Training

96.83

Accuracy on Testing

80.80

Time taken to build the model is 12.84 Seconds

Time taken to predict is 0.71 Seconds

--------------------------------------
Model SVM Classifier with 3rd order Polynomial Kernel
Accuracy on Training

98.93

Accuracy on Testing

71.18

Time taken to build the model is 25.93 Seconds

Time taken to predict is 7.65 Seconds

--------------------------------------
Model SVM Classifier with 4th order Polynomial Kernel
Accuracy on Training

99.08

Accuracy on Testing

68.92

Time taken to build the model is 29.91 Seconds

Time taken to predict is 2.52 Seconds

--------------------------------------
Model SVM Classifier with 5th order Polynomial Kernel
Accuracy on Training

99.22

Accuracy on Testing

68.00

Time taken to build the model is 29.35 Seconds

Time taken to predict is 2.21 Seconds

--------------------------------------
Model SVM Classifier with 

In [0]:
# Method to plot the result

def resutls_visulization(list_of_resutls):
   
    model_names = list(list_of_resutls.keys())
    results = [list_of_resutls[model] for model in model_names]
    fig = go.Figure()
    for model, result in zip(model_names, results):
        fig.add_trace(go.Box(
            y=result,
            name=model,
            boxpoints='all',
            jitter=0.8,
            whiskerwidth=0.9,
            marker_size=5,
            line_width=2)
        )
        
    
    fig.update_layout(
    title='Performance of 4 ML Models Using 10-Fold Cross-Validation',
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=True)
    fig.update_yaxes(title_text="<b>Accuracy % </b>")
    fig.update_xaxes(title_text="<b>ML Model</b>")
    fig.show()

In [0]:
resutls_visulization(models_accuracy)

Unsupported