In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer, HashingVectorizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from collections import defaultdict
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE
import re


In [39]:

data=pd.read_csv('/Users/joe/Desktop/language-models-sprint1/data/train.csv')
X = data['text']
y = data['target']


# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    
    return X,y



# ML methods
LG=LogisticRegression(solver='liblinear')
RF=RandomForestClassifier(n_estimators=265,max_depth=85,ccp_alpha=0.0001
                          ,criterion='entropy',n_jobs=-1)

SVM=SVC(kernel='linear')

SGD=SGDClassifier(n_jobs=-1,loss='hinge',learning_rate='adaptive',eta0=0.4,early_stopping=True)

NB=BernoulliNB()
          
list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}

list_of_models = {'Logisitc Regression':LG,'Random Forest':RF,'Support Vector_Machine':SVM,'Naive Bayes':NB,
                  'SGD':SGD}


models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def get_Model_results(models,vectorizers,X,y):
    
    for Vectorizer_name, vectorizer in vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)

        X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=42)
        
        for model_name, model in models.items():
            start_time =  time.time()%60
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_build_the_model='%.2f' %(end_time-start_time)
            else:
                time_to_build_the_model='%.2f' %(start_time-end_time)


            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_predict='%.2f' %(end_time-start_time)
            else:
                time_to_predict='%.2f' %(start_time-end_time)


            print('\n-----------------\nModel: {}'.format(model_name))
            print(confusion_matrix(y_test,predicted))
            print(classification_report(y_test,predicted))
            print('\nAccuracy on Training:\n{00:.2f} %'.format(text_clf.score(X_train,y_train)*100))

            print('\nAccuracy on Testing:\n{00:.2f} %'.format(accuracy_score(y_test,predicted)*100))

            print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
            print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



get_Model_results(list_of_models,list_of_Vectorizers,X,y)




...........Results for Hashing Vectorizer..........

-----------------
Model: Logisitc Regression
[[1073  238]
 [ 288 1007]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      1311
           1       0.81      0.78      0.79      1295

    accuracy                           0.80      2606
   macro avg       0.80      0.80      0.80      2606
weighted avg       0.80      0.80      0.80      2606


Accuracy on Training:
86.62 %

Accuracy on Testing:
79.82 %

Time taken to build the model is 0.40 Seconds

Time taken for prediction is 0.00 Seconds

-----------------
Model: Random Forest
[[1200  111]
 [ 369  926]]
              precision    recall  f1-score   support

           0       0.76      0.92      0.83      1311
           1       0.89      0.72      0.79      1295

    accuracy                           0.82      2606
   macro avg       0.83      0.82      0.81      2606
weighted avg       0.83      0.82      0.81      2606


A

In [44]:
#Over Sample is applied here and the code prodcues results of each method using all vectorizers
# Same 5 models, but with 10-Fold Cross Validation including building and testing time of the model  

df=pd.read_csv('/Users/joe/Desktop/language-models-sprint1/data/train.csv')
X = df['text']
y = df['target']



# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    return X,y



# ML methods
LG=LogisticRegression(solver='liblinear')

RF=RandomForestClassifier(n_estimators=265,max_depth=60,ccp_alpha=0.0001,criterion='entropy',n_jobs=-1)

SVM=SVC(kernel='linear')

SGD=SGDClassifier(n_jobs=-1,loss='hinge',learning_rate='adaptive',eta0=0.4,early_stopping=True)

NB=BernoulliNB()


models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()


          
list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}

list_of_models = {'Logisitc Regression':LG,'Random Forest':RF,'Support Vector Machine':SVM,'Naive Bayes':NB,
                  'SGD':SGD}




def CV_model_score(models,Vectorizers,X,y):
    kf = KFold(n_splits=10,random_state=42,shuffle=True)
    for Vectorizer_name, vectorizer in Vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)
        for model_name, model in models.items():
            single_modele_scores=[]
            single_modele_build_time=[]
            single_modele_predict_time=[]
            single_modele_scores_for_training=[]
            for train_index, test_index in kf.split(X_samp,y_samp):

                start_time =  time.time()%60
                X_train, X_test = X_samp[train_index], X_samp[test_index]
                y_train, y_test = y_samp[train_index], y_samp[test_index]
                text_clf = model.fit(X_train, y_train)
                end_time= time.time()%60
                if(end_time>start_time):
                     time_to_build_the_model=float('%.2f' %(end_time-start_time))
                else:
                    time_to_build_the_model=float('%.2f' %(start_time-end_time))

                single_modele_build_time.append(time_to_build_the_model)
                start_time = time.time()%60
                predicted = text_clf.predict(X_test)
                end_time= time.time()%60
                if(end_time>start_time):
                    time_to_predict=float('%.2f' %(end_time-start_time))
                else:
                    time_to_predict=float('%.2f' %(start_time-end_time))
                single_modele_predict_time.append(time_to_predict)

                testing_accuracy=float('%.2f' %(accuracy_score(y_test,predicted)*100))
                single_modele_scores.append(testing_accuracy)

                training_accuracy=float('%.2f' %(text_clf.score(X_train,y_train)*100))
                single_modele_scores_for_training.append(training_accuracy)

            models_accuracy[model_name+" "+Vectorizer_name]=single_modele_scores
            testing_accuracy=statistics.mean(single_modele_scores)
            training_accuracy=statistics.mean(single_modele_scores_for_training)
            models_built_time=statistics.mean(single_modele_build_time)
            models_prediction_time=statistics.mean(single_modele_predict_time)

            print('\n--------------------------------------\nModel: {}'.format(model_name))
            print('\nMean Accuracy on Training\n')
            print('%.2f' %training_accuracy)
            print('\nMean Accuracy on Testing\n')
            print('%.2f' %testing_accuracy)
            print('\n Average Time taken to build the model is {00:.2f} Seconds'.format(models_built_time))
            print('\n Average Time taken to predict is {00:.2f} Seconds'.format(models_prediction_time))
       


CV_model_score(list_of_models,list_of_Vectorizers,X,y)



...........Results for Hashing Vectorizer..........

--------------------------------------
Model: Logisitc Regression

Mean Accuracy on Training

86.75

Mean Accuracy on Testing

80.73

 Average Time taken to build the model is 0.48 Seconds

 Average Time taken to predict is 0.00 Seconds

--------------------------------------
Model: Random Forest

Mean Accuracy on Training

91.37

Mean Accuracy on Testing

81.52

 Average Time taken to build the model is 29.98 Seconds

 Average Time taken to predict is 0.17 Seconds

--------------------------------------
Model: Support Vector Machine

Mean Accuracy on Training

91.08

Mean Accuracy on Testing

82.89

 Average Time taken to build the model is 15.91 Seconds

 Average Time taken to predict is 0.61 Seconds

--------------------------------------
Model: Naive Bayes

Mean Accuracy on Training

85.03

Mean Accuracy on Testing

76.61

 Average Time taken to build the model is 0.04 Seconds

 Average Time taken to predict is 0.05 Seconds

---

In [40]:
# Method to plot the result

def resutls_visulization(list_of_resutls):
   
    model_names = list(list_of_resutls.keys())
    results = [list_of_resutls[model] for model in model_names]
    fig = go.Figure()
    for model, result in zip(model_names, results):
        fig.add_trace(go.Box(
            y=result,
            name=model,
            boxpoints='all',
            jitter=0.8,
            whiskerwidth=0.9,
            marker_size=5,
            line_width=2)
        )
        
    
    fig.update_layout(
    title='Performance of 5 ML Models Using 10-Fold Cross-Validation Using 3 Victorizers',
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False)
    fig.update_yaxes(title_text="<b> Folds Accuracy % </b>")
    fig.update_xaxes(title_text="<b>ML Model and Type of Applied Victorizer </b>")
    fig.show()


In [45]:
resutls_visulization(models_accuracy)