In [962]:
import datetime
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [870]:

# 10-fold CV for 4 models
df = pd.read_csv('/Users/joe/Desktop/language-models-sprint1/data/train.csv')
X, y = df['text'], df['target']

# All four candidates share the same text front end (token counts followed by
# TF-IDF weighting); only the final 'clf' step differs between pipelines.
pipe1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LogisticRegression()),
])

pipe2 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', RandomForestClassifier(
        n_estimators=14,
        max_depth=100,
        ccp_alpha=0.0001,
        random_state=42,
        criterion='entropy',
    )),
])

pipe3 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SVC()),
])

pipe4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])

# Display name -> pipeline; the keys are what gets printed for each model.
list_of_models = {
    'Logisitc Regression': pipe1,
    'Random_Forest': pipe2,
    'Support_Vecotr_Machine': pipe3,
    'Naive_Bayes': pipe4,
}

# Filled by the evaluation loop: model name -> per-fold accuracy scores.
models_accuracy = defaultdict()


def CV_model_accuracy(list_of_models,X,y):
    """Score a single estimator with shuffled 10-fold cross-validation.

    Parameters
    ----------
    list_of_models : estimator
        The one estimator (e.g. a text-classification pipeline) to evaluate.
        Despite its name this is a single model, not a collection; the name is
        kept so existing keyword callers keep working.
    X : array-like
        Input samples handed unchanged to ``cross_val_score``.
    y : array-like
        Target labels handed unchanged to ``cross_val_score``.

    Returns
    -------
    The per-fold accuracy scores returned by ``cross_val_score``.

    Raises
    ------
    Any exception raised while fitting or scoring a fold, because
    ``error_score='raise'`` is requested.
    """
    # Use the argument that was passed in. Previously the body read a global
    # named `model`, which only existed because the calling loop leaked it.
    estimator = list_of_models
    cross_validation = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracies = cross_val_score(
        estimator,
        X,
        y,
        scoring='accuracy',
        cv=cross_validation,
        verbose=0,
        n_jobs=-1,
        error_score='raise',
    )
    return accuracies


# Cross-validate each pipeline in turn, report its mean accuracy as a
# percentage, and keep the raw per-fold scores keyed by model name.
for name, model in list_of_models.items():
    print('Model: {}'.format(name))
    accuracies = CV_model_accuracy(model, X, y)
    models_accuracy[name] = accuracies
    mean_accuracy_pct = accuracies.mean() * 100
    accuracy = '%.2f' % mean_accuracy_pct
    print("Mean accuracy is {}\n".format(accuracy))
    
    



Model: Logisitc Regression
Mean accuracy is 80.26

Model: Random_Forest
Mean accuracy is 76.62

Model: Support_Vecotr_Machine
Mean accuracy is 80.80

Model: Naive_Bayes
Mean accuracy is 79.80



In [831]:
# Method to plot the result

def resutls_visulization(list_of_resutls,
                         title='Performance of 4 ML Models Using 10-Fold Cross-Validation'):
    """Show one box plot per model from a mapping of model name to scores.

    Parameters
    ----------
    list_of_resutls : mapping
        Model name -> sequence of scores. One box is drawn per key, in the
        mapping's iteration order.
    title : str, optional
        Figure title. Defaults to the original fixed title, so existing calls
        render exactly as before; pass a different string when the number of
        models or folds differs.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    for model_name, scores in list_of_resutls.items():
        fig.add_trace(go.Box(
            y=scores,
            name=model_name,
            boxpoints='all',   # draw every individual score next to its box
            jitter=0.8,
            whiskerwidth=0.9,
            marker_size=5,
            line_width=2,
        ))

    fig.update_layout(
        title=title,
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        showlegend=True,
    )
    # NOTE(review): the axis says "%", but scores coming straight from
    # cross_val_score would be fractions in [0, 1] -- confirm what callers pass.
    fig.update_yaxes(title_text="<b>Accuracy % </b>")
    fig.update_xaxes(title_text="<b>ML Model</b>")
    fig.show()





In [849]:
# Draw one box per model from the accuracy results held in models_accuracy.
resutls_visulization(models_accuracy)

In [818]:

# Hold-out evaluation with model build/prediction timing and per-model reports.
# NOTE: despite using KFold this is NOT a full cross-validation: exactly one of
# the 8 shuffled splits (the last one generated) is kept as the train/test
# partition, so every model is fitted and scored once.
df = pd.read_csv('/Users/joe/Desktop/language-models-sprint1/data/train.csv')
X = df['text']
y = df['target']

kf = KFold(n_splits=8, random_state=42, shuffle=True)

# KFold.split yields positional row indices, so rows are selected with .iloc;
# plain X[idx] is label-based and only matches for a default 0..n-1 index.
# Taking the last split reproduces what the earlier overwrite-in-a-loop kept.
train_index, test_index = list(kf.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]


# Same text front end for every candidate (token counts -> TF-IDF weights);
# only the final classifier differs.
pipe1 = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer(use_idf=True)),
                  ('clf', LogisticRegression()),])
pipe2 = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer(use_idf=True)),
                  ('clf', RandomForestClassifier(n_estimators=14, max_depth=100, ccp_alpha=0.0001,
                                                 random_state=42, criterion='entropy')),])

pipe3 = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer(use_idf=True)),
                  ('clf', SVC()),])
pipe4 = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer(use_idf=True)),
                  ('clf', MultinomialNB()),])


# Result stores keyed by model name; values are strings formatted to 2 decimals.
models_accuracy = defaultdict()
models_built_time = defaultdict()
models_prediction_time = defaultdict()

list_of_models = {'Logisitc Regression': pipe1, 'Random_Forest': pipe2, 'Support_Vecotr_Machine': pipe3,
                  'Naive_Bayes': pipe4}


def CV_model_score(models,X_train,y_train,X_test,y_test):
    """Fit, time and evaluate every model in ``models`` on one train/test split.

    For each ``name -> estimator`` pair the estimator is fitted on the training
    data and used to predict the test data. The confusion matrix, the
    classification report, training and testing accuracy (as percentages) and
    both elapsed times are printed.

    Results are also written to the module-level dicts ``models_built_time``,
    ``models_prediction_time`` and ``models_accuracy``, keyed by model name,
    as strings formatted with two decimals.

    Parameters
    ----------
    models : mapping
        Model name -> estimator exposing ``fit``, ``predict`` and ``score``.
    X_train, y_train :
        Training samples and labels.
    X_test, y_test :
        Held-out samples and labels.

    Returns
    -------
    None
    """
    # Iterate the mapping that was passed in. Previously the loop read the
    # global `list_of_models` and silently ignored the `models` argument.
    for model_name, model in models.items():
        # perf_counter() is a monotonic clock. The earlier `time.time() % 60`
        # wrapped at every minute boundary, so a measurement that crossed one
        # came out negative or otherwise wrong.
        fit_started = time.perf_counter()
        text_clf = model.fit(X_train, y_train)
        time_to_build_the_model = '%.2f' % (time.perf_counter() - fit_started)
        models_built_time[model_name] = time_to_build_the_model

        predict_started = time.perf_counter()
        predicted = text_clf.predict(X_test)
        time_to_predict = '%.2f' % (time.perf_counter() - predict_started)
        models_prediction_time[model_name] = time_to_predict

        print('Model {}'.format(model_name))
        print(confusion_matrix(y_test,predicted))
        print(classification_report(y_test,predicted))

        print('Accuracy on Training\n')
        training_accuracy = '%.2f' % (text_clf.score(X_train, y_train) * 100)
        print(training_accuracy)

        print('\nAccuracy on Testing\n')
        testing_accuracy = '%.2f' % (accuracy_score(y_test, predicted) * 100)
        models_accuracy[model_name] = testing_accuracy
        print(testing_accuracy)

        print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
        print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))



# Fit and evaluate every pipeline on the train/test split prepared above; the
# reports are printed and the timings/accuracies land in the module-level dicts.
CV_model_score(list_of_models,X_train,y_train,X_test,y_test)


Model Logisitc Regression
[[464  59]
 [125 303]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.83       523
           1       0.84      0.71      0.77       428

    accuracy                           0.81       951
   macro avg       0.81      0.80      0.80       951
weighted avg       0.81      0.81      0.80       951

Accuracy on Training

88.85

Accuracy on Testing

80.65

Time taken to build the model is 0.31 Seconds

Time taken for prediction is 0.01 Seconds
Model Random_Forest
[[484  39]
 [171 257]]
              precision    recall  f1-score   support

           0       0.74      0.93      0.82       523
           1       0.87      0.60      0.71       428

    accuracy                           0.78       951
   macro avg       0.80      0.76      0.77       951
weighted avg       0.80      0.78      0.77       951

Accuracy on Training

91.31

Accuracy on Testing

77.92

Time taken to build the model is 0.75 Seconds

Time