In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime,time
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer, HashingVectorizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.svm import SVC
import math
import statistics
from collections import defaultdict
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE
import re

In [20]:

data=pd.read_csv('/Users/joe/Desktop/language-models-sprint1/data/train.csv')
X = data['text']
y = data['target']


# different Vectorizers
def Hash_vec(X):
    Hvect=HashingVectorizer(lowercase=True,ngram_range=(1,1))
    X=Hvect.fit_transform(X)
    
    return X

def Count_Vec(X):
    CountV=CountVectorizer(lowercase=True)
    X=CountV.fit_transform(X)
    
    return X

def TFIDF_vec(X):
    tfidf=TfidfVectorizer(use_idf=True,lowercase=True)
    X=tfidf.fit_transform(X)
    
    return X


def overSample(X,y):
    
    oversample = SVMSMOTE(n_jobs=-1,random_state=47)
    X, y = oversample.fit_resample(X, y)
    
    return X,y



# Random Forest starting from 100 tree to 400 trees
RF100=RandomForestClassifier(n_estimators=100,n_jobs=-1,max_depth=100)
RF110=RandomForestClassifier(n_estimators=110,n_jobs=-1,max_depth=100)
RF120=RandomForestClassifier(n_estimators=120,n_jobs=-1,max_depth=100)
RF130=RandomForestClassifier(n_estimators=130,n_jobs=-1,max_depth=100)
RF140=RandomForestClassifier(n_estimators=140,n_jobs=-1,max_depth=100)
RF150=RandomForestClassifier(n_estimators=150,n_jobs=-1,max_depth=100)
RF160=RandomForestClassifier(n_estimators=160,n_jobs=-1,max_depth=100)
RF170=RandomForestClassifier(n_estimators=170,n_jobs=-1,max_depth=100)
RF180=RandomForestClassifier(n_estimators=180,n_jobs=-1,max_depth=100)
RF190=RandomForestClassifier(n_estimators=190,n_jobs=-1,max_depth=100)
RF200=RandomForestClassifier(n_estimators=200,n_jobs=-1,max_depth=100)


RF210=RandomForestClassifier(n_estimators=210,n_jobs=-1,max_depth=100)
RF220=RandomForestClassifier(n_estimators=220,n_jobs=-1,max_depth=100)
RF230=RandomForestClassifier(n_estimators=230,n_jobs=-1,max_depth=100)
RF240=RandomForestClassifier(n_estimators=240,n_jobs=-1,max_depth=100)
RF250=RandomForestClassifier(n_estimators=250,n_jobs=-1,max_depth=100)
RF260=RandomForestClassifier(n_estimators=260,n_jobs=-1,max_depth=100)
RF270=RandomForestClassifier(n_estimators=270,n_jobs=-1,max_depth=100)
RF280=RandomForestClassifier(n_estimators=280,n_jobs=-1,max_depth=100)
RF290=RandomForestClassifier(n_estimators=290,n_jobs=-1,max_depth=100)
RF300=RandomForestClassifier(n_estimators=300,n_jobs=-1,max_depth=100)
                      
RF310=RandomForestClassifier(n_estimators=310,n_jobs=-1,max_depth=100)
RF320=RandomForestClassifier(n_estimators=320,n_jobs=-1,max_depth=100)
RF330=RandomForestClassifier(n_estimators=330,n_jobs=-1,max_depth=100)
RF340=RandomForestClassifier(n_estimators=340,n_jobs=-1,max_depth=100)
RF350=RandomForestClassifier(n_estimators=350,n_jobs=-1,max_depth=100)
RF360=RandomForestClassifier(n_estimators=360,n_jobs=-1,max_depth=100)
RF370=RandomForestClassifier(n_estimators=370,n_jobs=-1,max_depth=100)
RF380=RandomForestClassifier(n_estimators=380,n_jobs=-1,max_depth=100)
RF390=RandomForestClassifier(n_estimators=390,n_jobs=-1,max_depth=100)
RF400=RandomForestClassifier(n_estimators=400,n_jobs=-1,max_depth=100)



list_of_Vectorizers = {'Hashing Vectorizer':Hash_vec,'Count Vectorizer':Count_Vec,
                       'TFIDF Vectorizer':TFIDF_vec}



list_of_models = {'Random Forest100':RF100,'Random Forest110':RF110,'Random Forest120':RF120,'Random Forest130':RF130,
                 'Random Forest140':RF140,'Random Forest150':RF150,
                 'Random Forest160':RF160,'Random Forest170':RF170,'Random Forest180':RF180,'Random Forest190':RF190,
                 'Random Forest200':RF200,'Random Forest210':RF210,
                 'Random Forest220':RF220,'Random Forest230':RF230,'Random Forest240':RF240,'Random Forest250':RF250
                 ,'Random Forest260':RF260,'Random Forest270':RF270,'Random Forest280':RF280,
                 'Random Forest290':RF290,'Random Forest300':RF300,
                 'Random Forest310':RF310,'Random Forest320':RF320,'Random Forest330':RF330,
                 'Random Forest340':RF340,'Random Forest350':RF350,'Random Forest360':RF360,'Random Forest370':RF370,
                 'Random Forest380':RF380,'Random Forest390':RF390,'Random Forest400':RF400}

models_accuracy = defaultdict()
models_built_time=defaultdict()
models_prediction_time=defaultdict()



def get_Model_results(models,vectorizers,X,y):
    
    for Vectorizer_name, vectorizer in vectorizers.items():
        print('\n...........Results for {}..........'.format(Vectorizer_name))
        X_vec=vectorizer(X)
        X_samp,y_samp=overSample(X_vec,y)

        X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=42)
        
        for model_name, model in models.items():
            start_time =  time.time()%60
            text_clf = model.fit(X_train, y_train)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_build_the_model='%.2f' %(end_time-start_time)
            else:
                time_to_build_the_model='%.2f' %(start_time-end_time)


            start_time = time.time()%60
            predicted = text_clf.predict(X_test)
            end_time= time.time()%60

            if(end_time>start_time):
                time_to_predict='%.2f' %(end_time-start_time)
            else:
                time_to_predict='%.2f' %(start_time-end_time)


            print('\n-----------------\nModel: {}'.format(model_name))
            print(confusion_matrix(y_test,predicted))
            print(classification_report(y_test,predicted))
            print('\nAccuracy on Training:\n{00:.2f} %'.format(text_clf.score(X_train,y_train)*100))

            print('\nAccuracy on Testing:\n{00:.2f} %'.format(accuracy_score(y_test,predicted)*100))

            print('\nTime taken to build the model is {} Seconds'.format(time_to_build_the_model))
            print('\nTime taken for prediction is {} Seconds'.format(time_to_predict))


get_Model_results(list_of_models,list_of_Vectorizers,X,y)





...........Results for Hashing Vectorizer..........

-----------------
Model: Random Forest100
[[1220   91]
 [ 373  922]]
              precision    recall  f1-score   support

           0       0.77      0.93      0.84      1311
           1       0.91      0.71      0.80      1295

    accuracy                           0.82      2606
   macro avg       0.84      0.82      0.82      2606
weighted avg       0.84      0.82      0.82      2606


Accuracy on Training:
95.26 %

Accuracy on Testing:
82.19 %

Time taken to build the model is 20.98 Seconds

Time taken for prediction is 0.14 Seconds

-----------------
Model: Random Forest110
[[1162  149]
 [ 367  928]]
              precision    recall  f1-score   support

           0       0.76      0.89      0.82      1311
           1       0.86      0.72      0.78      1295

    accuracy                           0.80      2606
   macro avg       0.81      0.80      0.80      2606
weighted avg       0.81      0.80      0.80      2606


