In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold

import pickle

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from yellowbrick.model_selection import feature_importances

#===============================================================================================#

# Classification Models Class

#===============================================================================================#

class Classification():
    
    """
    This class is for performing classifcation algorithms such as Logistic Regression, Decision Tree, Random Forest, and SVM.
    
    Parameters
    ----------
    model_type: 'Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM'
    the type of classifcation algorithm you would like to apply 
    
    x_train: dataframe
    the independant variables of the training data
    
    x_val: dataframe
    the independant variables of the validation data
    
    y_train: series
    the target variable of the training data
    
    y_val: series
    the target variable of the validation data
    
    """
    
    def __init__(self,model_type,x_train,x_val,y_train,y_val):

        self.model_type = model_type
        self.x_train = x_train
        self.y_train = y_train
        self.x_val = x_val
        self.y_val = y_val
        self.scores_table = pd.DataFrame()
        self.feature_importances = pd.DataFrame()
        self.name = self
        
        if self.model_type == 'Logistic Regression':
            self.technique = LogisticRegression(fit_intercept=False)
        elif self.model_type == 'Decision Tree':
            self.technique = DecisionTreeClassifier(random_state=42)
        elif self.model_type == 'Random Forest':
            self.technique = RandomForestClassifier(n_estimators=20,n_jobs=-1,random_state=42)
        elif self.model_type == 'SVM':
            self.technique = SVC()
        elif self.model_type == 'Naive Bayes':
            self.technique = GaussianNB()
        elif self.model_type == 'KNN':
            self.technique = KNeighborsClassifier(n_jobs=-1)
            
#===============================================================================================#

# Score Function

#===============================================================================================#

    def scores(self,model,x_train,x_val,y_train,y_val):
        
        """
        Gets the accuracy for the given data and creates a dataframe containing scores.
        Parameters
        ----------
        model: 'Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM'
        the type of classifcation applied
        x_train: dataframe
        the independant variables of the training data
        x_val: dataframe
        the independant variables of the validation data
        y_train: series
        the target variable of the training data
        y_val: series
        the target variable of the validation data
        
        Returns
        ----------
        scores_table: a dataframe with the model used, the train accuracy and validation accuracy
        """
        
        self.acc_train = self.best_model.score(x_train,y_train)
        self.acc_val = self.best_model.score(x_val,y_val)
        
        d = {'Model Name': [self.model_type],
             'Train Accuracy': [self.acc_train], 
             'Validation Accuracy': [self.acc_val],
             'Accuracy Difference':[self.acc_train-self.acc_val]}
        self.scores_table = pd.DataFrame(data=d)
        
        return self.scores_table


#===============================================================================================#

# Get Scores Function

#===============================================================================================#

    def get_scores(self,params,cv_type):
        
        """
        Performs a gridsearch cross validation with given hyperparameters and data.
        Gets the accuracy for the given data and creates a dataframe containing scores.
        Parameters
        ----------
        param_grid: dictionary 
        specified hyperparameters for chosen classification algorithm to be passed through gridsearch cross validation
        
        cv_type: 'skf'
        the type of cross validation split to be used for gridsearch
        """
        
        classifier = self.technique
        fit_classifier = classifier.fit(self.x_train,self.y_train)
        opt_model = GridSearchCV(fit_classifier,
                                 params,
                                 cv=cv_type,
                                 scoring='accuracy',
                                 return_train_score=True,
                                 n_jobs=-1)
        self.opt_model = opt_model.fit(self.x_train,self.y_train) 
        self.best_model = opt_model.best_estimator_
        self.scores = Classification.scores(self,self.best_model,self.x_train,self.x_val,self.y_train,self.y_val)
        self.best_params = opt_model.best_params_
        display(self.scores_table)
        if params == {}:
            pass
        else:
            print("The best hyperparameters are: ", self.best_params,'\n')
        self.y_validated = self.best_model.predict(self.x_val)
        self.classification_report = pd.DataFrame.from_dict(classification_report(self.y_val,self.y_validated,output_dict=True)).iloc[0:3,0:5]
        return self.classification_report

#===============================================================================================#

# Feature Importance Function

#===============================================================================================#
   
    def get_feature_importances(self):
        
        """
        Create a confusion matrix.
        Returns
        ----------
        feature_importances_bar : a bar chart with feature importance of given model
        """
        if (self.model_type == 'Decision Tree') or (self.model_type == 'Random Forest') or (self.model_type == 'SVM'):    
            self.feature_importances_table = pd.DataFrame(self.best_model.feature_importances_,
                                                    index = self.x_train.columns,
                                                    columns=['Importance']).sort_values('Importance',ascending =False)
            plt.figure(figsize=(9,7.5))
            self.feature_importances_bar = sns.barplot(y= self.feature_importances_table.index[:15], x= self.feature_importances_table['Importance'][:15])
            plt.show()
            return self.feature_importances_bar
        
        else:
            return print('This classification method does not have the attribute feature importance.')

#===============================================================================================#

# Confusion Matrix Function

#===============================================================================================#

    def conf_matrix(self):
        
        """
        Create a confusion matrix.
        
        Returns
        ----------
        scores_table: a confusion matrix
        """
        
        plt.figure(figsize=(9,9))
        ax = sns.heatmap(confusion_matrix(self.y_val, self.y_validated),
                         annot= True, 
                         fmt = '.4g', 
                         cbar=0,
                         xticklabels=[1,2,3,4,5],
                         yticklabels=[1,2,3,4,5])
        ax.set(xlabel='Predicted', ylabel='True')
        plt.show()



#===============================================================================================#

# Test Score Function

#===============================================================================================#

    def get_test_scores(self,X_test,y_test):
        
        """
        Gets a ROC AUC score for given data and creates a dataframe containing scores.
        Creates a ROC plot.
        
        Parameters
        ----------
        x_test: dataframe 
        independant variables of the test data
        
        y_test: dataframe 
        target variable of the test data
        """
            
        self.y_test = y_test
        self.x_test = X_test
        self.scores_table = pd.DataFrame()
        self.test_scores = Classification.scores(self,self.best_model,self.x_train,self.x_test,self.y_train,self.y_test)
        display(self.scores_table)
        self.y_tested = self.best_model.predict(self.x_test)
        self.test_classification_report = pd.DataFrame.from_dict(classification_report(self.y_test,self.y_tested,output_dict=True)).iloc[0:3,0:5]
        
        return self.test_classification_report
    
#===============================================================================================#

# Show Test Confusion Matrix Function

#===============================================================================================#

    def test_conf_matrix(self):
        
        """
        Create a confusion matrix for the test data.
        
        Returns
        ----------
        scores_table: a confusion matrix
        """
        plt.figure(figsize=(9,9))
        ax = sns.heatmap(confusion_matrix(self.y_test, self.y_tested),
                         annot= True, 
                         fmt = '.4g', 
                         cbar=0,
                         xticklabels=[1,2,3,4,5],
                         yticklabels=[1,2,3,4,5])
        ax.set(xlabel='Predicted', ylabel='True')
        plt.show()

In [7]:
import pandas as pd
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier
from xgboost.sklearn import XGBClassifier

#===============================================================================================#

# Ensemble Models Class

#===============================================================================================#

class Ensemble(Classification):
    
    """
    This class is for performing ensemble algorithms such as voting, adaboost, xgboost, or stacking.
    
    Parameters
    ----------
    ensemble_method: 'Voting', 'AdaBoost', 'XGBoost', 'Stacking'
    the type of ensemble algorithm you would like to apply
    
    estimators: list
    the classifcation models to be used by the ensemble algorithm
    
    x_train: dataframe
    the independant variables of the training data
    
    x_val: dataframe
    the independant variables of the validation data
    
    y_train: series
    the target variable of the training data
    
    y_val: series
    the target variable of the validation data
    
    """
    
    def __init__(self, ensemble_method, estimators, X_train, X_val, y_train, y_val):
        
        self.ensemble_method = ensemble_method
        self.x_train = X_train
        self.y_train = y_train
        self.x_val = X_val
        self.y_val = y_val
        self.model_type = ensemble_method
        self.scores_table = pd.DataFrame()
        
        if self.ensemble_method == "Voting":
            self.technique = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
        elif self.ensemble_method == "AdaBoost":
            self.technique = AdaBoostClassifier(estimators, algorithm='SAMME')
        elif self.ensemble_method == "XGBoost":
            self.technique = XGBClassifier(n_jobs=-1)
        elif self.ensemble_method == "Stacking":
            self.technique = StackingClassifier(estimators)

In [9]:
sns.set_context('poster')

In [17]:
x_train = pd.read_csv('/Users/morganmccorkle/Documents/Springboard/Capstones/Capstone2New/tripadvisor/x_train_data.csv')
y_train = pd.read_csv('/Users/morganmccorkle/Documents/Springboard/Capstones/Capstone2New/tripadvisor/y_train_data.csv')

In [18]:
print(x_train.shape)
print(y_train.shape)

(15368, 232)
(15368, 1)


In [19]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train['Rating'], test_size=0.25, random_state=42)

In [20]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

## Decision Trees:

In [21]:
params = {'min_samples_leaf': [3,5,10,15,30,50,100], 'max_depth': [3,4,5,6,7,8,9]}

In [23]:
dec_tree1 = Classification('Decision Tree', x_train, x_val, y_train, y_val)

In [24]:
dec_tree1.get_scores(params, skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.526288,0.480219,0.04607


The best hyperparameters are:  {'max_depth': 9, 'min_samples_leaf': 50} 



Unnamed: 0,1,2,3,4,5
precision,0.44,0.281081,0.257576,0.378408,0.543079
recall,0.349823,0.147727,0.03972,0.318641,0.786982
f1-score,0.389764,0.193669,0.068826,0.345962,0.642667


In [33]:
params = {'min_samples_leaf': [46, 48, 50, 52, 54], 'max_depth': [7,8,9,10]}

In [34]:
dec_tree2 = Classification('Decision Tree',x_train,x_val,y_train,y_val)

In [35]:
dec_tree2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.532882,0.479178,0.053705


The best hyperparameters are:  {'max_depth': 10, 'min_samples_leaf': 52} 



Unnamed: 0,1,2,3,4,5
precision,0.425439,0.272727,0.245098,0.379731,0.540326
recall,0.342756,0.136364,0.058411,0.285583,0.804734
f1-score,0.379648,0.181818,0.09434,0.325996,0.646541


## Random Forest:

In [36]:
params = {'min_samples_leaf': [3,5,10,15,30,50,100], 'max_depth': [3,5,7,9,11,13,15]}

In [37]:
rand_forest1 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [38]:
rand_forest1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.691567,0.517699,0.173868


The best hyperparameters are:  {'max_depth': 15, 'min_samples_leaf': 3} 



Unnamed: 0,1,2,3,4,5
precision,0.61194,0.507042,0.412698,0.412342,0.542118
recall,0.289753,0.102273,0.060748,0.269972,0.917751
f1-score,0.393285,0.170213,0.105906,0.326304,0.681608


In [53]:
params = {'min_samples_leaf': [1,2,3,4,5], 'max_depth': [13,14,15,16,17,18]}
rand_forest2 = Classification('Random Forest',x_train,x_val,y_train,y_val)
rand_forest2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.699289,0.507028,0.192261


The best hyperparameters are:  {'max_depth': 18, 'min_samples_leaf': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.566265,0.348485,0.362069,0.378316,0.551955
recall,0.332155,0.065341,0.049065,0.301194,0.876923
f1-score,0.418708,0.110048,0.08642,0.335378,0.677486


In [54]:
params = {'min_samples_leaf':[7,8,9,10,11,12,13,14], 'max_depth':[17,18,19,20,21,22]}
rand_forest3 = Classification('Random Forest',x_train,x_val,y_train,y_val)
rand_forest3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.695905,0.526288,0.169617


The best hyperparameters are:  {'max_depth': 21, 'min_samples_leaf': 7} 



Unnamed: 0,1,2,3,4,5
precision,0.578378,0.465753,0.41791,0.410317,0.564189
recall,0.378092,0.096591,0.065421,0.321396,0.889349
f1-score,0.457265,0.16,0.113131,0.360453,0.6904


## Logistic Regression

In [55]:
params = {'penalty':['l1','l2'], 'C':[0.01,0.05,0.1,0.5,1,5,10]}
log_reg1 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)
log_reg1.get_scores(params,skf)

        nan 0.56498392        nan 0.56507069        nan 0.56238081
        nan 0.56168674]
        nan 0.60246399        nan 0.60662852        nan 0.60955667
        nan 0.6103592 ]


Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.600035,0.572618,0.027416


The best hyperparameters are:  {'C': 1, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.541667,0.398438,0.378641,0.477022,0.669625
recall,0.5053,0.289773,0.182243,0.476584,0.80355
f1-score,0.522852,0.335526,0.246057,0.476803,0.7305


In [56]:
params = {'penalty':['l1','l2'], 'C':[0.3,0.4,0.5,0.6,0,7]}
log_reg2 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)
log_reg2.get_scores(params,skf)

        nan 0.56411628        nan        nan        nan 0.56177351]
        nan 0.60383045        nan        nan        nan 0.60994709]


Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.597258,0.568454,0.028804


The best hyperparameters are:  {'C': 0.5, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.545455,0.404255,0.38172,0.464801,0.662621
recall,0.487633,0.269886,0.165888,0.472911,0.807692
f1-score,0.514925,0.32368,0.23127,0.468821,0.728


In [57]:
params = {'penalty':['l2'], 'C':[0.25,0.26,0.27,0.28,0.29,0.30,0.31,0.32]}
log_reg3 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)
log_reg3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.595783,0.571838,0.023946


The best hyperparameters are:  {'C': 0.32, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.555102,0.422535,0.387879,0.468218,0.658421
recall,0.480565,0.255682,0.149533,0.480257,0.818935
f1-score,0.515152,0.318584,0.215852,0.474161,0.729958


## Support Vector Machines (SVM)

In [58]:
params = {'kernel':['poly'], 'degree':[2,3]}
svm1 = Classification('SVM',x_train,x_val,y_train,y_val)
svm1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.893545,0.563509,0.330036


The best hyperparameters are:  {'degree': 2, 'kernel': 'poly'} 



Unnamed: 0,1,2,3,4,5
precision,0.533582,0.352518,0.38756,0.469067,0.666168
recall,0.5053,0.278409,0.189252,0.466483,0.789941
f1-score,0.519056,0.311111,0.254317,0.467772,0.722794


In [59]:
params = {'C':[0.2,0.3,0.4], 'kernel':['linear'], 'gamma':['scale','auto']}
svm2 = Classification('SVM',x_train,x_val,y_train,y_val)
svm2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.603245,0.570796,0.032448


The best hyperparameters are:  {'C': 0.4, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.539033,0.401487,0.396947,0.465889,0.664886
recall,0.512367,0.306818,0.121495,0.476584,0.810059
f1-score,0.525362,0.347826,0.186047,0.471176,0.730328


In [60]:
params = {'C':[0.18,0.19,0.2,0.21], 'kernel':['linear'], 'gamma':['scale']}
svm3 = Classification('SVM',x_train,x_val,y_train,y_val)
svm3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.590838,0.567413,0.023425


The best hyperparameters are:  {'C': 0.21, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.578475,0.410448,0.430556,0.456597,0.650682
recall,0.45583,0.3125,0.07243,0.483012,0.818935
f1-score,0.509881,0.354839,0.124,0.469433,0.725177


## Gaussian Naive Bayes

In [61]:
params = {'var_smoothing':[1e-09,1e-06,1e-03,1e-01,1e2]}
gnb1 = Classification('Naive Bayes',x_train,x_val,y_train,y_val)
gnb1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Naive Bayes,0.51232,0.487246,0.025074


The best hyperparameters are:  {'var_smoothing': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.334432,0.213296,0.287698,0.450051,0.721944
recall,0.717314,0.21875,0.338785,0.401286,0.597633
f1-score,0.45618,0.215989,0.311159,0.424272,0.653933


In [62]:
params = {'var_smoothing':[1e-02,1e-01,1]}
gnb2 = Classification('Naive Bayes',x_train,x_val,y_train,y_val)
gnb2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Naive Bayes,0.57444,0.549714,0.024727


The best hyperparameters are:  {'var_smoothing': 1} 



Unnamed: 0,1,2,3,4,5
precision,0.397638,0.296774,0.399015,0.468883,0.68386
recall,0.713781,0.130682,0.189252,0.505051,0.729586
f1-score,0.510746,0.18146,0.256735,0.486295,0.705983


## KNN

In [63]:
params = {'n_neighbors':[5,10,50,100,200,300]}
knn1 = Classification('KNN',x_train,x_val,y_train,y_val)
knn1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.535138,0.521864,0.013274


The best hyperparameters are:  {'n_neighbors': 100} 



Unnamed: 0,1,2,3,4,5
precision,0.527027,0.291667,0.380952,0.401575,0.575348
recall,0.275618,0.039773,0.037383,0.374656,0.881065
f1-score,0.361949,0.07,0.068085,0.387648,0.69612


In [64]:
params = {'n_neighbors':[250,300,350]}
knn2 = Classification('KNN',x_train,x_val,y_train,y_val)
knn2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.518046,0.513535,0.004512


The best hyperparameters are:  {'n_neighbors': 250} 



Unnamed: 0,1,2,3,4,5
precision,0.587629,0.285714,0.5,0.386609,0.554399
recall,0.201413,0.011364,0.023364,0.328742,0.913609
f1-score,0.3,0.021858,0.044643,0.355335,0.690056


In [65]:
params = {'n_neighbors':[340,345,350,355,360]}
knn3 = Classification('KNN',x_train,x_val,y_train,y_val)
knn3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.515096,0.508069,0.007028


The best hyperparameters are:  {'n_neighbors': 340} 



Unnamed: 0,1,2,3,4,5
precision,0.6,0.25,0.533333,0.385584,0.543964
recall,0.159011,0.008523,0.018692,0.309458,0.922485
f1-score,0.251397,0.016484,0.036117,0.343352,0.684372


## Adaboost

In [66]:
params = {'learning_rate':[0.1,1,10]}
adaboost1 = Ensemble('AdaBoost',log_reg2.best_model,x_train,x_val,y_train,y_val)
adaboost1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.52759,0.516137,0.011452


The best hyperparameters are:  {'learning_rate': 1} 



Unnamed: 0,1,2,3,4,5
precision,0.493421,0.33101,0.307125,0.407656,0.659811
recall,0.530035,0.269886,0.292056,0.391185,0.702367
f1-score,0.511073,0.29734,0.299401,0.39925,0.680424


In [67]:
params = {'learning_rate':[0.01,0.05,0.1]}
adaboost2 = Ensemble('AdaBoost',log_reg2.best_model,x_train,x_val,y_train,y_val)
adaboost2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.537133,0.520302,0.016832


The best hyperparameters are:  {'learning_rate': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.916667,0.391304,0.381443,0.368536,0.678919
recall,0.038869,0.051136,0.086449,0.621671,0.743195
f1-score,0.074576,0.090452,0.140952,0.462748,0.709605


## XGBoost

In [72]:
params = {'eta':[0.001,0.005,0.1,0.5], 'min_child_weight':[1,5,10]}
xgboost1 = Ensemble('XGBoost',log_reg2.best_model,x_train,x_val,y_train,y_val)
xgboost1.get_scores(params,skf)









Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.853028,0.542426,0.310602


The best hyperparameters are:  {'eta': 0.001, 'min_child_weight': 10} 



Unnamed: 0,1,2,3,4,5
precision,0.513986,0.407725,0.356223,0.426276,0.643701
recall,0.519435,0.269886,0.193925,0.414141,0.773964
f1-score,0.516696,0.324786,0.251135,0.420121,0.702848


In [73]:
params = {'eta':[0.0001,0.0005,0.001], 'min_child_weight':[5]}
xgboost2 = Ensemble('XGBoost',log_reg1.best_model,x_train,x_val,y_train,y_val)
xgboost2.get_scores(params,skf)









Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.884001,0.545029,0.338973


The best hyperparameters are:  {'eta': 0.0001, 'min_child_weight': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.503521,0.406699,0.354839,0.437383,0.64064
recall,0.5053,0.241477,0.179907,0.429752,0.781657
f1-score,0.504409,0.30303,0.23876,0.433534,0.704158


## Comparing models

In [75]:
all_models = pd.concat([dec_tree1.scores_table,
                        rand_forest3.scores_table,
                        log_reg2.scores_table,
                        svm3.scores_table,
                        gnb1.scores_table,
                        knn3.scores_table,
                        adaboost1.scores_table,
                        xgboost1.scores_table],
                        axis=0)
                        
all_models

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.526288,0.480219,0.04607
0,Random Forest,0.695905,0.526288,0.169617
0,Logistic Regression,0.597258,0.568454,0.028804
0,SVM,0.590838,0.567413,0.023425
0,Naive Bayes,0.51232,0.487246,0.025074
0,KNN,0.515096,0.508069,0.007028
0,AdaBoost,0.52759,0.516137,0.011452
0,XGBoost,0.853028,0.542426,0.310602


In [76]:
all_models.to_csv('/Users/morganmccorkle/Documents/Springboard/Capstones/Capstone2New/tripadvisor/all_models.csv')