In [5]:
# Base learners (i.e. individual models) comprise an 'ensemble' of models
    # bagging (Bootstrap AGGregatING) 
        # "bootstrapping" means sampling with replacement. Each base learner samples from the same data w/ replacement.
    # predictions of all individual models are *aggregated* to make an average (regression) or mode (classif.)
# "Random Forest" is when bagging is used with decision trees, randomizing the features used to 
    # train each base learner. The goal is to make each base learner as different from each other as possible.
    # (this is in attempt to prevent a given base learner from making the same mistakes as others)

    # ^ can still use GridSearchCV with random forest

# "%%time" is a cell magic ('magics') built into IPython/Jupyter (not Python itself); placed on the first line of a cell, it is useful for measuring cell run time
# "pickling" saves the fit model to a specified location and can be quickly read back in

In [6]:
import pickle as pkl

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer

In [7]:
# Read the airline customer-satisfaction survey into a DataFrame.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
csv_path = '/Users/micahevalt/Downloads/Invistico_Airline(1).csv'
df = pd.read_csv(csv_path)

In [8]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [9]:
# Per-column summary of the raw data: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   Customer Type                      129880 non-null  object 
 2   Age                                129880 non-null  int64  
 3   Type of Travel                     129880 non-null  object 
 4   Class                              129880 non-null  object 
 5   Flight Distance                    129880 non-null  int64  
 6   Seat comfort                       129880 non-null  int64  
 7   Departure/Arrival time convenient  129880 non-null  int64  
 8   Food and drink                     129880 non-null  int64  
 9   Gate location                      129880 non-null  int64  
 10  Inflight wifi service              129880 non-null  int64  
 11  Inflight entertainment             1298

In [10]:
# (rows, columns) of the raw data
df.shape

(129880, 22)

In [26]:
# Count the rows that have at least one missing value
# (nothing is dropped in this cell; this only measures how many rows would be)
rows_with_na = df.isna().any(axis='columns')
rows_with_na.sum()

393

In [27]:
# Keep only complete rows: drop every row that has any missing value.
# The original index labels are kept (no reset).
df_sub = df.dropna(axis=0, how='any')

In [28]:
# One-hot encode the categorical predictors.
# For a random forest, drop_first can stay at its default (False): every
# category gets its own indicator column (means extra columns). The target
# ('satisfaction') is not encoded yet.
#
# Encode from `df.dropna()` rather than from `df_sub` itself: once encoded,
# `df_sub` no longer contains these three columns, so a cell that reads and
# overwrites `df_sub` raises a KeyError when it is run a second time.
df_sub = pd.get_dummies(
    df.dropna(),
    columns=['Customer Type', 'Type of Travel', 'Class'],
    dtype='float',
)

In [35]:
# (rows, columns) after dropping incomplete rows and one-hot encoding
df_sub.shape

(129487, 26)

In [36]:
# Per-column summary of the cleaned, encoded data: non-null counts and dtypes
df_sub.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129487 entries, 0 to 129879
Data columns (total 26 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129487 non-null  object 
 1   Age                                129487 non-null  int64  
 2   Flight Distance                    129487 non-null  int64  
 3   Seat comfort                       129487 non-null  int64  
 4   Departure/Arrival time convenient  129487 non-null  int64  
 5   Food and drink                     129487 non-null  int64  
 6   Gate location                      129487 non-null  int64  
 7   Inflight wifi service              129487 non-null  int64  
 8   Inflight entertainment             129487 non-null  int64  
 9   Online support                     129487 non-null  int64  
 10  Ease of Online booking             129487 non-null  int64  
 11  On-board service                   129487 no

In [37]:
# Separate the target (y) from the predictors (x)
target_col = 'satisfaction'

y = df_sub[target_col]

# drop() returns a new frame, so df_sub itself keeps the target column
x = df_sub.drop(columns=[target_col])

In [38]:
# Show predictor and target dimensions, one per line
for part in (x, y):
    print(part.shape)

(129487, 25)
(129487,)


In [39]:
# Two-stage hold-out, using the same fraction and seed for both stages
split_kwargs = dict(test_size=0.25, random_state=0)

# Stage 1: reserve 25% of all rows as the final test set
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_kwargs)

# Stage 2: from the remaining 75%, set aside 25% as the validation set
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_kwargs)

In [40]:
# Hyperparameter grid to search: 2 * 2 * 2 * 2 * 1 * 2 = 32 combinations.
# NOTE(review): min_samples_leaf mixes a float (0.5) with an int (1); as far as
# I know scikit-learn reads a float as a fraction of the samples and an int as
# an absolute count, which would make 0.5 a very restrictive setting. Confirm
# this is intended.
cv_params = dict(
    n_estimators=[50, 100],
    max_depth=[10, 50],
    min_samples_leaf=[0.5, 1],
    min_samples_split=[0.001, 0.01],
    max_features=['sqrt'],
    max_samples=[0.5, 0.9],
)

In [41]:
# Build the fold labels that PredefinedSplit expects, one per row of x_train:
#    0 -> the row is in x_val (held out for validation)
#   -1 -> the row is not in x_val (used for training only)
# Index.isin does the membership test for all rows at once, and the loop
# variable no longer reuses the name `x`, which is the feature frame.
in_validation = x_train.index.isin(x_val.index)
split_index = [0 if held_out else -1 for held_out in in_validation]

custom_split = PredefinedSplit(split_index)

In [42]:
# Instantiate the base random forest classifier with a fixed random_state;
# the hyperparameters to tune are supplied separately through cv_params

rf = RandomForestClassifier(random_state = 0)

In [43]:
# Grid search over every hyperparameter combination in cv_params, validated on
# the single predefined fold in custom_split.
#
# A `scoring` argument is required for refit="f1" to mean anything: without it
# GridSearchCV scores with the estimator's default (accuracy) and only treats
# the string as "refit = yes", so F1 is never computed. The labels are strings,
# so the scorer also has to be told which class is the positive one.
f1_satisfied = make_scorer(f1_score, pos_label='satisfied')

rf_val = GridSearchCV(
    rf,
    cv_params,
    scoring={'f1': f1_satisfied},
    cv=custom_split,
    refit='f1',   # select and refit the candidate with the best validation F1
    n_jobs=-1,
    verbose=1,
)

In [45]:
%%time

# Run the grid search: every candidate in cv_params is fit and scored using
# the split defined by custom_split. %%time reports how long this cell takes.

rf_val.fit(x_train, y_train)

Fitting 1 folds for each of 32 candidates, totalling 32 fits
CPU times: user 6.83 s, sys: 436 ms, total: 7.27 s
Wall time: 1min 2s


In [47]:
# Hyperparameter combination that ranked best in the grid search
rf_val.best_params_

{'max_depth': 50,
 'max_features': 'sqrt',
 'max_samples': 0.9,
 'min_samples_leaf': 1,
 'min_samples_split': 0.001,
 'n_estimators': 50}

In [49]:
# Instantiate a fresh random forest with the best hyperparameters found by the
# grid search. They are read from rf_val.best_params_ rather than re-typed from
# the printed output, so this model stays in sync if the grid, the data or the
# scoring ever changes. random_state is not part of the grid, so it is set here.
rf_opt = RandomForestClassifier(**rf_val.best_params_, random_state=0)

In [50]:
# Fit the tuned model on the full training split (x_train includes the rows
# that were set aside as x_val)

rf_opt.fit(x_train, y_train)

In [51]:
# Predict labels for the held-out test set; the metric cells compare these to y_test

y_pred = rf_opt.predict(x_test)

In [55]:
# Test-set precision, treating 'satisfied' as the positive class
precision_template = 'Precision: {pc:.3f}'
pc_test = precision_score(y_true=y_test, y_pred=y_pred, pos_label='satisfied')
print(precision_template.format(pc=pc_test))

Precision: 0.950


In [56]:
# Test-set recall, treating 'satisfied' as the positive class
recall_template = 'Recall: {rc:.3f}'
rc_test = recall_score(y_true=y_test, y_pred=y_pred, pos_label='satisfied')
print(recall_template.format(rc=rc_test))

Recall: 0.945


In [58]:
# Test-set accuracy: share of test rows whose predicted label matches y_test
accuracy_template = 'Accuracy: {ac:.3f}'
ac_test = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy_template.format(ac=ac_test))

Accuracy: 0.942


In [59]:
# Test-set F1, treating 'satisfied' as the positive class
f1_template = 'F1: {f1:.3f}'
f1_test = f1_score(y_true=y_test, y_pred=y_pred, pos_label='satisfied')
print(f1_template.format(f1=f1_test))

F1: 0.947
