In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time

import warnings

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    RepeatedStratifiedKFold,
    RandomizedSearchCV,
    GridSearchCV,
    RepeatedKFold
)

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#Prep data
X = pd.read_csv('train.csv')
X_final = pd.read_csv('test.csv')

#save passengerId for submission
pid = X_final['PassengerId']

y = X['Survived']
X.drop(['Survived'], axis=1, inplace=True)

#drop variables
#   Cabin - too many missing values
#   PassengerId - not relevant
#   Name - not relevant
#   Ticket - not relevant
X.drop(['Cabin'], axis=1, inplace=True)
X.drop(['PassengerId'], axis=1, inplace=True)
X.drop(['Name'], axis=1, inplace=True)
X.drop(['Ticket'], axis=1, inplace=True)

X_final.drop(['Cabin'], axis=1, inplace=True)
X_final.drop(['PassengerId'], axis=1, inplace=True)
X_final.drop(['Name'], axis=1, inplace=True)
X_final.drop(['Ticket'], axis=1, inplace=True)



#impute Age and Embarked
X['Age'] = X['Age'].fillna(X['Age'].mean())
X['Embarked'] = X['Embarked'].fillna('S')
X_final['Age'] = X_final['Age'].fillna(X_final['Age'].mean())
X_final['Embarked'] = X_final['Embarked'].fillna('S')

#Impute Fare - missing Fare values in final test
X_final['Fare'] = X_final['Fare'].fillna(X_final['Fare'].mean())



#Feature Engineering

#create family size continuous variable
X['family_size'] = X['Parch'] + X['SibSp']
X_final['family_size'] = X_final['Parch'] + X_final['SibSp']

bins = [0, 0.1, 3, 6, 20]
labels = ['None', 'Small', 'Medium', 'Large']

#create the categorical variable
X['family_size_cat'] = pd.cut(X['family_size'], bins=bins, labels=labels, right=False)
X_final['family_size_cat'] = pd.cut(X_final['family_size'], bins=bins, labels=labels, right=False)

#drop duplicate vars
X.drop(['SibSp'], axis=1, inplace=True)
X.drop(['Parch'], axis=1, inplace=True)
X.drop(['family_size'], axis=1, inplace=True)

X_final.drop(['SibSp'], axis=1, inplace=True)
X_final.drop(['Parch'], axis=1, inplace=True)
X_final.drop(['family_size'], axis=1, inplace=True)

#Get train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


print(X_train.info())
print(X_train.describe())
print(X_train.describe(include='object'))

print(y_train.value_counts(normalize=True))


<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 81 to 385
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Pclass           712 non-null    int64   
 1   Sex              712 non-null    object  
 2   Age              712 non-null    float64 
 3   Fare             712 non-null    float64 
 4   Embarked         712 non-null    object  
 5   family_size_cat  712 non-null    category
dtypes: category(1), float64(2), int64(1), object(2)
memory usage: 34.3+ KB
None
           Pclass         Age        Fare
count  712.000000  712.000000  712.000000
mean     2.327247   29.659892   32.197050
std      0.834178   12.860939   51.249828
min      1.000000    0.420000    0.000000
25%      2.000000   22.000000    7.895800
50%      3.000000   29.699118   13.895850
75%      3.000000   35.000000   31.068750
max      3.000000   80.000000  512.329200
         Sex Embarked
count    712      712
unique     2        3

In [6]:
#Encode categorical features
enc = OrdinalEncoder(
    handle_unknown="use_encoded_value",   # Allow unseen categories during transform
    unknown_value=-1,                     # Code for unseen categories
    encoded_missing_value=-2,             # Code for missing values (NaN)
    dtype=np.int64                        
)

#Get categorical features
cat_cols = X_train.select_dtypes(exclude=["number"]).columns.tolist()

#Encode training set, test set, and final set
X_train[cat_cols] = enc.fit_transform(X_train[cat_cols])
X_test[cat_cols] = enc.fit_transform(X_test[cat_cols])
X_final[cat_cols] = enc.fit_transform(X_final[cat_cols])

In [4]:
# Baseline Logistic Regression

# Track time
start = time.time()

# define model
baseline_logistic_regression = LogisticRegression() # leave max_iter at default (100) and ignore warning - increasing takes too long

# fit model
baseline_logistic_regression.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_logistic_regression_preds = baseline_logistic_regression.predict(X_test)
baseline_logistic_regression_report = classification_report(y_test, baseline_logistic_regression_preds)

# get accuracy scores
baseline_logistic_regression_accs = cross_val_score(baseline_logistic_regression, X_train, y_train, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("\nLogistic Regression")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_logistic_regression_accs.mean()}")
print(baseline_logistic_regression_report)


Logistic Regression
Execution Time: 00:00:02
Average cross-validation accuracy: 0.8103870777110214
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       106
           1       0.75      0.70      0.72        73

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [5]:
# Baseline Linear Support Vector Classification

# Track time
start = time.time()

# define model
baseline_lsvc = LinearSVC()

# fit model
baseline_lsvc.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_lsvc_preds = baseline_lsvc.predict(X_test)
baseline_lsvc_report = classification_report(y_test, baseline_lsvc_preds)

# get accuracy scores
baseline_lsvc_accs = cross_val_score(baseline_lsvc, X_train, y_train, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), n_jobs=-1) # lower repeats if it takes too long

end = time.time()

# print outputs
print("Linear Support Vector Classification")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_lsvc_accs.mean()}")
print(baseline_lsvc_report)

Linear Support Vector Classification
Execution Time: 00:00:00
Average cross-validation accuracy: 0.8050586033684624
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       106
           1       0.74      0.67      0.71        73

    accuracy                           0.77       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.77      0.77      0.77       179



In [6]:
# Baseline Decision Tree

# Track time
start = time.time()

# define model
baseline_decision_tree = DecisionTreeClassifier()

# fit model
baseline_decision_tree.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_decision_tree_preds = baseline_decision_tree.predict(X_test)
baseline_decision_tree_report = classification_report(y_test, baseline_decision_tree_preds)

# get accuracy scores
baseline_decision_tree_accs = cross_val_score(baseline_decision_tree, X_train, y_train, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("Decision Tree")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_decision_tree_accs.mean()}")
print(baseline_decision_tree_report)

Decision Tree
Execution Time: 00:00:00
Average cross-validation accuracy: 0.7783610755441741
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       106
           1       0.68      0.66      0.67        73

    accuracy                           0.73       179
   macro avg       0.72      0.72      0.72       179
weighted avg       0.73      0.73      0.73       179



In [7]:
# Baseline Random Forest

# Track time
start = time.time()

# define model
baseline_random_forest = RandomForestClassifier()

# fit model
baseline_random_forest.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_random_forest_preds = baseline_random_forest.predict(X_test)
baseline_random_forest_report = classification_report(y_test, baseline_random_forest_preds)

# get accuracy scores
baseline_random_forest_accs = cross_val_score(baseline_random_forest, X_train, y_train, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("Random Forest")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_random_forest_accs.mean()}")
print(baseline_random_forest_report)

Random Forest
Execution Time: 00:00:00
Average cross-validation accuracy: 0.7955244755244755
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       106
           1       0.76      0.75      0.76        73

    accuracy                           0.80       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179



In [8]:
# Baseline Gradient Boosting Classifier

# Track time
start = time.time()

# define model
baseline_gbc = GradientBoostingClassifier()

# fit model
baseline_gbc.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_gbc_preds = baseline_gbc.predict(X_test)
baseline_gbc_report = classification_report(y_test, baseline_gbc_preds)

# get accuracy scores
baseline_gbc_accs = cross_val_score(baseline_gbc, X_train, y_train, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("Gradient Boosting Classifier")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_gbc_accs.mean()}")
print(baseline_gbc_report)

Gradient Boosting Classifier
Execution Time: 00:00:00
Average cross-validation accuracy: 0.8216586230670738
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       106
           1       0.82      0.73      0.77        73

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [9]:
# Baseline K-Nearest Neighbors Classifier

# Track time
start = time.time()

# define model
baseline_knn = KNeighborsClassifier()

# fit model
baseline_knn.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_knn_preds = baseline_knn.predict(X_test)
baseline_knn_report = classification_report(y_test, baseline_knn_preds)

# get accuracy scores
baseline_knn_accs = cross_val_score(baseline_knn, X_train, y_train, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), n_jobs=-1, error_score='raise')

end = time.time()

# print outputs
print("K-Nearest Neighbors Classifier")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_knn_accs.mean()}")
print(baseline_knn_report)

K-Nearest Neighbors Classifier
Execution Time: 00:00:00
Average cross-validation accuracy: 0.7118034078597459
              precision    recall  f1-score   support

           0       0.75      0.72      0.73       106
           1       0.61      0.64      0.63        73

    accuracy                           0.69       179
   macro avg       0.68      0.68      0.68       179
weighted avg       0.69      0.69      0.69       179



In [10]:
#Logistic Regression
#Use optuna to find best hyperparameters

#filter output
warnings.filterwarnings(action='ignore', category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

#define objective
def objective(trial):

    #define hyperparameter space
    params = {
        'penalty'       : trial.suggest_categorical('penalty', [None, 'l2']),
        'solver'        : trial.suggest_categorical('solver', ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']),
        'C'             : trial.suggest_float('C', 0.25, 3),
        'max_iter'      : trial.suggest_int('max_iter', 100, 1000)
    }

    #create model
    optuna_model = LogisticRegression()
    optuna_model.set_params(**params)

    #perform cross validation
    score = cross_val_score(
        optuna_model, X_train, y_train,
        scoring='accuracy',
        n_jobs=1, #something broken about the warnings when setting n_jobs=-1
        cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5),
        error_score='raise' #show errors
    ).mean() #get mean accuracy score

    return(score)

#create optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1)

#store best values
lr_best_params = study.best_params
lr_best_cv_score = study.best_value



In [11]:
#Linear Support Vector Classifier
#Use optuna to find best hyperparameters

#filter output
warnings.filterwarnings(action='ignore', category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

#define objective
def objective(trial):

    #define hyperparameter space
    params = {
        'penalty'       : trial.suggest_categorical('penalty', ['l1', 'l2']),
        'C'             : trial.suggest_float('C', 0.25, 4),
        'max_iter'      : trial.suggest_int('max_iter', 100, 1500)
    }

    #create model
    optuna_model = LinearSVC()
    optuna_model.set_params(**params)

    #perform cross validation
    score = cross_val_score(
        optuna_model, X_train, y_train,
        scoring='accuracy',
        n_jobs=1, #something broken about the warnings when setting n_jobs=-1
        cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5),
        error_score='raise' #show errors
    ).mean() #get mean accuracy score

    return(score)

#create optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1)

#store best values
lsvc_best_params = study.best_params
lsvc_best_cv_score = study.best_value

In [12]:
#Decision Tree
#Use optuna to find best hyperparameters

#filter output
warnings.filterwarnings(action='ignore', category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

#define objective
def objective(trial):

    #define hyperparameter space
    params = {
        'criterion'         : trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter'          : trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth'         : trial.suggest_int('max_depth', 2, 100),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf'  : trial.suggest_int('min_samples_leaf', 1, 150),
        'max_features'      : trial.suggest_int('max_features', 2, 20)
    }

    #create model
    optuna_model = DecisionTreeClassifier()
    optuna_model.set_params(**params)

    #perform cross validation
    score = cross_val_score(
        optuna_model, X_train, y_train,
        scoring='accuracy',
        n_jobs=1, #something broken about the warnings when setting n_jobs=-1
        cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5),
        error_score='raise' #show errors
    ).mean() #get mean accuracy score

    return(score)

#create optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1)

#store best values
dt_best_params = study.best_params
dt_best_cv_score = study.best_value

In [13]:
#Random Forest
#Use optuna to find best hyperparameters

#filter output
warnings.filterwarnings(action='ignore', category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

#define objective
def objective(trial):

    #define hyperparameter space
    params = {
        'criterion'         : trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth'         : trial.suggest_int('max_depth', 2, 100),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf'  : trial.suggest_int('min_samples_leaf', 1, 150),
        'max_features'      : trial.suggest_int('max_features', 2, 20)
    }

    #create model
    optuna_model = RandomForestClassifier()
    optuna_model.set_params(**params)

    #perform cross validation
    score = cross_val_score(
        optuna_model, X_train, y_train,
        scoring='accuracy',
        n_jobs=1, #something broken about the warnings when setting n_jobs=-1
        cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5),
        error_score='raise' #show errors
    ).mean() #get mean accuracy score

    return(score)

#create optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1)

#store best values
rf_best_params = study.best_params
rf_best_cv_score = study.best_value


In [14]:
#Gradient Boosting Classifier
#Use optuna to find best hyperparameters

#filter output
warnings.filterwarnings(action='ignore', category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

#define objective
def objective(trial):

    #define hyperparameter space
    params = {
        'loss'              : trial.suggest_categorical('loss', ['log_loss', 'exponential']),
        'max_depth'         : trial.suggest_int('max_depth', 2, 100),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf'  : trial.suggest_int('min_samples_leaf', 1, 150),
        'n_estimators'      : trial.suggest_int('n_estimators', 2, 500)
    }

    #create model
    optuna_model = GradientBoostingClassifier()
    optuna_model.set_params(**params)

    #perform cross validation
    score = cross_val_score(
        optuna_model, X_train, y_train,
        scoring='accuracy',
        n_jobs=1, #something broken about the warnings when setting n_jobs=-1
        cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5),
        error_score='raise' #show errors
    ).mean() #get mean accuracy score

    return(score)

#create optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1)

#store best values
gbc_best_params = study.best_params
gbc_best_cv_score = study.best_value


In [15]:
#K-Nearest Neighbors Classifier
#Use optuna to find best hyperparameters

#filter output
warnings.filterwarnings(action='ignore', category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

#define objective
def objective(trial):

    #define hyperparameter space
    params = {
        'weights'           : trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm'         : trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
        'p'                 : trial.suggest_int('p', 1, 3),
        'n_neighbors'       : trial.suggest_int('n_neighbors', 2, 15)
    }

    #create model
    optuna_model = KNeighborsClassifier()
    optuna_model.set_params(**params)

    #perform cross validation
    score = cross_val_score(
        optuna_model, X_train, y_train,
        scoring='accuracy',
        n_jobs=1, #something broken about the warnings when setting n_jobs=-1
        cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5),
        error_score='raise' #show errors
    ).mean() #get mean accuracy score

    return(score)

#create optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1)

#store best values
knn_best_params = study.best_params
knn_best_cv_score = study.best_value


In [16]:
# Logistic Regression Grid Search

warnings.filterwarnings(action='ignore', category=UserWarning)

# define paramter space
lr_param_grid = {
    'penalty'       : [None, 'l2'],
    'solver'        : ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
    'C'             : [0.25, 0.5, 0.75, 1, 2, 4]
}

# define grid search
lr_grid_search = GridSearchCV(
    estimator=LogisticRegression(), 
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5),
    n_jobs=1
)

# fit grid search
lr_grid_search.fit(X_train, y_train)

# get results
lr_results_df = pd.DataFrame(lr_grid_search.cv_results_)

# display top 10 models
#lr_top10 = lr_results_df.sort_values(by='mean_test_score', ascending=True).head(10)

# store best model
lr_best_params_gs = lr_grid_search.best_params_
lr_best_cv_score_gs = lr_grid_search.best_score_


In [17]:
# Linear Support Vector Classifier Grid Search

# define paramter space
lsvc_param_grid = {
    'penalty'       : ['l1', 'l2'],
    'C'             : [0.25, 0.5, 0.75, 1, 2, 4],
    'max_iter'      : [500, 750, 1000, 1250, 1500]
}

# define grid search
lsvc_grid_search = GridSearchCV(
    estimator=LinearSVC(), 
    param_grid=lsvc_param_grid,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), 
    n_jobs=-1,
    
)

# fit grid search
lsvc_grid_search.fit(X_train, y_train)

# get results
lsvc_results_df = pd.DataFrame(lsvc_grid_search.cv_results_)

# display top 10 models
#lsvc_top10 = lsvc_results_df.sort_values(by='mean_test_score', ascending=True).head(10)

# store best model
lsvc_best_params_gs = lsvc_grid_search.best_params_
lsvc_best_cv_score_gs = lsvc_grid_search.best_score_

In [18]:
# Decision Tree Grid Search

# define paramter space
dt_param_grid = {
    'criterion'         : ['gini', 'entropy', 'log_loss'],
    'splitter'          : ['best', 'random'],
    'max_depth'         : [10, 25, 50, 100, 150, None],
    'min_samples_split' : [2, 6, 10, 15, 20],
    'min_samples_leaf'  : [1, 25, 50, 100, 150],
    'max_features'      : [20, 50, 150, 300, 500, None]
}

# define grid search
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(), 
    param_grid=dt_param_grid,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5),
    n_jobs=-1
)

# fit grid search
dt_grid_search.fit(X_train, y_train)

# get results
dt_results_df = pd.DataFrame(dt_grid_search.cv_results_)

# display top 10 models
#dt_top10 = dt_results_df.sort_values(by='mean_test_score', ascending=True).head(10)

# store best model
dt_best_params_gs = dt_grid_search.best_params_
dt_best_cv_score_gs = dt_grid_search.best_score_


In [19]:
# Random Forest Grid Search

# define paramter space
rf_param_grid = {
    'criterion'         : ['gini', 'entropy', 'log_loss'],
    'max_depth'         : [2, 4, 10, 30, 100, None],
    'min_samples_split' : [2, 6, 10, 15, 20],
    'min_samples_leaf'  : [1, 25, 50, 100, 150],
    'max_features'      : [20, 50, 150, 300, 500, None]
}

# define grid search
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(), 
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5),
    n_jobs=-1
)

# fit grid search
rf_grid_search.fit(X_train, y_train)

# get results
rf_results_df = pd.DataFrame(rf_grid_search.cv_results_)

# display top 10 models
#rf_top10 = rf_results_df.sort_values(by='mean_test_score', ascending=True).head(10)

# store best model
rf_best_params_gs = rf_grid_search.best_params_
rf_best_cv_score_gs = rf_grid_search.best_score_

In [20]:
# Gradient Boosting Classifier Grid Search

# define paramter space
gbc_param_grid = {
    'loss'              : ['log_loss', 'exponential'],
    'n_estimators'      : [50, 100, 200, 500],
    'min_samples_split' : [2, 6, 10, 15, 20],
    'min_samples_leaf'  : [1, 25, 50, 100, 150],
    'max_depth'         : [2, 3, 5, 10, 30, None]
}

# define grid search
gbc_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(), 
    param_grid=gbc_param_grid,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), 
    n_jobs=-1
)

# fit grid search
gbc_grid_search.fit(X_train, y_train)

# get results
gbc_results_df = pd.DataFrame(gbc_grid_search.cv_results_)

# display top 10 models
#gbc_top10 = gbc_results_df.sort_values(by='mean_test_score', ascending=True).head(10)

# store best model
gbc_best_params_gs = gbc_grid_search.best_params_
gbc_best_cv_score_gs = gbc_grid_search.best_score_

In [21]:
# K-Nearest Neighbors Classifier Grid Search

# define paramter space
knn_param_grid = {
    'n_neighbors'       : [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'weights'           : ['uniform', 'distance'],
    'algorithm'         : ['ball_tree', 'kd_tree', 'brute'],
    'p'                 : [1, 2]
}

# define grid search
knn_grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(), 
    param_grid=knn_param_grid,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5), 
    n_jobs=-1
)

# fit grid search
knn_grid_search.fit(X_train, y_train)

# get results
knn_results_df = pd.DataFrame(knn_grid_search.cv_results_)

# display top 10 models
#knn_top10 = knn_results_df.sort_values(by='mean_test_score', ascending=True).head(10)

# store best model
knn_best_params_gs = knn_grid_search.best_params_
knn_best_cv_score_gs = knn_grid_search.best_score_

In [22]:
# Summarize results

print("Results summary:\n")

print(f"Logistic regression baseline score: {baseline_logistic_regression_accs.mean()}")
print(f"Logistic regression optuna best score: {lr_best_cv_score}")
print(f"Logistic regression GridSearchCV best score: {lr_best_cv_score_gs}")
print(f"Logistic regression optuna best parameters: {lr_best_params}")
print(f"Logistic regression GridSearch CV best parameters: {lr_best_params_gs}")

print(f"Linear support vector classifier baseline score: {baseline_lsvc_accs.mean()}")
print(f"Linear support vector classifier optuna best score: {lsvc_best_cv_score}")
print(f"Linear support vector classifier GridSearchCV best score: {lsvc_best_cv_score_gs}")
print(f"Linear support vector classifier optuna best parameters: {lsvc_best_params}")
print(f"Linear support vector classifier GridSearch CV best parameters: {lsvc_best_params_gs}")

print(f"Decision tree baseline score: {baseline_decision_tree_accs.mean()}")
print(f"Decision tree optuna best score: {dt_best_cv_score}")
print(f"Decision tree GridSearchCV best score: {dt_best_cv_score_gs}")
print(f"Decision tree optuna best parameters: {dt_best_params}")
print(f"Decision tree GridSearch CV best parameters: {dt_best_params_gs}")

print(f"Random forest baseline score: {baseline_random_forest_accs.mean()}")
print(f"Random forest optuna best score: {rf_best_cv_score}")
print(f"Random forest GridSearchCV best score: {rf_best_cv_score_gs}")
print(f"Random forest optuna best parameters: {rf_best_params}")
print(f"Random forest GridSearch CV best parameters: {rf_best_params_gs}")

print(f"Gradient boosting classifier baseline score: {baseline_gbc_accs.mean()}")
print(f"Gradient boosting classifier optuna best score: {gbc_best_cv_score}")
print(f"Gradient boosting classifier GridSearchCV best score: {gbc_best_cv_score_gs}")
print(f"Gradient boosting classifier optuna best parameters: {gbc_best_params}")
print(f"Gradient boosting classifier GridSearch CV best parameters: {gbc_best_params_gs}")

print(f"K-nearest neighbors baseline score: {baseline_knn_accs.mean()}")
print(f"K-nearest neighbors optuna best score: {knn_best_cv_score}")
print(f"K-nearest neighbors GridSearchCV best score: {knn_best_cv_score_gs}")
print(f"K-nearest neighbors optuna best parameters: {knn_best_params}")
print(f"K-nearest neighbors GridSearch CV best parameters: {knn_best_params_gs}")



Results summary:

Logistic regression baseline score: 0.8103870777110214
Logistic regression optuna best score: 0.8151718703831381
Logistic regression GridSearchCV best score: 0.8100817492366787
Logistic regression optuna best parameters: {'penalty': 'l2', 'solver': 'newton-cg', 'C': 0.2519187354950386, 'max_iter': 806}
Logistic regression GridSearch CV best parameters: {'C': 0.75, 'penalty': 'l2', 'solver': 'lbfgs'}
Linear support vector classifier baseline score: 0.8050586033684624
Linear support vector classifier optuna best score: 0.8115158081355264
Linear support vector classifier GridSearchCV best score: 0.8087245149216981
Linear support vector classifier optuna best parameters: {'penalty': 'l2', 'C': 0.32068244899996284, 'max_iter': 913}
Linear support vector classifier GridSearch CV best parameters: {'C': 0.25, 'max_iter': 500, 'penalty': 'l2'}
Decision tree baseline score: 0.7783610755441741
Decision tree optuna best score: 0.8112124495223086
Decision tree GridSearchCV best sc

In [39]:
print(pid)

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64


In [40]:
#define final models
lr_final_model = LogisticRegression()
lr_final_model.set_params(**lr_best_params)

lsvc_final_model = LinearSVC()
lsvc_final_model.set_params(**lsvc_best_params)

dt_final_model = DecisionTreeClassifier()
dt_final_model.set_params(**dt_best_params) #not choosing grid search despite higher cv score due to overfitting concerns

rf_final_model = RandomForestClassifier()
rf_final_model.set_params(**rf_best_params)

gbc_final_model = GradientBoostingClassifier()
gbc_final_model.set_params(**gbc_best_params_gs)

knn_final_model = KNeighborsClassifier()
knn_final_model.set_params(**knn_best_params)

#fit models
lr_final_model.fit(X_train, y_train)
lsvc_final_model.fit(X_train, y_train)
dt_final_model.fit(X_train, y_train)
rf_final_model.fit(X_train, y_train)
gbc_final_model.fit(X_train, y_train)
knn_final_model.fit(X_train, y_train)

#predict on final dataset
lr_pred =   lr_final_model.predict(X_final)
lsvc_pred = lsvc_final_model.predict(X_final)
dt_pred =   dt_final_model.predict(X_final)
rf_pred =   rf_final_model.predict(X_final)
gbc_pred =  gbc_final_model.predict(X_final)
knn_pred =  knn_final_model.predict(X_final)

#define submission data
lr_data =   {'PassengerId' : pid, 'Survived' : lr_pred}
lsvc_data = {'PassengerId' : pid, 'Survived' : lsvc_pred}
dt_data =   {'PassengerId' : pid, 'Survived' : dt_pred}
rf_data =   {'PassengerId' : pid, 'Survived' : rf_pred}
gbc_data =  {'PassengerId' : pid, 'Survived' : gbc_pred}
knn_data =  {'PassengerId' : pid, 'Survived' : knn_pred}
lr_submission =     pd.DataFrame(lr_data)
lsvc_submission =   pd.DataFrame(lsvc_data)
dt_submission =     pd.DataFrame(dt_data)
rf_submission =     pd.DataFrame(rf_data)
gbc_submission =    pd.DataFrame(gbc_data)
knn_submission =    pd.DataFrame(knn_data)

lr_submission.to_csv('logistic_regression_submission.csv', index=False)
lsvc_submission.to_csv('linear_svc_submission.csv', index=False)
dt_submission.to_csv('decision_tree_submission.csv', index=False)
rf_submission.to_csv('random_forest_submission.csv', index=False)
gbc_submission.to_csv('gradient_boosting_classifier_submission.csv', index=False)
knn_submission.to_csv('knearest_neighbors_submission.csv', index=False)


In [8]:
#create voting classifier from all of the models 

lr_best_params = {'penalty': 'l2', 'solver': 'newton-cg', 'C': 0.2519187354950386, 'max_iter': 806}
lsvc_best_params = {'penalty': 'l2', 'C': 0.32068244899996284, 'max_iter': 913}
dt_best_params = {'criterion': 'gini', 'splitter': 'best', 'max_depth': 43, 'min_samples_split': 12, 'min_samples_leaf': 38, 'max_features': 10}
rf_best_params = {'criterion': 'log_loss', 'max_depth': 11, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 7}
gbc_best_params = {'loss': 'log_loss', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 100}
knn_best_params = {'weights': 'distance', 'algorithm': 'kd_tree', 'p': 1, 'n_neighbors': 14}

lr_voting_model = LogisticRegression()
lr_voting_model.set_params(**lr_best_params)

lsvc_voting_model = LinearSVC()
lsvc_voting_model.set_params(**lsvc_best_params)

dt_voting_model = DecisionTreeClassifier()
dt_voting_model.set_params(**dt_best_params) #not choosing grid search despite higher cv score due to overfitting concerns

rf_voting_model = RandomForestClassifier()
rf_voting_model.set_params(**rf_best_params)

gbc_voting_model = GradientBoostingClassifier()
gbc_voting_model.set_params(**gbc_best_params)

knn_voting_model = KNeighborsClassifier()
knn_voting_model.set_params(**knn_best_params)

#create voting classifiers
hard_voter = VotingClassifier(estimators=[('lr', lr_voting_model), 
                                     ('dt', dt_voting_model), 
                                     ('rf', rf_voting_model), 
                                     ('gbc', gbc_voting_model), 
                                     ('knn', knn_voting_model)], voting='hard')
hard_voter.fit(X_train, y_train)

soft_voter = VotingClassifier(estimators=[('lr', lr_voting_model),
                                     ('dt', dt_voting_model), 
                                     ('rf', rf_voting_model), 
                                     ('gbc', gbc_voting_model), 
                                     ('knn', knn_voting_model)], voting='soft')
soft_voter.fit(X_train, y_train)

#use voters to predict
hard_pred = hard_voter.predict(X_final)
soft_pred = soft_voter.predict(X_final)

#create submission files
hard_data = {'PassengerId' : pid, 'Survived' : hard_pred}
soft_data = {'PassengerId' : pid, 'Survived' : soft_pred}

hard_submission = pd.DataFrame(hard_data)
soft_submission = pd.DataFrame(soft_data)

hard_submission.to_csv('hard_voting_classifier_submission.csv', index=False)
soft_submission.to_csv('soft_voting_classifier_submission.csv', index=False)
