In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [3]:
df = sns.load_dataset('titanic')

X = df[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
models = [
    ('Random Forest Classifier', RandomForestClassifier(random_state=42)),
    ('Gradient Boost Classifier', GradientBoostingClassifier(random_state=42)),
    ('XGBoost Classifier', XGBClassifier(random_state=42)),
    ('Support Vector Machine Classifier', SVC(random_state=42)),
    ('Logistic Regression Classifier', LogisticRegression(random_state=42))
]

best_model = None
best_accuracy = 0.0

for name, model in models:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        (name, model)
    ])

    scores = cross_val_score(pipeline, 
                             X_train, 
                             y_train, 
                             cv=5)
    
    mean_accuracy = scores.mean()

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print(f'{name} mean accuracy: {mean_accuracy}')
    print(f'{name} accuracy: {accuracy}')   
    print('')

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

print("Best Model: ", best_model)


Random Forest Classifier mean accuracy: 0.7991529597163399
Random Forest Classifier accuracy: 0.8379888268156425

Gradient Boost Classifier mean accuracy: 0.8061952132374668
Gradient Boost Classifier accuracy: 0.7988826815642458

XGBoost Classifier mean accuracy: 0.8076233625529401
XGBoost Classifier accuracy: 0.7932960893854749

Support Vector Machine Classifier mean accuracy: 0.8160248202501723
Support Vector Machine Classifier accuracy: 0.8044692737430168

Logistic Regression Classifier mean accuracy: 0.7977839062346105
Logistic Regression Classifier accuracy: 0.8100558659217877

Best Model:  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('Random Forest Classifier',
                 RandomForestClassifier(random_state=42))])
