In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, confusion_matrix, RocCurveDisplay
import matplotlib.pyplot as plt


In [3]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [6]:
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
y = df['survived']


In [7]:
nums_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('imputer1', SimpleImputer(strategy='mean'), ['age']),
    ('cat', Pipeline([
        ('imputer2', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first')),
    ]), cat_cols)
], remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42))
])

In [8]:
param_dist = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 5, 10, ],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__bootstrap': [True, False],
}

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Search
search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, cv=3, verbose=1, n_jobs=-1, random_state=42)

search.fit(X_train, y_train)

print("Best Parameters from RandomizedSearchCV: \n", search.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters from RandomizedSearchCV: 
 {'clf__n_estimators': 200, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 4, 'clf__max_depth': 5, 'clf__bootstrap': False}


In [11]:
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       105
           1       0.85      0.69      0.76        74

    accuracy                           0.82       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179

