<a href="https://www.kaggle.com/code/ducklingming/v5-randomizedsearch-w-randomforestclassifier?scriptVersionId=131452178" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

## Config

In [3]:

set_config(transform_output="pandas")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')

## Constants

In [4]:
EXCLUDED_COLUMNS = ['PassengerId', 'Cabin', 'Name']
RELATIVE_INPUT_PATH = '/kaggle/input/spaceship-titanic/'
RELATIVE_OUTPUT_PATH = '/kaggle/working/'

## Helper functions

In [5]:
def get_data():
    train = pd.read_csv(f'{RELATIVE_INPUT_PATH}/train.csv')
    test =  pd.read_csv(f'{RELATIVE_INPUT_PATH}/test.csv')
    return train, test


def calc_spend(df):
    return df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)


def split_cabin(df):
    deck = df['Cabin'].str.split(pat='/', expand=True)[0]
    side = df['Cabin'].str.split(pat='/', expand=True)[2]
    return deck, side


def drop_unwanted_columns(df):
    cols_to_drop = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin', 'PassengerId', 'Name',
                    'Spend', 'VIP', 'HomePlanet', 'Destination']
    return df.drop(cols_to_drop, axis=1)


def get_categorical_and_numerical_columns(df):
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    num_cols = df.select_dtypes(exclude=['object']).columns.tolist()

    # deal with excluded
    cat_cols = [i for i in cat_cols if i not in EXCLUDED_COLUMNS]
    num_cols = [i for i in num_cols if i not in EXCLUDED_COLUMNS]
    return cat_cols, num_cols


def separate_features_from_label(df):
    label_name = 'Transported'
    labels = df[label_name]
    df.drop(label_name, axis='columns', inplace=True)
    return df, labels


def preprocess(df, train_set=True):
    df['Spend'] = calc_spend(df)
    df['Deck'], df['Side'] = split_cabin(df)
    df = drop_unwanted_columns(df)
    if train_set:
        df, dfy_train = separate_features_from_label(df)
        dfy_train = dfy_train.astype(int).fillna(value=0)
        return df, dfy_train
    else:
        return df


def write_preprocessed_data(train, test):
    train['LABEL'] = test
    train.to_csv(f'{RELATIVE_OUTPUT_PATH}/preprocessed.csv')


def write_results(test_pred, _X_test):
    pid = _X_test['PassengerId'].to_numpy()
    df = pd.DataFrame()
    df['PassengerId'] = pid
    df['Transported'] = test_pred.astype(bool)
    df = df.reset_index(drop=True)
    df.to_csv(f'{RELATIVE_OUTPUT_PATH}/submission.csv', index=False)


## Helper classes

In [6]:
# custom transformer
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, train_set):
        self.train_set = train_set

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X['Spend'] = calc_spend(X)
        X['Deck'], X['Side'] = split_cabin(X)
        return X

## Data Pipeline

### Load data

In [7]:
train, X_test = get_data()
X_train, y_train = separate_features_from_label(train)

### Define numerical and categorical pipelines

In [8]:
    # split columns into categorical and numerical
    cat_cols, num_cols = get_categorical_and_numerical_columns(X_train)
    print('cat_cols', cat_cols)
    print('num_cols', num_cols)

    # numerical pipeline
    num_pipeline = Pipeline([
        ('numerical_imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ])

    # categorical pipeline
    cat_pipeline = Pipeline([
        ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder', OneHotEncoder(sparse=False))
    ])

    # joint categorical and numerical transformer
    column_transformer = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ])

cat_cols ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
num_cols ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


### Full Pipeline

In [9]:
# full pipeline including preprocessing, column transformer and the classifier
full_pipeline = Pipeline([
    ('preprocessor', Preprocessor(train_set=True)),
    ('column_transformer', column_transformer),
    ('classifier', RandomForestClassifier())
])

### Fitting the training data into RandomizedSearchCV using hyperparameter grid

In [10]:
# hyperparameters to check for
param_grid = {
    'column_transformer__num__numerical_imputer__strategy': ['mean', 'median'],
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
    'classifier__n_estimators': [200, 300, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__ccp_alpha': [.1, .01, .001, .0001, .00001, .000001],
}

# search and fit the training data
search = RandomizedSearchCV(full_pipeline, param_grid, scoring='f1', n_iter=10, error_score="raise")
search.fit(X_train, y_train)

# print tuned pipeline metadata
print(f'tuned hyperparameters :(best parameters) ', search.best_params_)
print(f'f1 score :', search.best_score_)
print(f' best estimator: {search.best_estimator_}')
print(f' whole search obj: {search}')

tuned hyperparameters :(best parameters)  {'column_transformer__num__numerical_imputer__strategy': 'median', 'classifier__n_estimators': 500, 'classifier__max_features': 'sqrt', 'classifier__criterion': 'gini', 'classifier__ccp_alpha': 0.001}
f1 score : 0.7990293344601956
 best estimator: Pipeline(steps=[('preprocessor', Preprocessor(train_set=True)),
                ('column_transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('numerical_imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                      

## Predict test data and submit result

In [11]:
# predict and write test data results/submission
yhat = search.predict(X_test)
write_results(yhat, X_test)