In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the labelled training file and hold out 20% of its rows for evaluation.
# The fixed random_state makes the split reproducible from run to run.
df = pd.read_csv("train.csv")

train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Model input columns, grouped by the preprocessing each group receives.
num_features = ['Age', 'Fare']
cat_features = ['Sex', 'Embarked', 'Pclass']

# Numeric columns: fill missing values with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' lets transform accept categories
# that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

Create a pipeline to train a Random Forest classifier

In [None]:
# Target and model inputs taken from the 80% training split.
y_train = train_data['Survived']
X_train = train_data[num_features + cat_features]

# Baseline model: preprocessing and classifier chained into one estimator, so
# the preprocessing is fitted inside each cross-validation fold.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# 5-fold cross-validated accuracy on the training split only.
scores = cross_val_score(
    estimator=rf_pipeline,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Random Forest CV Accuracy:", scores.mean())

Random Forest CV Accuracy: 0.7992613020782035


In [None]:
# Held-out 20% split: same feature columns as training, plus the true labels.
y_test = test_data['Survived']
X_test = test_data[num_features + cat_features]

# Fit the pipeline on the full 80% training split, then predict the held-out
# rows (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
y_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Accuracy on the 20% that was never used for fitting.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7932960893854749


In [6]:
# Build a submission file from test.csv using the fitted baseline pipeline.
test_raw = pd.read_csv("test.csv")

# Select the same feature columns the pipeline was trained on.
X_submission = test_raw[num_features + cat_features]

submission_preds = rf_pipeline.predict(X_submission)

# One row per passenger: the identifier from test.csv and the predicted label.
submission = pd.DataFrame({
    "PassengerId": test_raw["PassengerId"],
    "Survived": submission_preds
})

# Keep the output path in a single variable so the confirmation message always
# names the file that was actually written (it previously said "submission.csv").
baseline_submission_path = "submission_rf_no_fe.csv"
submission.to_csv(baseline_submission_path, index=False)
print("Submission file created:", baseline_submission_path)

Submission file created: submission.csv


#### This submission results in an accuracy of 0.76555.

Reference (hyperparameter tuning in random forests): https://www.kaggle.com/code/nargisbegum82/hyperparameter-tuning-in-random-forests

In [None]:
# Hyperparameter tuning of the Random Forest pipeline with an exhaustive grid search.

# n_estimators=100 is only the starting value here; the grid below overrides it.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, max_features="sqrt")),
])

# Search space over four hyperparameters: 3 * 3 * 4 * 4 = 144 combinations.
# The 'classifier__' prefix routes each value to the pipeline's 'classifier' step.
param_grid = dict(
    classifier__n_estimators=[1000, 1100, 1500],
    classifier__max_depth=[4, 5, 6],
    classifier__min_samples_split=[4, 6, 8, 10],
    classifier__min_samples_leaf=[1, 2, 5, 10],
)

# Each candidate is evaluated with 5-fold cross-validation: 144 * 5 = 720 fits.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# ----- Evaluation -----
print("Best Parameters:", grid_search.best_params_)

# Score the best pipeline found by the search on the held-out 20% split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'classifier__max_depth': 6, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 1500}
Accuracy: 0.8100558659217877


In [19]:
# Build a second submission file, this time with the tuned model from the grid
# search. X_submission and test_raw come from the earlier submission cell.
submission_preds = best_model.predict(X_submission)

# One row per passenger: the identifier from test.csv and the predicted label.
submission = pd.DataFrame({
    "PassengerId": test_raw["PassengerId"],
    "Survived": submission_preds
})

# Keep the output path in a single variable so the confirmation message always
# names the file that was actually written (it previously said "submission.csv").
tuned_submission_path = "submission_rf_ht_no_fe.csv"
submission.to_csv(tuned_submission_path, index=False)
print("Submission file created:", tuned_submission_path)

Submission file created: submission.csv


#### This submission results in an accuracy of 0.77990