In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
def preprocess_data(file_path='E:\AIML Tasks\Titanic-Dataset.csv'):
    """Loads and fully preprocesses the Titanic dataset."""
    
    df = pd.read_csv(file_path)

    #  Step 1: Handle Missing Values (Part 1) & Drop Columns 
    embarked_mode = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(embarked_mode)
    df = df.drop(columns=['Cabin', 'PassengerId', 'Ticket'])

    #  Step 2: Feature Engineering 
    def get_title(name):
        title_search = re.search(' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return ""

    df['Title'] = df['Name'].apply(get_title)
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df['Title'] = df['Title'].apply(lambda x: x if x in common_titles else 'Other')
    df = df.drop(columns=['Name'])

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df = df.drop(columns=['SibSp', 'Parch'])

    #  Step 3: Categorical Feature Encoding 
    le = LabelEncoder()
    df['Sex'] = le.fit_transform(df['Sex']) # male=1, female=0
    df = pd.get_dummies(df, columns=['Embarked', 'Title'], drop_first=True)

    #  Step 4: Define X and y, then Split 
    y = df['Survived']
    X = df.drop('Survived', axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    #  Step 5: Post-Split Imputation (Age) 
    age_imputer = SimpleImputer(strategy='median')
    X_train.loc[:, 'Age'] = age_imputer.fit_transform(X_train[['Age']])
    X_test.loc[:, 'Age'] = age_imputer.transform(X_test[['Age']])

    #  Step 6: Post-Split Feature Scaling 
    cols_to_scale = ['Age', 'Fare', 'FamilySize']
    scaler = StandardScaler()
    X_train.loc[:, cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
    X_test.loc[:, cols_to_scale] = scaler.transform(X_test[cols_to_scale])
    
    return X_train, X_test, y_train, y_test

  def preprocess_data(file_path='E:\AIML Tasks\Titanic-Dataset.csv'):
  title_search = re.search(' ([A-Za-z]+)\.', name)


In [3]:

# Preprocessed data
X_train, X_test, y_train, y_test = preprocess_data()

print("--- Training Tuned Random Forest Model (GridSearchCV) ---")
    
#  Defining the Parameter Grid 
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

#  Initializing GridSearchCV 
# We use n_jobs=1 to prevent potential errors in some environments
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1, # Set to 1 to avoid pickling errors
    verbose=1 
)

# Fit GridSearchCV
print("Starting Hyperparameter Tuning...")
grid_search.fit(X_train, y_train)

#  Get Best Model 
print("\n--- Hyperparameter Tuning Complete ---")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
    
best_rf_model = grid_search.best_estimator_

#  Evaluate Best Model 
y_pred_tuned_rf = best_rf_model.predict(X_test)
    
accuracy_tuned_rf = accuracy_score(y_test, y_pred_tuned_rf)
print(f"\n--- Tuned Model Accuracy on Test Set ---")
print(f"Accuracy: {accuracy_tuned_rf:.4f} (or {accuracy_tuned_rf*100:.2f}%)")

print("\n--- Tuned Model Classification Report ---")
report_tuned_rf = classification_report(y_test, y_pred_tuned_rf, target_names=['Did Not Survive (0)', 'Survived (1)'])
print(report_tuned_rf)



 -0.55466613  0.04009635 -0.55466613 -0.55466613 -0.55466613  0.63485883
 -0.55466613  0.04009635 -0.55466613  0.04009635  0.04009635 -0.55466613
 -0.55466613  0.04009635  3.01390875 -0.55466613 -0.55466613  0.63485883
 -0.55466613 -0.55466613  0.63485883 -0.55466613  1.22962131 -0.55466613
 -0.55466613  0.63485883 -0.55466613  0.04009635 -0.55466613  1.82438379
  1.82438379  0.63485883  0.04009635  0.04009635 -0.55466613 -0.55466613
 -0.55466613  0.63485883 -0.55466613 -0.55466613  0.63485883 -0.55466613
  1.22962131  0.04009635 -0.55466613  0.04009635  1.82438379 -0.55466613
 -0.55466613 -0.55466613 -0.55466613 -0.55466613 -0.55466613 -0.55466613
 -0.55466613 -0.55466613  0.04009635 -0.55466613 -0.55466613 -0.55466613
 -0.55466613 -0.55466613 -0.55466613 -0.55466613 -0.55466613 -0.55466613
  0.63485883 -0.55466613 -0.55466613  0.63485883 -0.55466613 -0.55466613
  0.04009635 -0.55466613  0.04009635 -0.55466613  0.04009635 -0.55466613
 -0.55466613 -0.55466613 -0.55466613 -0.55466613 -0

--- Training Tuned Random Forest Model (GridSearchCV) ---
Starting Hyperparameter Tuning...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

--- Hyperparameter Tuning Complete ---
Best parameters found: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation accuracy: 0.8314

--- Tuned Model Accuracy on Test Set ---
Accuracy: 0.8380 (or 83.80%)

--- Tuned Model Classification Report ---
                     precision    recall  f1-score   support

Did Not Survive (0)       0.85      0.89      0.87       105
       Survived (1)       0.83      0.77      0.80        74

           accuracy                           0.84       179
          macro avg       0.84      0.83      0.83       179
       weighted avg       0.84      0.84      0.84       179



In [4]:
#  Save Model 
print("Saving model...")

# Save the Tuned Random Forest model
joblib.dump(best_rf_model, 'best_rf_model.joblib')

print("File saved: best_rf_model.joblib")

Saving model...
File saved: best_rf_model.joblib
