In [43]:
# [IMPORTS]
import os

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

In [44]:
# [DATA_LOADING]
# Path of the Titanic training CSV. Set the TITANIC_TRAIN_CSV environment
# variable to read the file from another location; when it is unset the
# original hard-coded local path is used, so existing behaviour is unchanged.
TRAIN_CSV_PATH = os.environ.get(
    "TITANIC_TRAIN_CSV",
    "C:\\Users\\shash\\Downloads\\titanic\\train.csv",
)
data = pd.read_csv(TRAIN_CSV_PATH)

In [45]:
# [INITIAL_CLEANUP]
# Discard the columns this analysis does not use, then key the rows by
# PassengerId (it becomes the index and is no longer a regular column).
data = (
    data
    .drop(columns=["Name", "Ticket", "Cabin", "Embarked"])
    .set_index('PassengerId')
)

In [46]:
# [FEATURE_ENGINEERING]
# Family size feature: SibSp + Parch, plus 1 (presumably to count the
# passenger themselves).
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# Impute missing ages with the median age of the passenger's class.
# transform('median') yields, for every row, the median Age of its Pclass
# group, which is then used only where Age is missing.
data['Age'] = data['Age'].fillna(data.groupby('Pclass')['Age'].transform('median'))

# Fill missing fares with the overall median fare.
# The result is assigned back explicitly: calling fillna(..., inplace=True)
# on data['Fare'] is chained assignment, which warns in recent pandas and
# does not update `data` at all once Copy-on-Write is enabled.
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

In [47]:
# [CATEGORICAL_ENCODING]
# Replace the textual Sex column with a numeric code (male -> 1, female -> 0).
# Any value not present in the mapping becomes NaN.
data = data.assign(Sex=data["Sex"].map({'male': 1, 'female': 0}))

In [48]:
# [FEATURE_SELECTION]
# Target vector: the Survived column
y = data["Survived"]

# Feature matrix: every other column, with 'Pclass' one-hot encoded.
# drop_first=True omits the first class level, leaving one indicator column
# per remaining level.
X = pd.get_dummies(
    data.drop(columns="Survived"),
    columns=['Pclass'],
    drop_first=True,
)

In [52]:
# [TRAIN_TEST_SPLIT]
# Split the unscaled features first so the scaler never sees the held-out
# rows. train_size and random_state are the same as before, so the same rows
# should land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

# [SCALING]
# Learn each feature's min/max from the training rows only, then apply that
# fitted transform to the test rows. Fitting on the full dataset would leak
# information about the test set into preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any cell still reading X_scaled keeps working: the full
# feature matrix passed through the train-fitted scaler.
X_scaled = scaler.transform(X)

In [53]:
# [HYPERPARAMETER_TUNING]
# Hyperparameter values to try; the grid search evaluates every combination.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
    scale_pos_weight=[1, 2],  # Handle class imbalance
)

# Base XGBoost classifier that the search clones for each candidate
model = XGBClassifier(eval_metric='logloss', random_state=42)

# 5-fold cross-validated search scored by F1, using all available cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# fit() returns the fitted search object, so the best estimator can be read
# straight off the call.
best_model = grid_search.fit(X_train, y_train).best_estimator_

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [54]:
# [MODEL_EVALUATION]
# Predict labels for the held-out rows with the tuned model
y_pred = best_model.predict(X_test)

# Score the predictions against the true test labels
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

# Report both metrics to four decimal places, one per line
print(f"Model Accuracy: {accuracy:.4f}\nModel F1 Score: {f1:.4f}")

Model Accuracy: 0.8246
Model F1 Score: 0.7685


In [55]:
# [FEATURE_IMPORTANCE]
# Pair each feature column name with the importance the tuned model assigned
# to it (both are in the column order of X).
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_model.feature_importances_}
)

# Show the table with the most important features first
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))


Feature Importance:
      Feature  Importance
0         Sex    0.530948
7    Pclass_3    0.185318
4        Fare    0.053261
5  FamilySize    0.051526
6    Pclass_2    0.049000
3       Parch    0.047180
1         Age    0.046562
2       SibSp    0.036205
