In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load Data
df_train = pd.read_csv('/kaggle/input/titanic/train.csv', index_col='PassengerId')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv', index_col='PassengerId')

# Define target and features
X = df_train.drop(columns=['Survived'])
y = df_train['Survived']

# Split the dataset
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

# Identify categorical and numerical columns
cat_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]
num_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Feature Engineering Function
def add_features(df):
    df = df.copy()
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df = df.drop(columns=['SibSp', 'Parch'], errors='ignore') 
    return df

add_features(X_train)
add_features(X_valid)

# Preprocessing Pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Column Transformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Final Pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter Tuning
param_grid = {
    'classifier__n_estimators': [100, 150, 200, 300],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(rf_pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best Model
tuned_model = grid_search.best_estimator_
y_pred = tuned_model.predict(X_valid)
print(f'Tuned Random Forest Accuracy: {accuracy_score(y_valid, y_pred):.4f}')





Tuned Random Forest Accuracy: 0.8101


In [9]:
# Prepare test data
X_test = df_test.copy()
add_features(X_test)

y_test_pred = tuned_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({'PassengerId': df_test.index, 'Survived': y_test_pred})
submission.to_csv('submission.csv', index=False)

print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'
