In [None]:
import os

import numpy as np
import pandas as pd
from IPython.display import FileLink
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Verify file paths
# Locations of the two input CSVs read by the loading cell.
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'

# Fail fast, and name every path that is absent so the traceback shows
# exactly which input is missing instead of a generic either/or message.
missing_paths = [path for path in (train_path, test_path) if not os.path.exists(path)]
if missing_paths:
    raise FileNotFoundError(
        "train.csv or test.csv not found. Missing: " + ", ".join(missing_paths)
    )

In [None]:
# Load datasets
# Read the train CSV and then the test CSV into DataFrames, in that order.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Feature Engineering
# Combine train and test so every transformation below is applied identically to both.
# `assign` adds the placeholder Survived column to a copy, leaving test_df unmodified.
# `ignore_index=True` gives the combined frame unique row labels; without it the two
# frames' own indexes would be kept and repeat each other.
combined = pd.concat(
    [train_df, test_df.assign(Survived=np.nan)],
    sort=False,
    ignore_index=True,
)

# Extract Title: the first run of letters in Name that follows a space and ends with a period.
# Raw string so `\.` reaches the regex engine without an invalid-escape warning.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Catch-all code: also given to any title missing from the mapping and to names
# in which no title was matched.
OTHER_TITLE_CODE = 4
title_mapping = {
    'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3,
    'Dr': 4, 'Rev': 4, 'Col': 4, 'Major': 4,
    # 'Mme' is grouped with 'Mrs', consistent with 'Mlle'/'Ms' being grouped with 'Miss'.
    'Mlle': 1, 'Ms': 1, 'Mme': 2, 'Lady': 2, 'Countess': 2,
    'Jonkheer': 4, 'Don': 4, 'Sir': 4, 'Capt': 4, 'Dona': 2
}
combined['Title'] = combined['Title'].map(title_mapping).fillna(OTHER_TITLE_CODE).astype('int64')

# Fill missing values consistently
# Age: median of the passenger's (Title, Pclass) group. A group with no known ages
# yields a NaN median, so fall back to the overall median to guarantee no NaN remains
# (the integer casts further down would fail on NaN).
age_group_median = combined.groupby(['Title', 'Pclass'])['Age'].transform('median')
combined['Age'] = combined['Age'].fillna(age_group_median).fillna(combined['Age'].median())
# Fare: median of the passenger's Pclass, with the same overall-median fallback.
fare_class_median = combined.groupby('Pclass')['Fare'].transform('median')
combined['Fare'] = combined['Fare'].fillna(fare_class_median).fillna(combined['Fare'].median())
# Embarked: missing ports default to 'S'.
combined['Embarked'] = combined['Embarked'].fillna('S')

# Encode categorical variables
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1}).astype('int64')
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
combined['Embarked'] = combined['Embarked'].map(embarked_mapping).astype('int64')

# Family features
# FamilySize counts siblings/spouses + parents/children + the passenger.
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1
combined['IsAlone'] = (combined['FamilySize'] == 1).astype('int64')

# AgeBin and FareBin
# `include_lowest=True` keeps an age of exactly 0 in the first bin and the open-ended
# last edge keeps ages above 100 in the last bin; either would otherwise become NaN
# and break the integer cast.
combined['AgeBin'] = pd.cut(
    combined['Age'],
    bins=[0, 12, 18, 30, 50, np.inf],
    labels=[0, 1, 2, 3, 4],
    include_lowest=True,
).astype('int64')
# FareBin: quartile of Fare computed over the combined frame.
combined['FareBin'] = pd.qcut(combined['Fare'], q=4, labels=[0, 1, 2, 3]).astype('int64')

# Drop irrelevant columns
# Reassign instead of `inplace=True`; `errors='ignore'` tolerates columns already absent.
combined = combined.drop(columns=['Cabin', 'Ticket', 'Name'], errors='ignore')

In [None]:
# Split back into train and test
# Rows that carry a Survived label are training rows; rows where it is missing are test rows.
has_label = combined['Survived'].notnull()
lacks_label = combined['Survived'].isnull()
train_df = combined[has_label]
test_df = combined[lacks_label]

In [None]:
# Prepare features
# Columns passed to the models: raw numeric columns plus the engineered/encoded ones.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize',
            'Title', 'AgeBin', 'FareBin', 'IsAlone']
X_train = train_df[features]
# Survived became a float column when the unlabelled test rows (NaN) were concatenated
# with the training rows; cast back so the classifiers receive integer class labels.
y_train = train_df['Survived'].astype('int64')
X_test = test_df[features]

In [None]:
# Ensemble model: XGBoost + Random Forest
# Hyperparameters are gathered per model so each configuration reads as one unit.
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    random_state=42,
    eval_metric='logloss',
)
rf_params = dict(n_estimators=100, max_depth=5, random_state=42)

xgb = XGBClassifier(**xgb_params)
rf = RandomForestClassifier(**rf_params)

# Soft voting combines the two models' predicted class probabilities.
base_models = [('xgb', xgb), ('rf', rf)]
ensemble = VotingClassifier(estimators=base_models, voting='soft')

In [None]:
# Cross-validation
# Stratified 5-fold split; shuffling is seeded so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cross_val_score fits a fresh clone of `ensemble` on each training fold and scores it
# (accuracy) on the held-out fold, so the `ensemble` object itself is not left fitted on
# a single fold's data by this cell. `.tolist()` keeps cv_scores a plain list of floats.
cv_scores = cross_val_score(ensemble, X_train, y_train, cv=skf, scoring='accuracy').tolist()

print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))


In [None]:
# Train final model
# Refit the ensemble on the complete training set (no held-out fold); this is the
# fitted model that the prediction cell uses on X_test.
ensemble.fit(X_train, y_train)

In [None]:
# Predict and save submission
# Predict a label for every test row, then pair each label with its PassengerId.
predictions = ensemble.predict(X_test)
submission_columns = {
    'PassengerId': test_df['PassengerId'],
    'Survived': predictions.astype(int),  # written as integer labels
}
submission = pd.DataFrame(submission_columns)
# index=False: only the two named columns are written to the CSV.
submission.to_csv('submission.csv', index=False)



In [None]:
# Report the output file and, where `display` exists, show a download link.
print("Submission file saved: submission.csv")
try:
    submission_link = FileLink('submission.csv')
    display(submission_link)
except NameError:
    # `display` is not defined in this environment; fall back to a text hint.
    print("FileLink not available; download submission.csv manually.")