In [1]:
import pandas as pd
import numpy as np

# Load the raw train/test CSVs (paths are relative to the working directory).
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Keep the test-set PassengerId column aside: it is the key column of the
# submission file built later.
passenger_ids = test_df['PassengerId']

# Stack train (minus the target) on top of test so both parts go through the
# same imputation and end up with the same dummy columns.
# ignore_index avoids duplicated row labels in the combined frame; the split
# below is positional, so it does not depend on the labels.
combined = pd.concat(
    [train_df.drop(columns='Survived'), test_df],
    axis=0,
    ignore_index=True,
)

# Preprocessing, expressed as one chain so re-running the cell is idempotent:
#   - Sex      -> 0/1 (values other than 'male'/'female' become NaN)
#   - Age/Fare -> missing values replaced by the median of the combined frame
#   - Embarked -> missing values replaced by 'S', then one-hot encoded with the
#                 first level dropped
#   - PassengerId is only used as the submission key, so it is dropped from the
#     features together with the free-text columns instead of being fed to the
#     model as if it were a predictor.
df = (
    combined
    .assign(
        Sex=combined['Sex'].map({'male': 0, 'female': 1}),
        Age=combined['Age'].fillna(combined['Age'].median()),
        Fare=combined['Fare'].fillna(combined['Fare'].median()),
        Embarked=combined['Embarked'].fillna('S'),
    )
    .pipe(pd.get_dummies, columns=['Embarked'], drop_first=True)
    .drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])
)

# Split back by position: the first len(train_df) rows are the training rows.
n_train = len(train_df)
X_train = df.iloc[:n_train]
X_test = df.iloc[n_train:]
y_train = train_df['Survived']

# Sanity checks on the split.
assert len(X_train) == len(y_train), "feature/label row count mismatch"
assert len(X_test) == len(passenger_ids), "test features/ids row count mismatch"


In [None]:
import xgboost as xgb

# Wrap the feature frames in XGBoost's DMatrix container; only the training
# matrix carries labels.
dtrain_full = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Number of boosting rounds. With the native xgb.train API the number of trees
# is controlled by num_boost_round; an 'n_estimators' entry in params is not
# used by the booster, so it is no longer listed there.
NUM_BOOST_ROUND = 100

# Hyperparameters reported as the best ones from an earlier search.
params = {
    'max_depth': 5,
    'learning_rate': 0.01,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'subsample': 0.8,
    'colsample_bytree': 1.0,
    'gamma': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 1.5,
    # subsample < 1 makes training stochastic; pin the seed explicitly
    # (0 is XGBoost's documented default, so results are unchanged).
    'seed': 0,
    'verbosity': 0,
}

# Train the final model on the full training set.
final_model = xgb.train(params, dtrain_full, num_boost_round=NUM_BOOST_ROUND)


In [3]:
# Score the test matrix with the trained booster.
y_pred_proba = final_model.predict(dtest)

# Threshold the scores at 0.5 to obtain hard 0/1 labels.
y_pred = np.where(y_pred_proba >= 0.5, 1, 0)

# Assemble the two-column submission frame: id column first, prediction second.
submission = passenger_ids.to_frame(name='PassengerId').assign(Survived=y_pred)

# Write the submission without the index column.
submission_path = 'data/submission.csv'
submission.to_csv(submission_path, index=False)
print("✅ File 'submission.csv' saved and ready to upload to Kaggle.")


✅ File 'submission.csv' saved and ready to upload to Kaggle.
