# Titanic - Machine Learning from Disaster

Competition: https://www.kaggle.com/c/titanic

**Notebook này được thiết kế để chạy trên:**
- Local (VS Code với conda env `kaggle-competitions`)
- Google Colab
- Kaggle Kernels

## 1. Bootstrap - Environment Setup

Cell này tự động phát hiện và cấu hình môi trường (local/colab/kaggle)

In [None]:
# === BOOTSTRAP CELL - UNIVERSAL SETUP ===
import sys
import os
from pathlib import Path

# GitHub configuration
# Owner of the GitHub repository; used in the raw-download URL and passed to
# setup_env.setup() on cloud runtimes.
GITHUB_USER = "n24q02m"
# Repository name; used in the raw-download URL and passed to setup_env.setup().
REPO_NAME = "n24q02m-kaggle-competitions"
# Branch that core/setup_env.py is downloaded from on cloud runtimes.
BRANCH = "main"

# Detect environment
def detect_env():
    """Return the name of the runtime this notebook is executing in.

    Returns:
        'colab' when the ``google.colab`` module is already loaded,
        'kaggle' when ``kaggle_web_client`` is loaded or a ``/kaggle``
        path exists, and 'local' otherwise.
    """
    # Colab takes precedence over every other check.
    if 'google.colab' in sys.modules:
        return 'colab'

    on_kaggle = 'kaggle_web_client' in sys.modules or os.path.exists('/kaggle')
    return 'kaggle' if on_kaggle else 'local'

ENV = detect_env()
print(f"Detected: {ENV.upper()}")

# Environment-specific setup
if ENV == 'local':
    # Local: import the helper straight from the repository checkout.
    # Assumes the working directory is competitions/titanic/notebooks/,
    # so the repository root is three levels up.
    repo_root = Path.cwd().parent.parent.parent
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    from core import setup_env
    env = setup_env.setup()

else:
    # Cloud: fetch setup_env.py from GitHub and import it from the working dir.
    import requests
    import subprocess

    CORE_URL = f"https://raw.githubusercontent.com/{GITHUB_USER}/{REPO_NAME}/{BRANCH}/core"

    # NOTE(security): the downloaded file is imported and executed below.
    # Pin BRANCH to a commit hash if integrity/reproducibility matters.
    print("Downloading setup_env.py...")
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(f"{CORE_URL}/setup_env.py", timeout=30)
    # Fail loudly on 4xx/5xx instead of writing an error page to setup_env.py
    # and hitting a confusing SyntaxError on import.
    response.raise_for_status()
    with open("setup_env.py", "w", encoding="utf-8") as f:
        f.write(response.text)

    # Import the freshly downloaded module and run its setup.
    import setup_env
    env = setup_env.setup(GITHUB_USER, REPO_NAME)

# Show information about the configured environment
env.info()

## 2. Configuration

Cấu hình chung cho notebook

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Configuration class
class CFG:
    """Notebook-wide settings, evaluated once when the class body runs."""

    seed = 42                # random seed for reproducibility
    n_folds = 5              # number of cross-validation folds
    target_col = 'Survived'  # label column in train.csv

    # Input location depends on the detected environment: Kaggle mounts the
    # competition data read-only; everywhere else a sibling 'data' folder of
    # the working directory's parent is used.
    data_dir = (
        Path('/kaggle/input/titanic')
        if ENV == 'kaggle'
        else Path.cwd().parent / 'data'
    )

    train_path = data_dir / 'train.csv'
    test_path = data_dir / 'test.csv'

    # NOTE(review): resolved relative to the working directory in every
    # environment; confirm this directory exists/is writable on Kaggle/Colab.
    submission_path = Path.cwd().parent / 'submissions' / 'submission.csv'

# Set random seeds
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import so the function does not depend on a separate import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED at startup, so this only
    # affects child processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the configured seed before any randomised work happens.
seed_everything(CFG.seed)

# Display config
print("Configuration:")
print(f"  - Seed: {CFG.seed}")
print(f"  - N Folds: {CFG.n_folds}")
print(f"  - Data Dir: {CFG.data_dir}")
# The next two lines print True/False: whether each CSV exists on disk.
print(f"  - Train: {CFG.train_path.exists()}")
print(f"  - Test: {CFG.test_path.exists()}")

## 3. Load Data

In [None]:
# Load datasets from the paths resolved in CFG
train = pd.read_csv(CFG.train_path)
test = pd.read_csv(CFG.test_path)

# Shapes are (rows, columns)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Display first rows (rendered as the cell output because it is the last expression)
train.head()

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Basic info
print("=" * 50)
print("TRAIN DATA INFO")
print("=" * 50)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
train.info()
print("\n" + "=" * 50)
print("STATISTICAL SUMMARY")
print("=" * 50)
# describe() returns a DataFrame, so this one does need print().
print(train.describe())

In [None]:
# Missing values: count NaNs per column, keep only columns that have any,
# and order them from most to fewest missing.
missing = train.isna().sum()
missing = missing.loc[missing.gt(0)].sort_values(ascending=False)

if missing.empty:
    print("No missing values!")
else:
    # Horizontal bar chart of the per-column missing counts.
    plt.figure(figsize=(10, 5))
    missing.plot(kind='barh')
    plt.title('Missing Values in Train Data')
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()

    print("\nMissing Values:")
    print(missing)

In [None]:
# Target distribution: the class counts are computed once and drawn twice.
target_counts = train[CFG.target_col].value_counts()
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: absolute counts per class.
target_counts.plot(kind='bar', ax=axes[0])
axes[0].set(
    title='Survived Count',
    xlabel='Survived (0=No, 1=Yes)',
    ylabel='Count',
)

# Right panel: share of each class as a pie chart.
target_counts.plot(kind='pie', autopct='%1.1f%%', ax=axes[1])
axes[1].set(title='Survived Distribution', ylabel='')

plt.tight_layout()
plt.show()

# The mean of a 0/1 target is the fraction of positives.
print(f"\nSurvival Rate: {train[CFG.target_col].mean():.2%}")

## 5. Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of ``df`` with engineered features appended.

    Reads the columns ``SibSp``, ``Parch``, ``Name``, ``Age``, ``Fare`` and
    ``Cabin`` and adds, in this order: ``FamilySize``, ``IsAlone``, ``Title``,
    ``AgeGroup``, ``FareGroup`` and ``HasCabin``. The input frame is not
    modified.

    Note:
        ``FareGroup`` quartiles are computed from the frame passed in, so
        separate calls on different frames use different cut points.
    """
    out = df.copy()

    # Family size = siblings/spouses + parents/children + the passenger.
    family_size = out["SibSp"] + out["Parch"] + 1
    out["FamilySize"] = family_size
    out["IsAlone"] = family_size.eq(1).astype(int)

    # Title: the alphabetic word that follows a space and ends with a period.
    raw_titles = out["Name"].str.extract(" ([A-Za-z]+)\\.", expand=False)

    # Collapse uncommon titles into "Rare" and normalise spelling variants.
    title_map = dict.fromkeys(
        (
            "Lady",
            "Countess",
            "Capt",
            "Col",
            "Don",
            "Dr",
            "Major",
            "Rev",
            "Sir",
            "Jonkheer",
            "Dona",
        ),
        "Rare",
    )
    title_map.update({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})
    out["Title"] = raw_titles.replace(title_map)

    # Age buckets are right-closed intervals: (0, 12], (12, 18], ...
    age_edges = [0, 12, 18, 35, 60, 100]
    age_names = ["Child", "Teen", "Adult", "Middle", "Senior"]
    out["AgeGroup"] = pd.cut(out["Age"], bins=age_edges, labels=age_names)

    # Fare buckets are the quartiles of this frame's own Fare column.
    fare_names = ["Low", "Medium", "High", "VeryHigh"]
    out["FareGroup"] = pd.qcut(out["Fare"], q=4, labels=fare_names)

    # 1 when a cabin value is recorded, 0 when it is missing.
    out["HasCabin"] = (~out["Cabin"].isna()).astype(int)

    return out


# Apply feature engineering to train and test independently; each call
# returns a new frame and leaves the raw `train` / `test` frames untouched.
print("Creating features...")
train_fe = create_features(train)
test_fe = create_features(test)
print("Feature engineering complete!")

# Display new features (first rows of a subset of the engineered columns)
print("\nNew features:")
print(train_fe[["FamilySize", "IsAlone", "Title", "AgeGroup", "HasCabin"]].head())

## 6. Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder


def preprocess_data(train_df, test_df, target_col="Survived"):
    """Impute missing values and label-encode categorical columns.

    All imputation statistics (medians, modes) are taken from the training
    frame and applied to both frames. The input frames are not modified.

    Args:
        train_df: Training frame; may contain ``target_col``.
        test_df: Test frame with the same feature columns.
        target_col: Name of the label column to split off.

    Returns:
        Tuple ``(X_train, X_test, y_train, label_encoders)`` where ``y_train``
        is ``None`` when ``target_col`` is absent from ``train_df`` and
        ``label_encoders`` maps each encoded column name to its fitted
        ``LabelEncoder``.
    """
    # Separate target
    y_train = train_df[target_col].copy() if target_col in train_df.columns else None

    # Drop target and columns that are not used as features
    drop_cols = [target_col, "PassengerId", "Name", "Ticket", "Cabin"]
    X_train = train_df.drop(
        columns=[col for col in drop_cols if col in train_df.columns]
    )
    X_test = test_df.drop(columns=[col for col in drop_cols if col in test_df.columns])

    # Age: fill with the training median of the passenger's (Pclass, Sex)
    # group. The medians are computed once, before any filling takes place.
    age_medians = X_train.groupby(["Pclass", "Sex"])["Age"].median()
    for (pclass, sex), median_age in age_medians.items():
        for df in (X_train, X_test):
            needs_fill = (
                (df["Pclass"] == pclass) & (df["Sex"] == sex) & df["Age"].isna()
            )
            df.loc[needs_fill, "Age"] = median_age

    # Embarked: fill with the training mode. Assign the result back instead of
    # calling fillna(inplace=True) on a column selection: under pandas
    # Copy-on-Write that would update a temporary and leave the frame as is.
    embarked_mode = X_train["Embarked"].mode()[0]
    X_train["Embarked"] = X_train["Embarked"].fillna(embarked_mode)
    X_test["Embarked"] = X_test["Embarked"].fillna(embarked_mode)

    # Fare: fill the test frame with the training median
    fare_median = X_train["Fare"].median()
    X_test["Fare"] = X_test["Fare"].fillna(fare_median)

    # AgeGroup / FareGroup (when present): fill with the training mode
    for col in ("AgeGroup", "FareGroup"):
        if col in X_train.columns:
            mode_val = X_train[col].mode()[0]
            X_train[col] = X_train[col].fillna(mode_val)
            if col in X_test.columns:
                X_test[col] = X_test[col].fillna(mode_val)

    # Encode categorical variables
    label_encoders = {}
    cat_cols = X_train.select_dtypes(include=["object", "category"]).columns

    for col in cat_cols:
        le = LabelEncoder()
        # Fit on train + test together so both frames share one code table
        # and transform() never meets an unseen label.
        combined = pd.concat([X_train[col], X_test[col]])
        le.fit(combined.astype(str))

        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
        label_encoders[col] = le

    return X_train, X_test, y_train, label_encoders


# Apply preprocessing; target_col is left at its default ("Survived").
print("Preprocessing data...")
X_train, X_test, y_train, encoders = preprocess_data(train_fe, test_fe)
print("Preprocessing complete!")
# Shapes are (rows, feature columns) after dropping the target/id columns.
print(f"\nTrain shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print(f"Features: {list(X_train.columns)}")

## 7. Modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Candidate classifiers, all seeded from CFG.seed. Insertion order matters:
# it is the order used for printing, tie-breaking and the later box plot.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=CFG.seed),
    "RandomForest": RandomForestClassifier(random_state=CFG.seed, n_estimators=100),
    "GradientBoosting": GradientBoostingClassifier(
        random_state=CFG.seed, n_estimators=100
    ),
    "XGBoost": XGBClassifier(
        eval_metric="logloss", random_state=CFG.seed, n_estimators=100
    ),
    "LightGBM": LGBMClassifier(verbose=-1, random_state=CFG.seed, n_estimators=100),
}

# Cross-validation
from sklearn.model_selection import cross_val_score

print("Training models với Cross-Validation...")
cv_results = {}

# Score every candidate with CFG.n_folds-fold CV on accuracy and keep the
# per-fold scores for later comparison.
for name, model in models.items():
    scores = cross_val_score(
        model, X_train, y_train, scoring="accuracy", cv=CFG.n_folds
    )
    cv_results[name] = scores
    print(f"{name:20s} - CV Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Pick the model with the highest mean CV accuracy (first one wins on ties)
# and refit it on the full training data.
best_model_name = max(cv_results, key=lambda candidate: cv_results[candidate].mean())
print(f"\nBest model: {best_model_name}")

final_model = models[best_model_name]
final_model.fit(X_train, y_train)
print("Model training complete!")

## 8. Evaluation

In [None]:
# Feature importance (only for models exposing `feature_importances_`,
# e.g. tree-based ones; skipped otherwise)
if hasattr(final_model, "feature_importances_"):
    feature_importance = pd.DataFrame(
        {"feature": X_train.columns, "importance": final_model.feature_importances_}
    ).sort_values("importance", ascending=False)

    print("\nTop 10 Feature Importances:")
    print(feature_importance.head(10))

    # Plot. DataFrame.plot() opens its own figure unless it is given an Axes,
    # which would leave the figure created here empty and ignore its figsize;
    # passing the current Axes draws into this figure.
    plt.figure(figsize=(10, 6))
    feature_importance.head(10).plot(
        x="feature", y="importance", kind="barh", ax=plt.gca()
    )
    plt.title(f"Top 10 Feature Importances - {best_model_name}")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.show()

# Compare models: one box per model, built from its per-fold CV accuracies.
cv_df = pd.DataFrame(cv_results)

plt.figure(figsize=(12, 6))
cv_df.boxplot()
plt.title("Cross-Validation Scores by Model")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.tight_layout()
plt.show()

## 9. Submission

In [None]:
# Make predictions on test set
test_predictions = final_model.predict(X_test)

# Create submission file: ids come from the raw test frame, in row order
submission = pd.DataFrame(
    {"PassengerId": test["PassengerId"], "Survived": test_predictions}
)

# Save to CSV. to_csv() raises if the target directory does not exist, so
# create it first (no-op when it is already there).
CFG.submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(CFG.submission_path, index=False)
print(f"\nSubmission saved to: {CFG.submission_path}")
print(f"Submission shape: {submission.shape}")
print("\nFirst few predictions:")
print(submission.head(10))

# Display submission statistics (predictions are 0/1, so mean = survival rate)
print(f"\nPredicted survival rate: {test_predictions.mean():.2%}")
print(f"Total passengers predicted to survive: {test_predictions.sum()}")
print(
    f"Total passengers predicted to die: {len(test_predictions) - test_predictions.sum()}"
)