# Titanic Pipeline Prediction
This notebook builds a **Logistic Regression model** using **Scikit-Learn Pipelines**  
to predict passenger survival on the Titanic dataset.  

It demonstrates:
- Feature engineering  
- Preprocessing with `ColumnTransformer`  
- End-to-end model building with a `Pipeline`  
- Generating a submission file for predictions

In [1]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import os

# Data directory. It can be overridden with the TITANIC_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default when the variable is not set.
DATA_DIR = os.environ.get("TITANIC_DATA_DIR", r"C:\Users\NIHAR\Downloads\titanic")

# Set working directory: every relative path used from here on (the CSVs read
# below and any file written later) resolves against DATA_DIR.
os.chdir(DATA_DIR)

# Load train and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Feature Engineering
# Add two derived columns to both frames (in place):
#   FamilySize - siblings/spouses + parents/children + the passenger
#   IsAlone    - 1 when FamilySize is exactly 1, otherwise 0
for df in (train_data, test_data):
    family_size = df['SibSp'].add(df['Parch']).add(1)
    df['FamilySize'] = family_size
    df['IsAlone'] = family_size.eq(1).astype(int)
    
# Extract passenger titles (e.g., Mr, Mrs, Miss)
def extract_title(name, default="Unknown"):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the following period,
    e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Passenger name, expected in the form ``"Surname, Title. Given names"``.
    default : str, optional
        Value returned when no title can be extracted, i.e. ``name`` is not a
        string (for example NaN/None) or contains no comma.

    Returns
    -------
    str
        The stripped title, or ``default`` when the name cannot be parsed.
    """
    # Missing values arrive as float NaN / None; they have no .split().
    if not isinstance(name, str):
        return default

    parts = name.split(',')
    # Without a comma there is no "after the surname" segment to read.
    if len(parts) < 2:
        return default

    return parts[1].split('.')[0].strip()

# Derive a Title column on both frames by mapping every Name through
# extract_title.
for df in (train_data, test_data):
    df['Title'] = df['Name'].map(extract_title)
    
# Define Target and Features
# The target is the Survived column; the feature frame is the training data
# minus the target and the listed columns. errors='ignore' means a listed
# column that is absent is skipped rather than raising.
non_feature_columns = ['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId']
y = train_data['Survived']
X = train_data.drop(columns=non_feature_columns, errors='ignore')

# Column groups routed to the categorical and numerical preprocessing branches
categorical_features = [
    'Sex',
    'Embarked',
    'Pclass',
    'Title',
]
numerical_features = [
    'Age',
    'Fare',
    'SibSp',
    'Parch',
    'FamilySize',
    'IsAlone',
]

# Preprocessing Pipelines

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform).
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numerical branch: fill missing values with the median, then standardize.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Route each column group through its own branch and combine the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", numerical_transformer, numerical_features),
    ]
)

# Combine preprocessing and model into one pipeline (Model Pipeline).
# Step names double as parameter prefixes (e.g. `classifier__C`) and as keys
# in `model.named_steps`, so the estimator step is spelled "classifier".
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Hold out part of the training data for validation. The split is stratified
# on the target and seeded so it is repeatable.
split = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split

# Fit preprocessing and classifier together on the training portion
model.fit(X_train, y_train)

# Report accuracy on the held-out portion
val_accuracy = model.score(X_val, y_val)
print("Validation Accuracy:", val_accuracy)

# Predict survival for every row of the test dataset; the pipeline picks the
# columns it was configured with from the frame it is given.
predictions = model.predict(test_data)

# Build the submission table: passenger ids alongside the predicted labels
submission = test_data[["PassengerId"]].assign(Survived=predictions)

# Write the table to CSV without the index column and confirm
submission.to_csv("submission_pipeline.csv", index=False)
print("Saved submission_pipeline.csv")

Validation Accuracy: 0.8161434977578476
Saved submission_pipeline.csv
