In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load Titanic dataset
df = pd.read_csv('titanic.csv')

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
df[['Age', 'Embarked']] = imputer.fit_transform(df[['Age', 'Embarked']])

In [3]:
# Encode categorical variables
categorical_features = ['Sex', 'Embarked']
numeric_features = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

In [4]:
# Assemble the model inputs (numeric columns first, then categorical ones)
# and take the `Survived` column as the label.
feature_columns = numeric_features + categorical_features
X = df[feature_columns]
y = df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [5]:
# Candidate classifiers to compare, keyed by the label printed in the report.
# The seeded estimators share one seed and the ensembles share one size.
SEED = 1
N_ESTIMATORS = 100

models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=SEED,
    ),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=SEED,
    ),
}

In [6]:
# For every candidate: chain the shared preprocessor with the classifier,
# fit on the training split, predict the held-out split, and report
# accuracy (four decimals) followed by the confusion matrix.
for name, model in models.items():
    steps = [('preprocessor', preprocessor), ('classifier', model)]
    pipeline = Pipeline(steps=steps)

    # fit() returns the fitted pipeline, so prediction can be chained onto it.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f'{name} Accuracy: {acc:.4f}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n')

Logistic Regression Accuracy: 0.8045
Confusion Matrix:
[[90 16]
 [19 54]]

Random Forest Accuracy: 0.7821
Confusion Matrix:
[[95 11]
 [28 45]]

SVM Accuracy: 0.7877
Confusion Matrix:
[[96 10]
 [28 45]]

KNN Accuracy: 0.7263
Confusion Matrix:
[[90 16]
 [33 40]]

Gradient Boosting Accuracy: 0.7821
Confusion Matrix:
[[97  9]
 [30 43]]

