#Task 1: Titanic Survival Prediction



Develop a machine learning model to predict whether a passenger survived the Titanic disaster. 
• Dataset includes features like age, gender, ticket class, fare, cabin information etc. 
• Handle missing values, encode categorical variables, and normalize numerical data effectively. 
• Evaluate model performance using accuracy, precision, etc. 
• Expected outcome: A well-trained classification model with strong survival prediction accuracy.
• Submit a GitHub repository with structured code, preprocessing steps, model selection, and performance analysis in README


In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [42]:
#loading the dataset
data = pd.read_csv("titanic_dataset.csv");

In [44]:
# preprocessing

# stp1 determine number of missing values
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [45]:
# determing the type before handling null values 
data.dtypes
data.head()

# Drop unneeded columns
data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
''' Cabin(categorical variable) we can convert it to numerical but as it not necessary for given specific task thus conversion 
is skiped
'''

' Cabin(categorical variable) we can convert it to numerical but as it not necessary for given specific task thus conversion \nis skiped\n'

In [63]:
#stp 2: Fill missing values of Age   and fare by meadian
data['Age'] = data['Age'].fillna(data['Age'].median())


# Fill missing Embarked values with most common value
data['Embarked'].fillna(data['Embarked'].mode()[0])

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [64]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [65]:
data.columns


Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [66]:
# Step 3: Create feature matrix and target vector
X = data.drop('Survived', axis=1)  # Simply drop the target column
y = data['Survived']
# Print to verify what columns we have
print("X columns:", X.columns.tolist())

X columns: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


In [67]:
# Step 3: Define feature columns based on what's actually in the data
# Let's autodetect numeric and categorical columns
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()


In [68]:
# Step 3: Define feature columns based on what's actually in the data
# Let's autodetect numeric and categorical columns
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

# Step 4: Build preprocessing pipeline
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
], remainder='passthrough')  # This will keep any columns we didn't explicitly transform


Numeric features: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical features: ['Sex', 'Embarked']


In [69]:


# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'Sex' is a column in your DataFrame X
le = LabelEncoder()
X['Sex'] = le.fit_transform(X['Sex'])  # Transforms 'male' and 'female' into 0 and 1

# Check the transformation
print(X['Sex'].head())


0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int32


In [71]:
# Logistic Regression
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])
log_reg_pipeline.fit(X_train, y_train)

# Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)

# Step 7: Make predictions
y_pred_log_reg = log_reg_pipeline.predict(X_test)
y_pred_rf = rf_pipeline.predict(X_test)

# Step 8: Evaluate models
print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

print("\n" + "="*50 + "\n")

print("Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# Optional: Cross-validation for more robust evaluation
from sklearn.model_selection import cross_val_score

print("\n" + "="*50 + "\n")
print("Cross-validation scores (Logistic Regression):")
cv_scores_log_reg = cross_val_score(log_reg_pipeline, X, y, cv=5)
print(f"CV Accuracy: {cv_scores_log_reg.mean():.4f} ± {cv_scores_log_reg.std():.4f}")

print("\nCross-validation scores (Random Forest):")
cv_scores_rf = cross_val_score(rf_pipeline, X, y, cv=5)
print(f"CV Accuracy: {cv_scores_rf.mean():.4f} ± {cv_scores_rf.std():.4f}")


Logistic Regression Performance:
Accuracy: 0.8101

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


Confusion Matrix:
[[90 15]
 [19 55]]


Random Forest Performance:
Accuracy: 0.8212

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179


Confusion Matrix:
[[91 14]
 [18 56]]


Cross-validation scores (Logistic Regression):
CV Accuracy: 0.7912 ± 0.0185

Cross-validation scores (Random Forest):
CV Ac