# Titanic Survival Prediction Model
This notebook develops a machine learning model to predict Titanic passenger survival.
It includes data preprocessing, model training, and evaluation.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the passenger table from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='tested 2.csv')
# Preview the first five rows as a quick sanity check
data.head(n=5)

In [None]:
# Data preprocessing
# Handle missing values.
# Assign each filled column back rather than calling fillna(inplace=True) on
# data[col]: the chained in-place form acts on a temporary Series, triggers a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['Cabin'] = data['Cabin'].fillna('Unknown')

# Feature engineering: Extract deck from Cabin
# The deck is taken as the first character of the cabin string; rows that had
# no cabin (filled with 'Unknown' above) get the placeholder deck 'U'.
data['Deck'] = data['Cabin'].apply(lambda x: x[0] if x != 'Unknown' else 'U')

# Drop columns not useful for prediction
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Encode categorical variables as integer codes.
# Each fitted encoder is kept in label_encoders, keyed by column name, so the
# code-to-label mapping stays available after the column is overwritten.
label_encoders = {}
categorical_cols = ['Sex', 'Embarked', 'Deck']
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Normalize numerical features
# NOTE(review): the scaler (like the medians/mode used above) is fitted on the
# full dataset before the train/test split, so test-row statistics leak into
# preprocessing -- consider fitting these on the training rows only.
scaler = StandardScaler()
numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

data.head()

In [None]:
# Separate the target column from the predictor columns
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build a random forest with library-default hyperparameters and a fixed seed,
# then fit it on the training split
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Generate class predictions for the held-out test rows
y_pred = model.predict(X=X_test)

In [None]:
# Evaluate model performance
# Score the test-set predictions with each summary metric, in the same order
# as the names on the left-hand side
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric rounded to four decimal places
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class breakdown of the same predictions
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

In [None]:
# Feature importance visualization
# Pair the fitted model's feature_importances_ scores with the feature column names
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)

# Horizontal bar chart: one bar per feature, bar length = importance score
plt.figure(figsize=(10, 6))
sns.barplot(y=feat_importances.index, x=feat_importances)
plt.title(label='Feature Importances')
plt.show()