In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- 1. Load Data ---
df = pd.read_csv(r"C:\Users\MY PC\Downloads\Titanic-Dataset.csv")
print("Data loaded successfully.")

Data loaded successfully.


In [5]:
# --- 2. Feature Engineering and Cleaning ---

# Create FamilySize (Siblings/Spouses + Parents/Children + Passenger themselves)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Extract Title from Name (using r'' for raw string to avoid SyntaxWarning)
# This captures the word between a space and a dot (e.g., ' Mr.', ' Miss.')
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group less common titles to reduce the number of categories
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')
df['Title'] = df['Title'].replace('Mlle', 'Miss') # French Miss
df['Title'] = df['Title'].replace('Ms', 'Miss')   # Miss
df['Title'] = df['Title'].replace('Mme', 'Mrs')   # French Mrs

# Impute missing Age: Fill NaNs with the median Age specific to the passenger's Pclass and Sex
df['Age'] = df.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

# Impute missing Embarked: Fill NaNs with the mode (most common port)
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)

# Impute missing Fare: Fill NaNs with the median Fare
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Drop original columns that are now redundant or not useful for modeling
df_model = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], axis=1)
print("Data cleaned and features engineered.")

Data cleaned and features engineered.


In [7]:
# --- 3. Encode Categorical Features ---

# Convert Pclass to categorical before one-hot encoding
df_model['Pclass'] = df_model['Pclass'].astype('category')

# One-hot encode the categorical variables
categorical_features = ['Sex', 'Embarked', 'Pclass', 'Title']
# drop_first=True helps prevent multicollinearity
df_model = pd.get_dummies(df_model, columns=categorical_features, drop_first=True)

# Drop the 'Title_Rare' column which is now captured by the absence of other Title columns
df_model = df_model.drop(columns=['Title_Rare'], errors='ignore')

print("Categorical features encoded.")
print("\nFinal feature set:")
print(list(df_model.columns))

Categorical features encoded.

Final feature set:
['Survived', 'Age', 'Fare', 'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Pclass_2', 'Pclass_3', 'Title_Miss', 'Title_Mr', 'Title_Mrs']


In [9]:
# --- 4. Model Training Preparation ---

# Define features (X) and target (y)
X = df_model.drop('Survived', axis=1)
y = df_model['Survived']

# Split data into training (80%) and testing (20%) sets
# random_state ensures reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData split into training and testing sets.")


Data split into training and testing sets.


In [11]:
# --- 5. Train the Random Forest Model ---

# Initialize the model with 100 trees and a max depth of 5 for regularization
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)

print("Random Forest Classifier trained successfully.")

Random Forest Classifier trained successfully.


In [13]:
# --- 6. Prediction, Evaluation, and Feature Importance ---

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Calculate and display Feature Importance
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop 10 Feature Importances:")
print(feature_importances.head(10))


Model Accuracy: 0.8324

Top 10 Feature Importances:
Title_Mr      0.233544
Sex_male      0.217173
Fare          0.116430
Pclass_3      0.102856
FamilySize    0.080721
Title_Mrs     0.079525
Age           0.074822
Title_Miss    0.044489
Pclass_2      0.023696
Embarked_S    0.019280
dtype: float64
