In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Data Import and Initial Exploration
try:
    df = pd.read_csv('Titanic-Dataset.csv')
except FileNotFoundError:
    print("Error: Please ensure 'Titanic-Dataset.csv' is in the working directory.")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive use and is not guaranteed
    # to exist. A non-zero status also reports the failure to the caller.
    raise SystemExit(1)

print("--- Initial Data Info ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line after the summary.
df.info()
print("\nMissing values:\n", df.isnull().sum())

In [ ]:
# 2. Handle Missing Values

# Impute 'Age' (Numerical) using the median.
# The filled Series is assigned back to the column on purpose: calling
# fillna(inplace=True) on df['Age'] is chained assignment, which newer pandas
# versions warn about and which does not update df under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Impute 'Embarked' (Categorical) using the mode.
# mode() returns a Series of the most frequent value(s); take the first one.
most_frequent_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_frequent_embarked)

# Drop 'Cabin' due to high percentage of missing values
df.drop(columns=['Cabin'], inplace=True)

# Re-print the per-column null counts so the effect of the imputation is visible.
print("\nMissing values after imputation:\n", df.isnull().sum())

In [ ]:
# 3. Convert Categorical Features (Encoding)

# 'Sex' has two categories: store its label-encoded integer codes in a new
# 'Sex_Encoded' column, then discard the original text column.
le = LabelEncoder()
sex_codes = le.fit_transform(df['Sex'])
df['Sex_Encoded'] = sex_codes
df.drop(columns=['Sex'], inplace=True)

# 'Embarked' has more than two categories: expand it into 'Emb_*' indicator
# columns, leaving out the first level (drop_first=True).
df = pd.get_dummies(
    df,
    columns=['Embarked'],
    prefix='Emb',
    drop_first=True,
)

# 'Name' and 'Ticket' are free-text object columns that are not used further.
df.drop(columns=['Name', 'Ticket'], inplace=True)

In [ ]:
# 4. Outlier Detection and Removal (Focus on 'Fare')

# Box plot of 'Fare' as it looks before any rows are filtered out.
plt.figure(figsize=(8, 6))
sns.boxplot(y=df['Fare'])
plt.title('Fare Distribution (Before Removal)')
plt.show()

# IQR rule: anything above Q3 + 1.5 * IQR counts as an outlier.
# Only the upper fence is applied; no lower bound is computed.
Q1, Q3 = df['Fare'].quantile([0.25, 0.75])
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Keep the rows at or below the fence. The explicit .copy() makes df_clean an
# independent frame, so later column assignments do not trigger warnings.
within_fence = df['Fare'] <= upper_bound
df_clean = df[within_fence].copy()

print(f"\nOriginal row count: {len(df)}. Rows after outlier removal: {len(df_clean)}.")

In [ ]:
# 5. Standardize Numerical Features

# Standardization (Z-score scaling) is applied to 'Age' and 'Fare' only.
scaler = StandardScaler()
numeric_cols = ['Age', 'Fare']
# Fit the scaler on the cleaned data and overwrite both columns with the
# scaled values in one step.
scaled_values = scaler.fit_transform(df_clean[numeric_cols])
df_clean[numeric_cols] = scaled_values

print("\n--- Final Clean Data Snapshot (First 5 Rows) ---")
print(df_clean.head())

# Persist the preprocessed frame (without the index column) for the next task.
df_clean.to_csv('titanic_preprocessed.csv', index=False)