In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load dataset
df = pd.read_csv('Titanic-Dataset.csv')

# Copy dataframe
df_clean = df.copy()

# Drop Cabin
df_clean.drop(columns=['Cabin'], inplace=True)
# Impute Age with median
age_imputer = SimpleImputer(strategy='median')
df_clean['Age'] = age_imputer.fit_transform(df_clean[['Age']])
# Impute Embarked with most frequent
embarked_imputer = SimpleImputer(strategy='most_frequent')
df_clean['Embarked'] = embarked_imputer.fit_transform(df_clean[['Embarked']])

# Encode categorical columns
label_encoders = {}
categorical_cols = ['Sex', 'Embarked']
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Standardize numerical columns
numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch']
scaler = StandardScaler()
df_clean[numerical_cols] = scaler.fit_transform(df_clean[numerical_cols])

# Remove outliers
for col in numerical_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

# Visualize boxplots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for i, col in enumerate(numerical_cols):
    sns.boxplot(y=df_clean[col], ax=axes[i//2][i%2])
    axes[i//2][i%2].set_title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

ValueError: 2

In [None]:

# Titanic Dataset Preprocessing and Analysis

## Step 1: Import and Explore the Dataset

The dataset consists of 891 rows and 12 columns. Key observations:
- Missing values in `Age`, `Cabin`, and `Embarked`.
- `Cabin` has too many missing values and was dropped.
- Mix of categorical and numerical features.

## Step 2: Handle Missing Values
- `Age`: Imputed using the median.
- `Embarked`: Imputed using the most frequent value.
- `Cabin`: Dropped due to excessive missing data.

## Step 3: Encode Categorical Features
- `Sex` and `Embarked` converted to numeric using Label Encoding.

## Step 4: Normalize Numerical Features
- StandardScaler was used on `Age`, `Fare`, `SibSp`, and `Parch`.

## Step 5: Outlier Detection and Removal
- Boxplots used to visualize outliers.
- Outliers removed using the IQR method for the 4 standardized numerical columns.

