# Day 1: Data Cleaning & Preprocessing
**Objective:** Learn how to clean and prepare raw data for Machine Learning tasks
**Tools:** Python, Pandas, NumPy, Matplotlib, Seaborn

In [55]:
# imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [56]:
# Import the dataset and explore basic info (nulls, data types).
df=pd.read_csv("../data/Titanic-Dataset.csv")

print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset Info:")
print(df.info()) 

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values in Each Column:")
print(df.isnull().sum())

print("Shape of the data:")
print(df.shape)



First 5 rows of the dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450

In [57]:
# Fill missing Age with median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing Embarked with mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop Cabin column
#too many missing values
df = df.drop('Cabin', axis=1)


In [58]:
# Convert categorical features into numerical using encoding.
le = LabelEncoder()

# Encode Sex
df['Sex'] = le.fit_transform(df['Sex'])  # male/female → 0/1

# Encode Embarked
df['Embarked'] = le.fit_transform(df['Embarked'])  # C/Q/S → 0/1/2

print(df[['Sex', 'Embarked']].head())


   Sex  Embarked
0    1         2
1    0         0
2    0         2
3    0         2
4    1         2


In [59]:
# Normalize/standardize the numerical features.
scaler = StandardScaler()

# Fit and transform numerical columns
df["Age"]=scaler.fit_transform(df[["Age"]])
df["Fare"]=scaler.fit_transform(df[["Fare"]])

print(df[['Age', 'Fare']].head())


        Age      Fare
0 -0.565736 -0.502445
1  0.663861  0.786845
2 -0.258337 -0.488854
3  0.433312  0.420730
4  0.433312 -0.486337


In [60]:
#Visualize outliers using boxplots and remove them.
visualization_path = '../visualizations'
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    plt.figure(figsize=(8,4))
    sns.boxplot(x=df[col])
    plt.title(f'{col} Outliers')
    plt.savefig(visualization_path+f'/{col}_Boxplot_before_outlier_handled.png')
    plt.close()

def remove_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    return df[(df[col] >= (Q1 - 1.5*IQR)) & (df[col] <= (Q3 + 1.5*IQR))]

print("Dataset shape before removing outliers:", df.shape)

for col in num_cols:
    print(f"Dataset shape after removing outlier of {col}: ", df.shape)
    df = remove_outliers(df, col)

print("Dataset shape after removing all outliers:", df.shape)

for col in num_cols:
    plt.figure(figsize=(8,4))
    sns.boxplot(x=df[col])
    plt.title(f'{col} Outliers')
    plt.savefig(visualization_path+f'/{col}_Boxplot_after_outlier_handled.png')
    plt.close()

Dataset shape before removing outliers: (891, 11)
Dataset shape after removing outlier of PassengerId:  (891, 11)
Dataset shape after removing outlier of Survived:  (891, 11)
Dataset shape after removing outlier of Pclass:  (891, 11)
Dataset shape after removing outlier of Sex:  (891, 11)
Dataset shape after removing outlier of Age:  (891, 11)
Dataset shape after removing outlier of SibSp:  (825, 11)
Dataset shape after removing outlier of Parch:  (786, 11)
Dataset shape after removing outlier of Fare:  (642, 11)
Dataset shape after removing outlier of Embarked:  (561, 11)
Dataset shape after removing all outliers: (421, 11)
