<h1 style="color:#2E86C1; text-align:center;"> Data Cleaning on Titanic Dataset</h1>


<h2 style="color:#117A65;">🔹 Step 1: Import Libraries</h2>

In [2]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML


<h2 style="color:#117A65;">🔹 Step 2: Load Dataset</h2>

In [3]:
# Titanic dataset (Kaggle version hosted on GitHub or Drive)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preview
def style_table(df, bg_color="#20B2AA"):
    return df.style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', bg_color),
                                      ('color', 'white'),
                                      ('border', '1px solid black'),
                                      ('padding', '8px'),
                                      ('font-weight', 'bold')]},
         {'selector': 'td', 'props': [('border', '1px solid black'),
                                      ('padding', '8px')]}]
    ).set_properties(**{'text-align': 'center'})

display(style_table(df.head()))


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<h2 style="color:#117A65;">🔹 Step 3: Check Missing Values</h2>


In [4]:
missing = df.isnull().sum()
missing = missing[missing > 0]

styled_missing = missing.to_frame("Missing Values").style.set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#F5B7B1'),
                                  ('color', 'black'),
                                  ('border', '1px solid black'),
                                  ('padding', '8px'),
                                  ('font-weight', 'bold')]},
     {'selector': 'td', 'props': [('border', '1px solid black'),
                                  ('padding', '8px')]}]
)

display(styled_missing)


Unnamed: 0,Missing Values
Age,177
Cabin,687
Embarked,2


<h2 style="color:#117A65;">🔹 Step 4: Handle Missing Values</h2>

In [5]:
# Fill missing 'Age' with median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing 'Embarked' with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop 'Cabin' column (too many missing)
df.drop(columns=['Cabin'], inplace=True)

print("\033[1;34mMissing values handled successfully!\033[0m")


[1;34mMissing values handled successfully![0m


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


<h2 style="color:#117A65;">🔹 Step 5: Check for Duplicates</h2>

In [6]:
duplicates = df.duplicated().sum()
print(f"\033[1;34mTotal Duplicates: {duplicates}\033[0m")


df.drop_duplicates(inplace=True)
print("\033[1;34m  Duplicates removed!\033[0m")


[1;34mTotal Duplicates: 0[0m
[1;34m  Duplicates removed![0m


<h2 style="color:#117A65;">🔹 Step 6: Encode Categorical Columns</h2>


In [7]:
# Label Encoding for "Sex"
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# One-hot Encoding for "Embarked"
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

print("\033[1;34m Categorical values converted to numeric!\033[0m")


[1;34m Categorical values converted to numeric![0m


<h2 style="color:#117A65;">🔹 Step 7: Preview Cleaned Dataset</h2>

In [8]:
display(style_table(df.head()))


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",1,38.0,1,0,PC 17599,71.2833,False,False
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,False,True


<h2 style="color:#117A65;">🔹 Step 8: Summary After Cleaning</h2>

In [9]:
from IPython.display import HTML

summary_data = {
    "Column Names": [df.columns.tolist()],
    "Shape": [df.shape],
    "Data Types": [df.dtypes.to_dict()]
}

summary_html = """
<h3 style="color:#117A65;"> Final Summary After Cleaning</h3>
<table style="border-collapse:collapse; border:2px solid black; width:80%; text-align:left;">
  <tr style="background-color:#20B2AA; color:white; font-weight:bold;">
    <th style="padding:8px; border:1px solid black;">Metric</th>
    <th style="padding:8px; border:1px solid black;">Details</th>
  </tr>
  <tr>
    <td style="padding:8px; border:1px solid black; font-weight:bold;">Column Names</td>
    <td style="padding:8px; border:1px solid black;">{columns}</td>
  </tr>
  <tr>
    <td style="padding:8px; border:1px solid black; font-weight:bold;">Shape</td>
    <td style="padding:8px; border:1px solid black;">{shape}</td>
  </tr>
  <tr>
    <td style="padding:8px; border:1px solid black; font-weight:bold;">Data Types</td>
    <td style="padding:8px; border:1px solid black;">{dtypes}</td>
  </tr>
</table>
""".format(columns=df.columns.tolist(), shape=df.shape, dtypes=df.dtypes.to_dict())

display(HTML(summary_html))


summary_after = pd.DataFrame({
    "Metric": ["Rows", "Columns", "Missing Values"],
    "Value": [df.shape[0], df.shape[1], df.isnull().sum().sum()]
})

# Style the summary table
summary_after.style.set_table_styles(
    [
        {"selector": "th",
         "props": [("background-color", "#20B2AA"),
                   ("color", "white"),
                   ("font-weight", "bold"),
                   ("text-align", "center"),
                   ("border", "1px solid black")]},
        {"selector": "td",
         "props": [("border", "1px solid black"),
                   ("padding", "6px"),
                   ("text-align", "center")]}
    ]
).set_properties(**{"background-color": "#E6F2F0"})


Metric,Details
Column Names,"['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked_Q', 'Embarked_S']"
Shape,"(891, 12)"
Data Types,"{'PassengerId': dtype('int64'), 'Survived': dtype('int64'), 'Pclass': dtype('int64'), 'Name': dtype('O'), 'Sex': dtype('int64'), 'Age': dtype('float64'), 'SibSp': dtype('int64'), 'Parch': dtype('int64'), 'Ticket': dtype('O'), 'Fare': dtype('float64'), 'Embarked_Q': dtype('bool'), 'Embarked_S': dtype('bool')}"


Unnamed: 0,Metric,Value
0,Rows,891
1,Columns,12
2,Missing Values,0


In [11]:
# 🔹 Step 8: Final Summary After Cleaning
from IPython.display import HTML

# --- Numeric Summary ---
summary_after = pd.DataFrame({
    "Metric": ["Rows", "Columns", "Missing Values"],
    "Value": [df.shape[0], df.shape[1], df.isnull().sum().sum()]
})

styled_numeric = summary_after.style.set_table_styles(
    [
        {"selector": "th",
         "props": [("background-color", "#20B2AA"),
                   ("color", "white"),
                   ("font-weight", "bold"),
                   ("text-align", "center"),
                   ("border", "1px solid black")]},
        {"selector": "td",
         "props": [("border", "1px solid black"),
                   ("padding", "6px"),
                   ("text-align", "center")]}
    ]
).set_properties(**{"background-color": "#E6F2F0"})

display(HTML("<h3 style='color:#117A65;'> Numeric Summary</h3>"))
display(styled_numeric)

# --- Detailed Dataset Summary ---
summary_html = """
<h3 style="color:#117A65;"> Detailed Dataset Summary</h3>
<table style="border-collapse:collapse; border:2px solid black; width:90%; text-align:left;">
  <tr style="background-color:#20B2AA; color:white; font-weight:bold;">
    <th style="padding:8px; border:1px solid black;">Metric</th>
    <th style="padding:8px; border:1px solid black;">Details</th>
  </tr>
  <tr>
    <td style="padding:8px; border:1px solid black; font-weight:bold;">Column Names</td>
    <td style="padding:8px; border:1px solid black;">{columns}</td>
  </tr>
  <tr>
    <td style="padding:8px; border:1px solid black; font-weight:bold;">Shape</td>
    <td style="padding:8px; border:1px solid black;">{shape}</td>
  </tr>
  <tr>
    <td style="padding:8px; border:1px solid black; font-weight:bold;">Data Types</td>
    <td style="padding:8px; border:1px solid black;">{dtypes}</td>
  </tr>
</table>
""".format(columns=df.columns.tolist(), shape=df.shape, dtypes=df.dtypes.to_dict())

display(HTML(summary_html))


Unnamed: 0,Metric,Value
0,Rows,891
1,Columns,12
2,Missing Values,0


Metric,Details
Column Names,"['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked_Q', 'Embarked_S']"
Shape,"(891, 12)"
Data Types,"{'PassengerId': dtype('int64'), 'Survived': dtype('int64'), 'Pclass': dtype('int64'), 'Name': dtype('O'), 'Sex': dtype('int64'), 'Age': dtype('float64'), 'SibSp': dtype('int64'), 'Parch': dtype('int64'), 'Ticket': dtype('O'), 'Fare': dtype('float64'), 'Embarked_Q': dtype('bool'), 'Embarked_S': dtype('bool')}"


In [12]:
# Step 9: Export the Cleaned Dataset
# -----------------------------------
df.to_csv("titanic_cleaned.csv", index=False)

from IPython.display import HTML

export_html = """
<h3 style="color:#117A65;">✅ Cleaned Dataset Exported</h3>
<p style="font-size:14px;">The cleaned Titanic dataset has been successfully exported as
<strong><code>titanic_cleaned.csv</code></strong> and is ready for further analysis or modeling.</p>
"""

display(HTML(export_html))


<h3 style="color:#117A65;"> 🔹Export Cleaned Dataset </h3>


In [14]:
df.to_csv("titanic_cleaned.csv", index=False)

print(" Cleaned dataset exported as 'titanic_cleaned.csv'. Ready for further analysis or modeling.")


 Cleaned dataset exported as 'titanic_cleaned.csv'. Ready for further analysis or modeling.
