# **Data Cleaning Notebook**

## Objectives

- Assess and handle missing values
- Clean data

## Inputs

- outputs/datasets/collection/HousePrices.csv

## Outputs

- Cleaned full dataset: outputs/datasets/cleaned/HousePricesCleaned.csv
- Cleaned train/test splits: outputs/datasets/cleaned/TrainSetCleaned.csv, outputs/datasets/cleaned/TestSetCleaned.csv
- Data cleaning pipeline: outputs/ml_pipeline/data_cleaning/dataCleaning_pipeline.pkl


---

# Change working directory

In [None]:
import os

# Move one level up from the directory the notebook was started in, so that
# relative paths used below resolve from the parent directory.
# Re-running this cell moves up one more level each time.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)  # set project root
print("Current directory:", os.getcwd())

---

## Load Collected Data

In [None]:
import pandas as pd

# Read the collected dataset and preview its first five rows.
raw_data_path = "outputs/datasets/collection/HousePrices.csv"
df = pd.read_csv(raw_data_path)
df.head()

---

## Data Exploration

Identify columns with missing values

In [None]:
# Names of the columns that hold at least one missing value.
vars_with_missing = df.columns[df.isna().any()].tolist()
print("Columns with missing:", vars_with_missing)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df[vars_with_missing].info()

---

## Profile Report

In [None]:
from pandas_profiling import ProfileReport

# Profile only the columns that contain missing values. The list is named
# `vars_with_missing` (built in the Data Exploration cell); the previous
# name used here was never defined and raised NameError.
if vars_with_missing:
    profile = ProfileReport(df=df[vars_with_missing], minimal=True)
    profile.to_notebook_iframe()
else:
    print("There are no variables with missing data")

---

## Data Cleaning

Assessing Missing Data Levels

In [None]:
def EvaluateMissingData(df):
    """
    Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame indexed by column name with three columns: the
    number of missing entries, that number as a percentage of the row count
    (rounded to 2 decimals), and the column dtype. Rows are sorted by
    percentage, highest first, and columns whose rounded percentage is not
    above zero are left out.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df) * 100).round(2)

    summary = pd.DataFrame(
        data={
            "RowsWithMissingData": null_counts,
            "PercentageOfDataset": null_share,
            "DataType": df.dtypes,
        }
    )
    summary = summary.sort_values(by=["PercentageOfDataset"], ascending=False)

    # keep only the columns that actually have something missing
    return summary[summary["PercentageOfDataset"] > 0]

In [None]:
# Display the missing-data summary for the full collected dataset.
EvaluateMissingData(df)

## Handling Missing Data

In [None]:
import seaborn as sns

sns.set(style="whitegrid")
import matplotlib.pyplot as plt


def DataCleaningEffect(df_original, df_cleaned, variables_applied_with_method):
    """
    Show, variable by variable, how a cleaning step changed the distribution.

    For every name in ``variables_applied_with_method`` one figure is drawn
    and shown: a grouped count plot when the column is non-numeric in
    ``df_original``, otherwise two overlaid histograms with a KDE curve.
    The "Original" series is read from ``df_original`` and the "Cleaned"
    series from ``df_cleaned``. Nothing is returned.
    """
    colour_original, colour_cleaned = "#432371", "#FAAE7B"

    # columns of the original frame whose dtype is not numeric
    non_numeric_columns = df_original.select_dtypes(exclude=["number"]).columns

    print(
        "\n====================================================================================="
    )
    print(
        "* Distribution Effect Analysis After Data Cleaning Method in the following variables:"
    )
    print(f"{variables_applied_with_method} \n\n")

    # plot_number labels each figure, counting from 1
    for plot_number, var in enumerate(variables_applied_with_method, start=1):
        plot_title = f"Distribution Plot {plot_number}: {var}"

        if var not in non_numeric_columns:
            # numerical variable: overlay both histograms on the same axes
            fig, axes = plt.subplots(figsize=(10, 5))
            for frame, colour, series_label in (
                (df_original, colour_original, "Original"),
                (df_cleaned, colour_cleaned, "Cleaned"),
            ):
                sns.histplot(
                    data=frame,
                    x=var,
                    color=colour,
                    label=series_label,
                    kde=True,
                    element="step",
                    ax=axes,
                )
            axes.set(title=plot_title)
            plt.legend()
        else:
            # categorical variable: stack both versions in long format so
            # the count plot can group the bars by "Type"
            stacked = pd.concat(
                [
                    pd.DataFrame({"Type": "Original", "Value": df_original[var]}),
                    pd.DataFrame({"Type": "Cleaned", "Value": df_cleaned[var]}),
                ],
                axis=0,
            )
            fig, axes = plt.subplots(figsize=(15, 5))
            sns.countplot(
                hue="Type",
                data=stacked,
                x="Value",
                palette=[colour_original, colour_cleaned],
            )
            axes.set(title=plot_title)
            plt.xticks(rotation=90)
            plt.legend()

        plt.show()

## Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible. Only `df` is split: SalePrice stays a column of both
# resulting frames, so no separate target array is needed, and nothing is
# bound to `_`, which IPython uses for the last cell output.
TrainSet, TestSet = train_test_split(df, test_size=0.2, random_state=0)

print(f"TrainSet shape: {TrainSet.shape} \nTestSet shape: {TestSet.shape}")

In [None]:
# Summarise missing values on the train set only, then report how many
# variables are affected and display the summary table.
df_missing_data = EvaluateMissingData(TrainSet)
n_vars_with_missing = len(df_missing_data.index)
print(f"* There are {n_vars_with_missing} variables with missing data \n")
df_missing_data

## Drop Variables

In [None]:
from feature_engine.selection import DropFeatures

# Try dropping the listed variables on the train set.
variables_to_drop = ["EnclosedPorch", "WoodDeckSF"]
imputer = DropFeatures(features_to_drop=variables_to_drop)
df_method = imputer.fit_transform(TrainSet)

# Print, for each dropped name, whether it is still among the columns.
remaining_columns = df_method.columns.to_list()
for dropped_name in variables_to_drop:
    print(dropped_name in remaining_columns)

## Mean Imputation

In [None]:
from feature_engine.imputation import MeanMedianImputer

# Try mean imputation on the train set and plot its effect per variable.
variables_mean = ["LotFrontage", "BedroomAbvGr"]
imputer = MeanMedianImputer(imputation_method="mean", variables=variables_mean)
imputer.fit(TrainSet)
df_method = imputer.transform(TrainSet)
DataCleaningEffect(
    variables_applied_with_method=variables_mean,
    df_original=TrainSet,
    df_cleaned=df_method,
)

## Median Imputation

In [None]:
# Try median imputation on the train set and plot its effect per variable.
variables_median = ["2ndFlrSF", "GarageYrBlt", "MasVnrArea"]
imputer = MeanMedianImputer(imputation_method="median", variables=variables_median)
imputer.fit(TrainSet)
df_method = imputer.transform(TrainSet)
DataCleaningEffect(
    variables_applied_with_method=variables_median,
    df_original=TrainSet,
    df_cleaned=df_method,
)

In [None]:
# Rows of the train set whose GarageArea is zero, with their GarageYrBlt.
TrainSet.loc[TrainSet["GarageArea"] == 0, ["GarageYrBlt", "GarageArea"]]

## Categorical Imputation

In [None]:
from feature_engine.imputation import CategoricalImputer

# Try filling missing categories with the literal label "None" on the train
# set and plot the effect per variable.
variables_categorical = ["GarageFinish", "BsmtFinType1"]
imputer = CategoricalImputer(
    variables=variables_categorical,
    imputation_method="missing",
    fill_value="None",
)
imputer.fit(TrainSet)
df_method = imputer.transform(TrainSet)
DataCleaningEffect(
    variables_applied_with_method=variables_categorical,
    df_original=TrainSet,
    df_cleaned=df_method,
)

In [None]:
# Rows of the train set whose GarageArea is zero, with their GarageFinish.
TrainSet.loc[TrainSet["GarageArea"] == 0, ["GarageFinish", "GarageArea"]]

In [None]:
# Rows of the train set whose TotalBsmtSF is zero, with their BsmtFinType1.
TrainSet.loc[TrainSet["TotalBsmtSF"] == 0, ["BsmtFinType1", "TotalBsmtSF"]]

## Data Cleaning Pipeline

This pipeline has the following steps:

- Mean imputation: variables=['LotFrontage' , 'BedroomAbvGr']
- Median imputation: variables=['2ndFlrSF', 'MasVnrArea']
- Categorical imputation: variables=['GarageFinish' , 'BsmtFinType1']
- Dropping variables: features_to_drop=['EnclosedPorch', 'GarageYrBlt', 'WoodDeckSF']

In [None]:
from sklearn.pipeline import Pipeline

# (step name, transformer) pairs, listed in the order the pipeline runs them:
# mean imputation, median imputation, categorical imputation, then dropping
# the unwanted columns.
cleaning_steps = [
    (
        "mean",
        MeanMedianImputer(
            imputation_method="mean",
            variables=["LotFrontage", "BedroomAbvGr"],
        ),
    ),
    (
        "median",
        MeanMedianImputer(
            imputation_method="median",
            variables=["2ndFlrSF", "MasVnrArea"],
        ),
    ),
    (
        "categorical",
        CategoricalImputer(
            imputation_method="missing",
            fill_value="None",
            variables=["GarageFinish", "BsmtFinType1"],
        ),
    ),
    (
        "drop",
        DropFeatures(
            features_to_drop=["EnclosedPorch", "GarageYrBlt", "WoodDeckSF"],
        ),
    ),
]

dataCleaning_pipeline = Pipeline(steps=cleaning_steps)

Apply the pipeline to the train and test sets, and then to the whole dataset, to get cleaned data.

In [None]:
# Learn the imputation statistics from the train set only, then reuse them
# on the test set. Calling fit_transform on the test set would refit every
# step and impute the test rows with their own statistics instead of the
# ones learned from the training data.
TrainSet = dataCleaning_pipeline.fit_transform(TrainSet)
TestSet = dataCleaning_pipeline.transform(TestSet)

In [None]:
# Clean the full dataset. fit_transform re-fits every pipeline step on `df`,
# replacing whatever the pipeline learned in an earlier call.
# NOTE(review): a pipeline persisted after this cell carries statistics
# learned from the full dataset, test rows included - confirm this is intended.
df = dataCleaning_pipeline.fit_transform(df)

In [None]:
# List any columns of the cleaned train set that still contain missing values.
EvaluateMissingData(TrainSet)

In [None]:
# List any columns of the cleaned test set that still contain missing values.
EvaluateMissingData(TestSet)

In [None]:
# List any columns of the cleaned full dataset that still contain missing values.
EvaluateMissingData(df)

---

## Save Cleaned Data and Pipeline

In [None]:
# Create output dirs
os.makedirs("outputs/datasets/cleaned", exist_ok=True)

# Save CSVs. The cleaned frames are `df`, `TrainSet` and `TestSet`; they
# were overwritten in place by the pipeline cells above. The names
# previously used here (FullCleaned, TrainCleaned, TestCleaned) were never
# defined and raised NameError.
df.to_csv("outputs/datasets/cleaned/HousePricesCleaned.csv", index=False)
TrainSet.to_csv("outputs/datasets/cleaned/TrainSetCleaned.csv", index=False)
TestSet.to_csv("outputs/datasets/cleaned/TestSetCleaned.csv", index=False)

# Save pipeline
import joblib

os.makedirs("outputs/ml_pipeline/data_cleaning", exist_ok=True)
# The pipeline object in this notebook is `dataCleaning_pipeline`; the bare
# name `pipeline` was never defined.
joblib.dump(
    dataCleaning_pipeline,
    "outputs/ml_pipeline/data_cleaning/dataCleaning_pipeline.pkl",
)

---

## Summary and Next Steps

**Summary**

- Assessed and handled missing values:
    - Mean imputation: variables=['LotFrontage' , 'BedroomAbvGr']
    - Median imputation: variables=['2ndFlrSF', 'MasVnrArea']
    - Categorical imputation: variables=['GarageFinish' , 'BsmtFinType1']
    - Dropping variables: features_to_drop=['EnclosedPorch', 'GarageYrBlt', 'WoodDeckSF']
- Cleaned the train set, test set and full dataset with the pipeline, and saved them together with the pipeline

**Next Steps**:

Move to Data Study (EDA) Notebook to analyze feature–target relationships and generate visual insights for the dashboard.