# Data Loading and Missing Data Imputation

This notebook loads the Adult, Breast Cancer Wisconsin, and Heart Disease datasets, checks for missing values, visualizes missingness patterns, and performs MICE (Multiple Imputation by Chained Equations) imputation using scikit-learn's `IterativeImputer`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# --- 1) LOAD DATA ---
# Registry of the raw files: each dataset key maps to the file path and the
# list of column names handed to the CSV reader for that file.
datasets = dict(
    adult=dict(
        path='adult.data',
        column_names=[
            'age', 'workclass', 'fnlwgt',
            'education', 'education-num', 'marital-status',
            'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week',
            'native-country', 'income',
        ],
    ),
    breast_cancer=dict(
        path='breast-cancer-wisconsin.data',
        column_names=[
            'id',
            'clump_thickness',
            'uniformity_cell_size', 'uniformity_cell_shape',
            'marginal_adhesion', 'single_epithelial_size',
            'bare_nuclei', 'bland_chromatin', 'normal_nucleoli',
            'mitoses',
            'class',
        ],
    ),
    heart_disease=dict(
        path='heart-disease.data',
        column_names=[
            'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
            'restecg', 'thalach', 'exang', 'oldpeak',
            'slope', 'ca', 'thal', 'target',
        ],
    ),
)

# Reader options that are the same for every file.
shared_read_options = dict(
    na_values='?',          # a literal '?' field is parsed as NaN
    skipinitialspace=True,  # ignore spaces that directly follow a delimiter
)

# Read each registered file, report its dimensions, and keep the resulting
# DataFrame under its dataset key.
dataframes = {}
for name in datasets:
    meta = datasets[name]
    df = pd.read_csv(
        meta['path'],
        names=meta['column_names'],
        **shared_read_options,
    )
    print(f"Loaded {name}: {df.shape[0]} rows, {df.shape[1]} columns")
    dataframes[name] = df

In [None]:
# --- 2) MISSING VALUE SUMMARY & PATTERNS ---

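# TODO: placeholder — the steps below are not implemented yet.
# Planned steps for each loaded dataset:
#   a) overall missing counts
#   b) % missing by column
#   c) visualize with missingno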

In [None]:
# --- 3) MICE IMPUTATION ---
# TODO: not yet implemented — example on the Breast Cancer dataset
