### Garbage In, Garbage Out (GIGO): Cleaning Missing Data
**Description**: Load a dataset (e.g., Titanic dataset) and identify missing values. Use
appropriate techniques to handle these missing values.

In [3]:
import pandas as pd
import seaborn as sns

def load_data():
    try:
        df = sns.load_dataset('titanic')
        if df.empty:
            raise ValueError("Loaded DataFrame is empty.")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def clean_missing_values(df):
    if df is None:
        print("No data to clean.")
        return None

    # Check DataFrame type
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input is not a pandas DataFrame")

    # Identify missing values
    print("Missing values before cleaning:")
    print(df.isnull().sum())

    # Separate numerical and categorical columns safely
    num_cols = df.select_dtypes(include=['number']).columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns

    # Handle empty columns or all NaN columns
    for col in num_cols:
        if df[col].dropna().empty:
            print(f"Warning: Numerical column '{col}' has no valid data, skipping imputation.")
            continue
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
        print(f"Filled missing in numerical column '{col}' with median: {median_val}")

    for col in cat_cols:
        if df[col].dropna().empty:
            print(f"Warning: Categorical column '{col}' has no valid data, skipping imputation.")
            continue
        mode_val = df[col].mode()[0]
        df[col].fillna(mode_val, inplace=True)
        print(f"Filled missing in categorical column '{col}' with mode: {mode_val}")

    print("Missing values after cleaning:")
    print(df.isnull().sum())

    return df

# Simple test function
def test_clean_missing_values():
    print("\nRunning tests...")

    # Test 1: Normal dataset
    df = load_data()
    clean_df = clean_missing_values(df)
    assert clean_df is not None
    assert clean_df.isnull().sum().sum() == 0  # no missing values remain

    # Test 2: Empty DataFrame
    empty_df = pd.DataFrame()
    try:
        clean_missing_values(empty_df)
    except Exception as e:
        print(f"Handled empty DataFrame test: {e}")

    # Test 3: Non-DataFrame input
    try:
        clean_missing_values("not a dataframe")
    except Exception as e:
        print(f"Handled non-DataFrame input test: {e}")

    # Test 4: Column with all NaNs
    df2 = pd.DataFrame({
        'num_col': [None, None, None],
        'cat_col': [None, None, None]
    })
    clean_missing_values(df2)

    print("Tests completed.\n")

if __name__ == "__main__":
    df = load_data()
    clean_df = clean_missing_values(df)
    test_clean_missing_values()

Missing values before cleaning:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
Filled missing in numerical column 'survived' with median: 0.0
Filled missing in numerical column 'pclass' with median: 3.0
Filled missing in numerical column 'age' with median: 28.0
Filled missing in numerical column 'sibsp' with median: 0.0
Filled missing in numerical column 'parch' with median: 0.0
Filled missing in numerical column 'fare' with median: 14.4542
Filled missing in categorical column 'sex' with mode: male
Filled missing in categorical column 'embarked' with mode: S
Filled missing in categorical column 'class' with mode: Third
Filled missing in categorical column 'who' with mode: man
Filled missing in categorical column 'deck' with mode: C
Filled 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

Initial DataFrame shape: (891, 15)

Missing values per column:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
Filled missing in 'survived' with median: 0.0
Filled missing in 'pclass' with median: 3.0
Filled missing in 'age' with median: 28.0
Filled missing in 'sibsp' with median: 0.0
Filled missing in 'parch' with median: 0.0
Filled missing in 'fare' with median: 14.4542
Filled missing in 'sex' with mode: male
Filled missing in 'embarked' with mode: S
Filled missing in 'class' with mode: Third
Filled missing in 'who' with mode: man
Filled missing in 'deck' with mode: C
Filled missing in 'embark_town' with mode: Southampton
Filled missing in 'alive' with mode: no

Missing values after cleaning:
survived       0
pclass         0
sex         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

In [None]:
# Write your code from here


import pandas as pd
import seaborn as sns

# Load Titanic dataset from seaborn
df = sns.load_dataset('titanic')

print("Initial DataFrame shape:", df.shape)

# Step 1: Identify missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Step 2: Simple cleaning strategy

# For numerical columns, fill missing values with median
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols:
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)
    print(f"Filled missing in '{col}' with median: {median_val}")

# For categorical columns, fill missing values with mode
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    mode_val = df[col].mode()[0]
    df[col].fillna(mode_val, inplace=True)
    print(f"Filled missing in '{col}' with mode: {mode_val}")

# Step 3: Verify no missing values remain
print("\nMissing values after cleaning:")
print(df.isnull().sum())

print("\nCleaned DataFrame shape:", df.shape)
