## Handling missing data

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy dataset: two columns, each with gaps (None entries show up as NaN in the frame)
data = dict(
    Age=[25, None, 30, 35, None],
    Salary=[50000, 60000, None, 80000, None],
)
df = pd.DataFrame(data=data)

print("Original DataFrame with missing values:\n", df)

Original DataFrame with missing values:
     Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  30.0      NaN
3  35.0  80000.0
4   NaN      NaN


In [2]:
# Record where each value column was originally empty, one boolean
# "<column>_missing" flag per column, so that information is still
# available after the gaps are dropped or filled.
for value_column in ('Age', 'Salary'):
    df[f'{value_column}_missing'] = df[value_column].isna()

print("Original DataFrame with new columns:\n", df)

Original DataFrame with new columns:
     Age   Salary  Age_missing  Salary_missing
0  25.0  50000.0        False           False
1   NaN  60000.0         True           False
2  30.0      NaN        False            True
3  35.0  80000.0        False           False
4   NaN      NaN         True            True


In [None]:
# Listwise deletion: keep only the rows in which no column is missing.
# `df` itself is left untouched; the result is a new frame.
df_dropna = df.dropna(axis='index', how='any')

print("\nDataFrame after dropping missing values:\n", df_dropna)

In [None]:
# Imputation with mean for missing values
#
# Only the numeric value columns are imputed. The boolean *_missing flag
# columns have no gaps to fill, and sending them through a mean imputer
# would hand them back as plain floats (0.0 / 1.0) instead of booleans.
value_columns = df.select_dtypes(include='number').columns

imputer = SimpleImputer(strategy='mean')

# Start from a copy so `df` stays as it was, and so the flag columns and the
# original row index carry over to the result unchanged.
df_imputed = df.copy()
df_imputed[value_columns] = imputer.fit_transform(df[value_columns])

print("\nDataFrame after imputation:\n", df_imputed)