In [1]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values like employees.csv ).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.

import pandas as pd

# Create a sample DataFrame with missing values
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Age': [25, 30, None, 40, None],
    'Salary': [50000, 60000, 70000, None, 80000],
    'Department': ['HR', 'IT', 'IT', 'HR', None]
}

df = pd.DataFrame(data)

# Task 1: Dropping Missing Data
print("Original DataFrame:")
print(df)

# Dropping rows with missing values
df_dropped = df.dropna()

# Display and save the result of dropping missing data
print("\nDataFrame after dropping missing values:")
print(df_dropped)

# Task 2: Imputation using Mean (for numerical columns)
df_imputed_mean = df.copy()  # Copy original df to preserve it for other tasks

# Filling missing values for numerical columns with the column mean
numerical_cols = df_imputed_mean.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    df_imputed_mean[col].fillna(df_imputed_mean[col].mean(), inplace=True)

# Display the result of mean imputation
print("\nDataFrame after imputing missing values with mean:")
print(df_imputed_mean)

# Task 3: Imputation using Median for numerical columns and Mode for categorical columns
df_imputed_median_mode = df.copy()

# Impute missing values for numerical columns with median
for col in numerical_cols:
    df_imputed_median_mode[col].fillna(df_imputed_median_mode[col].median(), inplace=True)

# Impute missing values for categorical columns with mode
categorical_cols = df_imputed_median_mode.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df_imputed_median_mode[col].fillna(df_imputed_median_mode[col].mode()[0], inplace=True)

# Display the result of median and mode imputation
print("\nDataFrame after imputing missing values with median (numerical) and mode (categorical):")
print(df_imputed_median_mode)





Original DataFrame:
   EmployeeID   Age   Salary Department
0           1  25.0  50000.0         HR
1           2  30.0  60000.0         IT
2           3   NaN  70000.0         IT
3           4  40.0      NaN         HR
4           5   NaN  80000.0       None

DataFrame after dropping missing values:
   EmployeeID   Age   Salary Department
0           1  25.0  50000.0         HR
1           2  30.0  60000.0         IT

DataFrame after imputing missing values with mean:
   EmployeeID        Age   Salary Department
0           1  25.000000  50000.0         HR
1           2  30.000000  60000.0         IT
2           3  31.666667  70000.0         IT
3           4  40.000000  65000.0         HR
4           5  31.666667  80000.0       None

DataFrame after imputing missing values with median (numerical) and mode (categorical):
   EmployeeID   Age   Salary Department
0           1  25.0  50000.0         HR
1           2  30.0  60000.0         IT
2           3  30.0  70000.0         IT
3      

In [2]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn .
# - Impute missing data based on neighbors' information.


import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Create a sample DataFrame with missing values
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Age': [25, 30, None, 40, None],
    'Salary': [50000, 60000, 70000, None, 80000],
    'Department': ['HR', 'IT', 'IT', 'HR', None]
}

df = pd.DataFrame(data)

# Task 4: ML-based Imputation with Simple Imputer (Mean Strategy)
# Impute missing values using SimpleImputer with mean strategy for numerical columns
simple_imputer = SimpleImputer(strategy='mean')
df_imputed_simple = df.copy()

# Apply SimpleImputer to numerical columns only
numerical_cols = df_imputed_simple.select_dtypes(include=['float64', 'int64']).columns
df_imputed_simple[numerical_cols] = simple_imputer.fit_transform(df_imputed_simple[numerical_cols])

print("\nDataFrame after imputing missing values using Simple Imputer (Mean):")
print(df_imputed_simple)

# Task 5: Imputation using a Regression Model
# Use Linear Regression to predict missing 'Age' based on 'Salary'
df_imputed_regression = df.copy()

# Create a dataset with complete cases for training
train_data = df_imputed_regression.dropna(subset=['Age'])  # Remove rows where 'Age' is missing
X_train = train_data[['Salary']]  # Features: Salary
y_train = train_data['Age']  # Target: Age

# Train a Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict missing 'Age' values using the trained model
X_missing = df_imputed_regression[df_imputed_regression['Age'].isnull()][['Salary']]
predicted_age = regressor.predict(X_missing)

# Fill the missing 'Age' values with the predictions
df_imputed_regression.loc[df_imputed_regression['Age'].isnull(), 'Age'] = predicted_age

print("\nDataFrame after imputing missing 'Age' values using Regression Model:")
print(df_imputed_regression)

# Task 6: K-Nearest Neighbors Imputation
# Impute missing values using KNNImputer for numerical columns
knn_imputer = KNNImputer(n_neighbors=2)
df_imputed_knn = df.copy()

# Apply KNNImputer to the entire DataFrame (numerical and categorical columns)
df_imputed_knn[numerical_cols] = knn_imputer.fit_transform(df_imputed_knn[numerical_cols])

# For categorical columns, we can impute with the mode (most frequent value)
df_imputed_knn['Department'].fillna(df_imputed_knn['Department'].mode()[0], inplace=True)

print("\nDataFrame after imputing missing values using KNN Imputer:")
print(df_imputed_knn)




DataFrame after imputing missing values using Simple Imputer (Mean):
   EmployeeID        Age   Salary Department
0         1.0  25.000000  50000.0         HR
1         2.0  30.000000  60000.0         IT
2         3.0  31.666667  70000.0         IT
3         4.0  40.000000  65000.0         HR
4         5.0  31.666667  80000.0       None


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values