# In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def knn_impute(df, k):
    """
    Impute missing numeric values with scikit-learn's ``KNNImputer``.

    Only numeric columns that contain at least one observed value are
    imputed. Non-numeric columns, and numeric columns consisting entirely of
    missing values, are returned unchanged (their NaNs are left for the
    caller to handle). The result keeps the column order and index of ``df``;
    ``df`` itself is not modified.

    Parameters:
    - df: DataFrame containing missing values
    - k: Number of nearest neighbors used for imputation (passed to
      ``KNNImputer`` as ``n_neighbors``)

    Returns:
    - New DataFrame with the same columns and index as ``df`` in which the
      imputable numeric columns hold float values without NaNs
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    # KNNImputer discards features that have no observed value when fitted,
    # which would make its output narrower than the input. Keep such columns
    # out of the fit so the result can be mapped back column by column.
    imputable = [name for name in numeric_columns if df[name].notna().any()]

    result = df.copy()
    if not imputable:
        # Nothing the imputer could work with (it rejects zero-feature input).
        return result

    imputer = KNNImputer(n_neighbors=k)
    imputed_data = imputer.fit_transform(df[imputable])

    # Write the imputed values back in place so the original column order is
    # preserved instead of moving every numeric column to the front.
    for position, name in enumerate(imputable):
        result[name] = imputed_data[:, position]

    return result

# Read the input table that every imputation variant below starts from.
df = pd.read_csv('3_output_class_rem_extra.csv')

# Mean imputation: float64/int64 columns receive their own mean, every other
# column is filled with the constant 0.
df_mean = df.copy()
for column in df_mean.columns:
    series = df_mean[column]
    fill_value = series.mean() if series.dtype in [np.float64, np.int64] else 0
    df_mean[column] = series.fillna(fill_value)
# Safety net: a column with no observed values has a NaN mean and is still
# empty at this point.
df_mean.fillna(0, inplace=True)
df_mean.to_csv('4_out_csvs_classification/output_mean_imputed.csv', index=False)

# Median imputation: same scheme as a per-column statistic fill, using the
# column median for float64/int64 columns and the constant 0 elsewhere.
df_median = df.copy()
for column in df_median.columns:
    current = df_median[column]
    if current.dtype in [np.float64, np.int64]:
        replacement = current.median()
    else:
        replacement = 0
    df_median[column] = current.fillna(replacement)
# Catch anything the per-column pass could not fill (e.g. a NaN median).
df_median.fillna(0, inplace=True)
df_median.to_csv('4_out_csvs_classification/output_median_imputed.csv', index=False)

# Mode imputation: object-dtype columns are filled with their most frequent
# value (the first entry returned by mode()); all other columns get 0.
df_mode = df.copy()
for column in df_mode.columns:
    values = df_mode[column]
    if values.dtype != object:
        df_mode[column] = values.fillna(0)
        continue
    df_mode[column] = values.fillna(values.mode()[0])
# Final pass so that no missing value survives.
df_mode.fillna(0, inplace=True)
df_mode.to_csv('4_out_csvs_classification/output_mode_imputed.csv', index=False)

# Forward fill: propagate the previous row's value downwards; leading gaps
# that have no predecessor are set to 0. `df` itself is left untouched.
df_ffill = df.ffill().fillna(0)
df_ffill.to_csv('4_out_csvs_classification/output_ffill_imputed.csv', index=False)

# Backward fill: propagate the next row's value upwards; trailing gaps that
# have no successor are set to 0.
df_bfill = df.bfill().fillna(0)
df_bfill.to_csv('4_out_csvs_classification/output_bfill_imputed.csv', index=False)

# Linear Interpolation and fill remaining NaNs with 0
df_interpolated = df.copy()
df_interpolated = df_interpolated.infer_objects()  # Infer types before interpolation
# Interpolate the numeric columns only. Calling DataFrame.interpolate on a
# frame that still contains object-dtype (e.g. string) columns is deprecated
# in pandas 2.x and announced to raise in a later release; infer_objects()
# cannot help when those columns really hold text. Non-numeric columns are
# therefore left as they are and handled by the zero-fill below.
numeric_columns = df_interpolated.select_dtypes(include=[np.number]).columns
if len(numeric_columns) > 0:
    df_interpolated[numeric_columns] = df_interpolated[numeric_columns].interpolate(method='linear')
# Leading gaps (nothing to interpolate from) and all non-numeric gaps end up here.
df_interpolated.fillna(0, inplace=True)  # Fill any remaining NaNs with 0
df_interpolated.to_csv('4_out_csvs_classification/output_interpolated.csv', index=False)

# KNN imputation with k neighbours; whatever knn_impute leaves missing is
# replaced by 0 before the result is written out.
k = 3
df_knn_imputed = knn_impute(df=df, k=k).fillna(0)
df_knn_imputed.to_csv(
    '4_out_csvs_classification/output_knn_imputed_custom.csv',
    index=False,
)

# Iterative Imputation (MICE)
# Only the numeric columns are imputed and written to this output file.
df_numeric_for_iter = df.select_dtypes(include=[np.number])

# IterativeImputer discards features that have no observed value when fitted,
# so its output would be narrower than the input and could not be labelled
# with the original column names. Fit on the columns that have data and add
# the empty ones back afterwards.
iter_observed_columns = [
    name for name in df_numeric_for_iter.columns
    if df_numeric_for_iter[name].notna().any()
]

iter_imputer = IterativeImputer()
if iter_observed_columns:
    df_iter_imputed_values = iter_imputer.fit_transform(df_numeric_for_iter[iter_observed_columns])
else:
    # Nothing to fit on; keep an empty (rows x 0) result so the steps below still work.
    df_iter_imputed_values = np.empty((len(df_numeric_for_iter), 0))

df_iter_imputed = pd.DataFrame(
    df_iter_imputed_values,
    columns=iter_observed_columns,
    index=df_numeric_for_iter.index,
)
# Restore the original numeric column set and order; columns that could not be
# imputed come back as NaN and are set to 0 like in the other variants.
df_iter_imputed = df_iter_imputed.reindex(columns=df_numeric_for_iter.columns)
df_iter_imputed.fillna(0, inplace=True)  # Fill any remaining NaNs with 0

# The reindex above guarantees the output has exactly the numeric columns of
# the input, so the file can always be written.
df_iter_imputed.to_csv('4_out_csvs_classification/output_iterative_imputed.csv', index=False)

# Custom Imputation (e.g., fill with a specific value)
def custom_imputation(value, fill_value=-1):
    """
    Replace a missing scalar with a sentinel value.

    Intended for element-wise use, e.g. ``series.apply(custom_imputation)``.

    Parameters:
    - value: A single cell value; anything ``pd.isnull`` reports as missing
      (``None``, ``NaN``, ``NaT``) is replaced
    - fill_value: Value returned in place of a missing ``value``
      (default: -1)

    Returns:
    - ``fill_value`` if ``value`` is missing, otherwise ``value`` unchanged
    """
    return fill_value if pd.isnull(value) else value

# Apply the element-wise custom rule to every column of a fresh copy.
df_custom = df.copy()
for column in df_custom.columns:
    original_values = df_custom[column]
    df_custom[column] = original_values.apply(custom_imputation)
# Any value still missing after the custom rule is set to 0.
df_custom.fillna(0, inplace=True)
df_custom.to_csv(
    '4_out_csvs_classification/output_custom_imputed.csv',
    index=False,
)


# Notebook output residue (source line echoed by a warning), kept as a comment:
#   df_interpolated.interpolate(method='linear', inplace=True)
