# DSS dataset Imputation with KNN

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score, mean_squared_error

# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised by the libraries used below; consider narrowing the
# filter to specific categories if that matters.
warnings.filterwarnings("ignore")

In [None]:
# Load the DSS table from CSV, then give the column pandas labelled
# "Unnamed: 0" the name "Patients" and promote it to the row index
# (presumably one row per patient; verify against the data file).
dss = pd.read_csv('DSS_Data.csv')
dss = dss.rename(columns={"Unnamed: 0": "Patients"})
dss = dss.set_index('Patients')
dss

In [None]:
# Hide a random 5% of the observed (non-NaN) entries so the imputation can
# later be scored against values that are actually known.

np.random.seed(45)  # fixed seed: the same entries are hidden on every run
df_with_additional_nans = dss.copy()

# (row, column) position of every entry that is currently observed
observed = ~np.isnan(df_with_additional_nans.values)
non_missing_indices = np.argwhere(observed)

# Draw 5% of those positions, without replacement
num_to_remove = int(len(non_missing_indices) * 0.05)
chosen = np.random.choice(len(non_missing_indices), size=num_to_remove, replace=False)
indices_to_remove = non_missing_indices[chosen]

# Boolean mask shaped like the frame, True exactly at the drawn positions
mask_to_remove = np.zeros_like(observed)
mask_to_remove[indices_to_remove[:, 0], indices_to_remove[:, 1]] = True

# Blank out the drawn entries in the working copy (dss itself is untouched)
df_with_additional_nans[mask_to_remove] = np.nan


In [None]:
# Standardise each column of the masked frame with StandardScaler.
# The fitted scaler is kept so the imputed values can be mapped back
# to the original units afterwards.

scaler = StandardScaler()
scaler.fit(df_with_additional_nans)
df_scaled = scaler.transform(df_with_additional_nans)


In [None]:
# Fill the missing entries of the scaled data with KNN imputation,
# then undo the scaling and restore the original row/column labels.

k = 5  # number of neighbours used for each imputed value
imputer = KNNImputer(n_neighbors=k)
df_imputed_scaled = imputer.fit_transform(df_scaled)

df_imputed = pd.DataFrame(
    scaler.inverse_transform(df_imputed_scaled),
    index=dss.index,
    columns=dss.columns,
)


In [None]:
# At the positions that were deliberately hidden, collect the true value
# from the original frame and the value the imputer produced.

actual_values = dss.to_numpy()[mask_to_remove]
imputed_values = df_imputed.to_numpy()[mask_to_remove]

# R^2 between the hidden true values and their imputations

r2 = r2_score(y_true=actual_values, y_pred=imputed_values)
print(f"R^2 value of the imputation: {r2}")


In [None]:
# Root Mean Squared Error (RMSE) over the hidden entries

mse = mean_squared_error(y_true=actual_values, y_pred=imputed_values)
rmse = np.sqrt(mse)

# Normalized RMSE (NRMSE): RMSE divided by the spread (max - min)
# of the true values
range_of_actuals = np.ptp(actual_values)
nrmse = rmse / range_of_actuals

print(f"Root Mean Squared Error (RMSE) of the imputation: {rmse}")
print(f"Normalized Root Mean Squared Error (NRMSE) of the imputation: {nrmse}")


In [None]:
# Tabulate each hidden true value next to its imputed counterpart

comparison_df = pd.DataFrame(dict(Actual=actual_values, Imputed=imputed_values))

print("\nActual vs Imputed Values:")
print(comparison_df)

In [None]:
# Scatter plot of true vs. imputed values for the entries that were hidden.
# Uses actual_values, imputed_values and r2 computed earlier in the notebook.

plt.scatter(actual_values, imputed_values, color='blue', s=20, label='Imputed vs actual')
plt.xlabel('Actual Values')
plt.ylabel('Imputed Values')
plt.title(f'KNN Imputation: Actual vs Imputed Values (R^2 = {r2:.2f})')

# Red dashed y = x reference line: a point lying on it was imputed exactly.
# NumPy reductions are used instead of iterating the arrays with the
# Python built-ins min()/max().

min_val = min(np.min(actual_values), np.min(imputed_values))
max_val = max(np.max(actual_values), np.max(imputed_values))
plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect fit (y = x)')

# Show the grid and hide all four axes spines

ax = plt.gca()
ax.grid(True)
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# Both artists above carry a label, so the legend has entries to display
# (calling legend() with no labelled artists yields an empty legend).
plt.legend()
plt.show()