In [None]:
%config InlineBackend.figure_format = 'retina'

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np

<div class="alert alert-danger">
  You must have the MissForest and lightgbm packages installed: <br>
   ! pip install MissForest <br>
   ! pip install lightgbm
</div>

# 1) Loading data

In [None]:
# Download the German Credit CSV. The file's own header row is discarded in favour of
# the names given here; the first column (named 'Index') is dropped immediately.
url = 'https://raw.githubusercontent.com/SaravananJaichandar/Credit-Risk-Model/refs/heads/master/german_credit_data.csv'
df = pd.read_csv(
    url,
    header=0,
    names=[
        'Index', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
        'Checking account', 'Credit amount', 'Duration', 'Purpose',
    ],
).iloc[:, 1:]

# Swap the integer job codes for readable labels so that `Job` is handled as a
# categorical (object) column by the rest of the notebook.
job_mapping = {
    0: 'unemployed/unskilled',
    1: 'unskilled',
    2: 'skilled',
    3: 'highly skilled',
}
df['Job'] = df['Job'].map(job_mapping).astype('object')

The **German Credit Data** dataset contains information about individuals and their financial background, used to determine their **creditworthiness**. It includes both **numerical** and **categorical** features related to personal characteristics, financial status, and loan information. 

<b>Features</b>

- `Age`: Age of the individual (numeric).
- `Sex`: Gender of the individual (`male` or `female`).
- `Job`: Job category, coded in the raw file as 0 = unemployed/unskilled, 1 = unskilled, 2 = skilled, 3 = highly skilled; the loading cell above replaces these codes with their text labels.
- `Housing`: Housing status (`own`, `rent`, or `free`).
- `Saving accounts`: Category indicating amount in savings account (`little`, `moderate`, `quite rich`, `rich`, or `NaN`).
- `Checking account`: Category indicating balance in checking account (`little`, `moderate`, `rich`, or `NaN`).
- `Credit amount`: Amount of the loan requested (numeric).
- `Duration`: Duration of the loan in months (numeric).
- `Purpose`: Reason for the loan (`car`, `furniture/equipment`, `radio/TV`, `education`, `business`, etc.).

In [None]:
df.info()

In [None]:
df.head()

Let's randomly introduce some additional `NaN` values into the numerical columns `Age` and `Credit amount`.

In [None]:
# Keep the data as it is *before* the artificial deletions, so that imputed values can
# later be scored against the true ones. Run this cell once per kernel session:
# re-running it would snapshot the already-blanked frame.
df_original = df.copy()

N_MISSING = 20                   # cells to blank out in each column
rng = np.random.default_rng(42)  # fixed seed -> same cells on every Restart & Run All

for col in ["Age", "Credit amount"]:
    # Integer columns cannot hold NaN; cast explicitly rather than relying on pandas
    # to upcast implicitly when the missing value is written.
    df[col] = df[col].astype("float64")
    # Sample row labels without replacement so exactly N_MISSING distinct rows are
    # hit (np.random.randint could draw the same row twice), and draw them from the
    # actual index instead of assuming the frame has 1000 rows.
    rows_to_blank = rng.choice(df.index, size=N_MISSING, replace=False)
    df.loc[rows_to_blank, col] = np.nan

In [None]:
df.info()

# 2) Imputing missing values

## 2.1) Central tendency

In [None]:
# Split the feature names by dtype: object/category columns are imputed as
# categorical, numeric columns as numerical.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include='number').columns.tolist()

In [None]:
categorical_cols

In [None]:
numerical_cols

In [None]:
df.head()

The following are the mean and mode values:

In [None]:
df[numerical_cols].mean()

In [None]:
df[categorical_cols].mode()

Let's impute missing values!

In [None]:
from sklearn.impute import SimpleImputer

We use the mean for numerical variables and the mode for the categorical ones. 

In [None]:
# One imputer per variable type; both fill cells that are NaN.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')           # numerical columns
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')  # categorical columns

In [None]:
# Work on a copy so that `df` keeps its missing values for the other imputation
# methods demonstrated further down.
df_central_tendency_imputed = df.copy()

In [None]:
# fit_transform returns a bare NumPy array. Write it straight back into the selected
# columns: this is positional and keeps the frame's own index. Wrapping the array in a
# new DataFrame first would give it a fresh 0..n-1 index, and the index-aligned
# assignment would silently produce NaN wherever that differs from this frame's index.
df_central_tendency_imputed[numerical_cols] = mean_imputer.fit_transform(
    df_central_tendency_imputed[numerical_cols])

In [None]:
# Same pattern as for the numerical columns: assign the imputer's array output
# positionally instead of re-wrapping it in a DataFrame with a fresh 0..n-1 index,
# which would be index-aligned on assignment and could silently introduce NaN.
df_central_tendency_imputed[categorical_cols] = mode_imputer.fit_transform(
    df_central_tendency_imputed[categorical_cols])

In [None]:
df_central_tendency_imputed.head()

## 2.2) k-Nearest Neighbors

In [None]:
from sklearn.impute import KNNImputer

### First, a toy example.

In [None]:
# Toy data set: 10 observations, written one row per observation.
# Three ages and one fare are missing.
toy_rows = [
    # (Age,   Pclass, Fare,   Survived)
    (25,      1,      50,     0),
    (np.nan,  2,      60,     1),
    (30,      1,      np.nan, 1),
    (32,      3,      40,     0),
    (22,      2,      55,     1),
    (29,      1,      48,     1),
    (np.nan,  2,      52,     0),
    (24,      3,      42,     0),
    (28,      1,      49,     1),
    (np.nan,  1,      50,     0),
]
toy_example = pd.DataFrame(toy_rows, columns=['Age', 'Pclass', 'Fare', 'Survived'])

In [None]:
toy_example

In [None]:
# Each missing cell is filled using the 2 nearest neighbours of its row.
imputer = KNNImputer(n_neighbors=2)

# The imputer returns a plain array, so put the column names back afterwards.
toy_filled_values = imputer.fit_transform(toy_example)
toy_example_imputed = pd.DataFrame(toy_filled_values, columns=toy_example.columns)

In [None]:
toy_example_imputed

### Larger data set

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Fresh imputer for the full data set, with the same number of neighbours as in the toy example.
imputer = KNNImputer(n_neighbors=2)

In [None]:
# imputer.fit_transform(df)  # expected to fail: df still contains string-valued (object) columns, so we restrict it to the numerical columns below

In [None]:
numerical_cols

In [None]:
df[numerical_cols].info()

In [None]:
imputer.fit_transform(df[numerical_cols])

In [None]:
# Re-wrap the imputer's array output with the column names AND the original row index.
# The categorical columns are joined back on the index in a later cell, so the two
# frames must share it; without `index=` the result would always get a fresh 0..n-1 index.
df_knn_imputed = pd.DataFrame(
    imputer.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    index=df.index,
)

In [None]:
df[categorical_cols].head()

In [None]:
# Attach the (untouched) categorical columns to the imputed numerical ones.
# Selecting `numerical_cols` first makes the cell safe to re-run: otherwise a second
# execution would try to join columns that are already present and raise a ValueError.
df_knn_imputed = df_knn_imputed[numerical_cols].join(df[categorical_cols])

In [None]:
df_knn_imputed.head()

In [None]:
df_knn_imputed.info()

<div class="alert alert-info">
  <strong>Assignment 1:</strong> How well did KNN imputation perform? To evaluate its accuracy, compare the imputed values to the original ones using Mean Squared Error (MSE).
</div>


## 2.3) missForest 🌳

In [None]:
# Uncomment the lines below if the packages are not installed yet
# (%pip installs into the environment of the running kernel):
# %pip install missforest
# %pip install lightgbm

In [None]:
from missforest import MissForest 

In [None]:
df.head()

In [None]:
df.info()

`missForest` is a bit picky regarding data representation, so we will first convert the categorical variables into an integer representation.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Encode every categorical column as numeric codes (one code per distinct label).
# NOTE: the codes overwrite the text labels in `df` itself. Re-running this cell would
# re-fit the encoder on the already-encoded numbers, so reload `df` before doing that.
encoder = OrdinalEncoder()
encoded_categories = encoder.fit_transform(df[categorical_cols])
df[categorical_cols] = encoded_categories

In [None]:
df.head()

In [None]:
# For every categorical column, list which integer code the fitted encoder uses
# for each label.
for col, labels in zip(categorical_cols, encoder.categories_):
    print(f"Mapping for '{col}':")
    for code, label in enumerate(labels):
        if pd.isna(label):
            # A missing-value entry among the categories is not a real label: skip it.
            continue
        print(f"    {label} -> {code}")

<div class="alert alert-block alert-warning">
  Note that although they are numbers, we treat them as categorical variables!
</div>


In [None]:
# Name the columns that hold categories: they are numeric codes by now, so the imputer
# cannot recognise them as categorical from their dtype.
imputer = MissForest(categorical=categorical_cols)

In [None]:
# Impute numerical and (encoded) categorical columns in one go.
# Despite the `_array` suffix, the result is used as a DataFrame by the cells below
# (`.head()`, `.info()`).
df_imputed_array = imputer.fit_transform(df)

In [None]:
df_imputed_array.head()

In [None]:
df_imputed_array.info()

<div class="alert alert-info">
  <strong>Assignment 2:</strong> How well did missForest imputation perform? Compare the imputed values to the original (before introducing missing values) to measure the true performance.
</div>

<div class="alert alert-info">
  <strong>Assignment 3:</strong> Identify variables with a natural order, convert them into numeric format reflecting that order, and compare how the imputation performs before and after this transformation.
</div>

<div class="alert alert-info">
  Further reading: a useful library that we do not use here is <a href="https://github.com/iskandr/fancyimpute">https://github.com/iskandr/fancyimpute</a>.
</div>