In [1]:
%config InlineBackend.figure_format = 'retina'

In [2]:
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
import numpy as np

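<div class="alert alert-danger">
  This notebook requires the <code>MissForest</code> and <code>lightgbm</code> packages. Install them into the kernel's environment before running it: <br>
   %pip install MissForest <br>
   %pip install lightgbm
</div>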

# 1) Loading data

In [5]:
# Source CSV; its first column is a running row index that we do not need.
url = 'https://raw.githubusercontent.com/SaravananJaichandar/Credit-Risk-Model/refs/heads/master/german_credit_data.csv'
column_names = ['Index', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
                'Checking account', 'Credit amount', 'Duration', 'Purpose']

# Replace the file's own header with `column_names`, then discard the index column.
df = pd.read_csv(url, header=0, names=column_names).drop(columns='Index')

# The raw file codes `Job` as 0-3; swap the codes for readable labels.
job_mapping = {0: 'unemployed/unskilled', 1: 'unskilled', 2: 'skilled', 3: 'highly skilled'}
df['Job'] = df['Job'].map(job_mapping).astype('object')

The **German Credit Data** dataset contains information about individuals and their financial background, used to determine their **creditworthiness**. It includes both **numerical** and **categorical** features related to personal characteristics, financial status, and loan information. 

<b>Features</b>

- `Age`: Age of the individual (numeric).
- `Sex`: Gender of the individual (`male` or `female`).
- `Job`: Job category (coded 0–3 in the raw file and mapped above to `unemployed/unskilled`, `unskilled`, `skilled`, `highly skilled`).
- `Housing`: Housing status (`own`, `rent`, or `free`).
- `Saving accounts`: Category indicating amount in savings account (`little`, `moderate`, `quite rich`, `rich`, or `NaN`).
- `Checking account`: Category indicating balance in checking account (`little`, `moderate`, `rich`, or `NaN`).
- `Credit amount`: Amount of the loan requested (numeric).
- `Duration`: Duration of the loan in months (numeric).
- `Purpose`: Reason for the loan (`car`, `furniture/equipment`, `radio/TV`, `education`, `business`, etc.).

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   object
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
dtypes: int64(3), object(6)
memory usage: 70.4+ KB


In [7]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,male,skilled,own,,little,1169,6,radio/TV
1,22,female,skilled,own,little,moderate,5951,48,radio/TV
2,49,male,unskilled,own,little,,2096,12,education
3,45,male,skilled,free,little,little,7882,42,furniture/equipment
4,53,male,skilled,free,little,little,4870,24,car


Let's introduce some more `NaN` into the data, randomly. 

In [8]:
# Mask exactly N_MISSING distinct rows in each of the two numeric columns.
#  - A seeded generator makes the notebook reproducible on "Restart & Run All"
#    (and makes this cell idempotent: re-running masks the same rows again).
#  - Sampling without replacement guarantees N_MISSING NaNs per column;
#    `randint` can draw the same row twice and mask fewer values than intended.
#  - The columns are cast to float first, because NaN cannot be stored in an
#    int64 column and implicit upcasting on assignment is deprecated in pandas.
N_MISSING = 20
rng = np.random.default_rng(42)
for col in ("Age", "Credit amount"):
    df[col] = df[col].astype("float64")
    rows = rng.choice(df.index.to_numpy(), size=N_MISSING, replace=False)
    df.loc[rows, col] = np.nan

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               980 non-null    float64
 1   Sex               1000 non-null   object 
 2   Job               1000 non-null   object 
 3   Housing           1000 non-null   object 
 4   Saving accounts   817 non-null    object 
 5   Checking account  606 non-null    object 
 6   Credit amount     980 non-null    float64
 7   Duration          1000 non-null   int64  
 8   Purpose           1000 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 70.4+ KB


# 2) Imputing missing values

## 2.1) Central tendency

In [10]:
# Partition the column names by dtype: text-like columns vs. numeric columns.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

In [11]:
categorical_cols

['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']

In [12]:
numerical_cols

['Age', 'Credit amount', 'Duration']

In [13]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67.0,male,skilled,own,,little,1169.0,6,radio/TV
1,22.0,female,skilled,own,little,moderate,5951.0,48,radio/TV
2,49.0,male,unskilled,own,little,,2096.0,12,education
3,45.0,male,skilled,free,little,little,7882.0,42,furniture/equipment
4,53.0,male,skilled,free,little,little,4870.0,24,car


The following are the mean and mode values:

In [14]:
df[numerical_cols].mean()

Age                35.525510
Credit amount    3280.531633
Duration           20.903000
dtype: float64

In [15]:
df[categorical_cols].mode()

Unnamed: 0,Sex,Job,Housing,Saving accounts,Checking account,Purpose
0,male,skilled,own,little,little,car


Let's impute missing values!

In [16]:
from sklearn.impute import SimpleImputer

We use the mean for numerical variables and the mode for the categorical ones. 

In [17]:
# One imputer per variable type: the column mean for numeric features ...
mean_imputer = SimpleImputer(strategy='mean')
# ... and the most frequent value (the mode) for categorical features.
mode_imputer = SimpleImputer(strategy='most_frequent')

In [18]:
# Work on a copy so that `df` keeps its missing values for the other
# imputation methods demonstrated later in the notebook.
df_central_tendency_imputed = df.copy()

In [19]:
# Fill missing numeric values with the column mean and write the result back.
# SimpleImputer returns a bare NumPy array, so the original row index is
# re-attached explicitly: pandas aligns on index labels during assignment, and
# a default 0..n-1 index would silently produce NaNs if the frame were ever
# filtered or re-indexed before this cell.
df_central_tendency_imputed[numerical_cols] = pd.DataFrame(
    mean_imputer.fit_transform(df_central_tendency_imputed[numerical_cols]),
    columns=numerical_cols,
    index=df_central_tendency_imputed.index,
)

In [20]:
# Fill missing categorical values with the column mode and write the result back.
# SimpleImputer returns a bare NumPy array, so the original row index is
# re-attached explicitly: pandas aligns on index labels during assignment, and
# a default 0..n-1 index would silently produce NaNs if the frame were ever
# filtered or re-indexed before this cell.
df_central_tendency_imputed[categorical_cols] = pd.DataFrame(
    mode_imputer.fit_transform(df_central_tendency_imputed[categorical_cols]),
    columns=categorical_cols,
    index=df_central_tendency_imputed.index,
)

In [21]:
df_central_tendency_imputed.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67.0,male,skilled,own,little,little,1169.0,6.0,radio/TV
1,22.0,female,skilled,own,little,moderate,5951.0,48.0,radio/TV
2,49.0,male,unskilled,own,little,little,2096.0,12.0,education
3,45.0,male,skilled,free,little,little,7882.0,42.0,furniture/equipment
4,53.0,male,skilled,free,little,little,4870.0,24.0,car


## 2.2) k-Nearest Neighbors

In [22]:
from sklearn.impute import KNNImputer

### First, a toy example.

In [23]:
# Toy dataset: 10 observations, with gaps in `Age` (3 rows) and `Fare` (1 row).
toy_ages = [25, np.nan, 30, 32, 22, 29, np.nan, 24, 28, np.nan]
toy_classes = [1, 2, 1, 3, 2, 1, 2, 3, 1, 1]
toy_fares = [50, 60, np.nan, 40, 55, 48, 52, 42, 49, 50]
toy_survived = [0, 1, 1, 0, 1, 1, 0, 0, 1, 0]

toy_example = pd.DataFrame(
    {
        'Age': toy_ages,
        'Pclass': toy_classes,
        'Fare': toy_fares,
        'Survived': toy_survived,
    }
)

In [24]:
toy_example

Unnamed: 0,Age,Pclass,Fare,Survived
0,25.0,1,50.0,0
1,,2,60.0,1
2,30.0,1,,1
3,32.0,3,40.0,0
4,22.0,2,55.0,1
5,29.0,1,48.0,1
6,,2,52.0,0
7,24.0,3,42.0,0
8,28.0,1,49.0,1
9,,1,50.0,0


In [25]:
# KNN imputer that fills each gap from the 2 nearest rows.
imputer = KNNImputer(n_neighbors=2)

# Fit on the toy data and impute it in one step; the result is a NumPy array,
# so wrap it back into a DataFrame with the original column names.
toy_imputed_values = imputer.fit_transform(toy_example)
toy_example_imputed = pd.DataFrame(toy_imputed_values, columns=toy_example.columns)

In [26]:
toy_example_imputed

Unnamed: 0,Age,Pclass,Fare,Survived
0,25.0,1.0,50.0,0.0
1,26.0,2.0,60.0,1.0
2,30.0,1.0,54.0,1.0
3,32.0,3.0,40.0,0.0
4,22.0,2.0,55.0,1.0
5,29.0,1.0,48.0,1.0
6,27.5,2.0,52.0,0.0
7,24.0,3.0,42.0,0.0
8,28.0,1.0,49.0,1.0
9,27.5,1.0,50.0,0.0


### Larger data set

In [27]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67.0,male,skilled,own,,little,1169.0,6,radio/TV
1,22.0,female,skilled,own,little,moderate,5951.0,48,radio/TV
2,49.0,male,unskilled,own,little,,2096.0,12,education
3,45.0,male,skilled,free,little,little,7882.0,42,furniture/equipment
4,53.0,male,skilled,free,little,little,4870.0,24,car


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               980 non-null    float64
 1   Sex               1000 non-null   object 
 2   Job               1000 non-null   object 
 3   Housing           1000 non-null   object 
 4   Saving accounts   817 non-null    object 
 5   Checking account  606 non-null    object 
 6   Credit amount     980 non-null    float64
 7   Duration          1000 non-null   int64  
 8   Purpose           1000 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 70.4+ KB


In [29]:
# Fresh KNN imputer (2 neighbours) for the real data set.
# NOTE(review): neighbours are found with a Euclidean-type distance on the raw
# features, so `Credit amount` (thousands) will dominate `Age` and `Duration`
# unless the columns are scaled first — consider standardising before imputing.
imputer = KNNImputer(n_neighbors=2)

In [30]:
# KNNImputer only accepts numeric input, so fitting it on the full frame
# (which still contains string columns) raises an error:
# imputer.fit_transform(df)

In [31]:
numerical_cols

['Age', 'Credit amount', 'Duration']

In [32]:
df[numerical_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            980 non-null    float64
 1   Credit amount  980 non-null    float64
 2   Duration       1000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 23.6 KB


In [33]:
imputer.fit_transform(df[numerical_cols])

array([[  67., 1169.,    6.],
       [  22., 5951.,   48.],
       [  49., 2096.,   12.],
       ...,
       [  38.,  804.,   12.],
       [  23., 1845.,   45.],
       [  27., 4576.,   45.]])

In [34]:
# KNN-impute the numeric columns only and wrap the resulting NumPy array in a
# DataFrame. The index of `df` is re-attached explicitly so that later
# index-based operations (such as joining the categorical columns back on)
# line up with the original rows even if `df` does not have a default index.
df_knn_imputed = pd.DataFrame(
    imputer.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    index=df.index,
)

In [35]:
df[categorical_cols].head()

Unnamed: 0,Sex,Job,Housing,Saving accounts,Checking account,Purpose
0,male,skilled,own,,little,radio/TV
1,female,skilled,own,little,moderate,radio/TV
2,male,unskilled,own,little,,education
3,male,skilled,free,little,little,furniture/equipment
4,male,skilled,free,little,little,car


In [36]:
# Attach the (still un-imputed) categorical columns to the KNN-imputed numeric
# ones. Selecting `numerical_cols` first keeps this cell idempotent: a plain
# `df_knn_imputed.join(...)` raises on re-run because the categorical columns
# are already present in `df_knn_imputed`.
df_knn_imputed = df_knn_imputed[numerical_cols].join(df[categorical_cols])

In [37]:
df_knn_imputed.head()

Unnamed: 0,Age,Credit amount,Duration,Sex,Job,Housing,Saving accounts,Checking account,Purpose
0,67.0,1169.0,6.0,male,skilled,own,,little,radio/TV
1,22.0,5951.0,48.0,female,skilled,own,little,moderate,radio/TV
2,49.0,2096.0,12.0,male,unskilled,own,little,,education
3,45.0,7882.0,42.0,male,skilled,free,little,little,furniture/equipment
4,53.0,4870.0,24.0,male,skilled,free,little,little,car


In [38]:
df_knn_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               1000 non-null   float64
 1   Credit amount     1000 non-null   float64
 2   Duration          1000 non-null   float64
 3   Sex               1000 non-null   object 
 4   Job               1000 non-null   object 
 5   Housing           1000 non-null   object 
 6   Saving accounts   817 non-null    object 
 7   Checking account  606 non-null    object 
 8   Purpose           1000 non-null   object 
dtypes: float64(3), object(6)
memory usage: 70.4+ KB


<div class="alert alert-info">
  <strong>Assignment 1:</strong> How well did KNN imputation perform? To evaluate its accuracy, compare the imputed values to the original ones using Mean Squared Error (MSE).
</div>


## 2.3) missForest 🌳

In [39]:
from missforest import MissForest 

In [40]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67.0,male,skilled,own,,little,1169.0,6,radio/TV
1,22.0,female,skilled,own,little,moderate,5951.0,48,radio/TV
2,49.0,male,unskilled,own,little,,2096.0,12,education
3,45.0,male,skilled,free,little,little,7882.0,42,furniture/equipment
4,53.0,male,skilled,free,little,little,4870.0,24,car


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               980 non-null    float64
 1   Sex               1000 non-null   object 
 2   Job               1000 non-null   object 
 3   Housing           1000 non-null   object 
 4   Saving accounts   817 non-null    object 
 5   Checking account  606 non-null    object 
 6   Credit amount     980 non-null    float64
 7   Duration          1000 non-null   int64  
 8   Purpose           1000 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 70.4+ KB


`missForest` is a bit picky about data representation, so we will encode the categorical variables as integers.

In [42]:
from sklearn.preprocessing import OrdinalEncoder

In [43]:
# Encode every categorical column as numeric category codes.
# The fitted encoder is kept so the label <-> code mapping can be inspected later.
encoder = OrdinalEncoder()
encoded_categories = encoder.fit_transform(df[categorical_cols])

# Overwrite the string columns of `df` in place with their codes.
df[categorical_cols] = encoded_categories

In [44]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67.0,1.0,1.0,1.0,,0.0,1169.0,6,5.0
1,22.0,0.0,1.0,1.0,0.0,1.0,5951.0,48,5.0
2,49.0,1.0,3.0,1.0,0.0,,2096.0,12,3.0
3,45.0,1.0,1.0,0.0,0.0,0.0,7882.0,42,4.0
4,53.0,1.0,1.0,0.0,0.0,0.0,4870.0,24,1.0


In [45]:
# Show, for every encoded column, which integer code each label received.
for column_name, labels in zip(categorical_cols, encoder.categories_):
    print(f"Mapping for '{column_name}':")
    for code, label in enumerate(labels):
        # Missing values appear among the learned categories; skip them.
        if pd.isna(label):
            continue
        print(f"    {label} -> {code}")

Mapping for 'Sex':
    female -> 0
    male -> 1
Mapping for 'Job':
    highly skilled -> 0
    skilled -> 1
    unemployed/unskilled -> 2
    unskilled -> 3
Mapping for 'Housing':
    free -> 0
    own -> 1
    rent -> 2
Mapping for 'Saving accounts':
    little -> 0
    moderate -> 1
    quite rich -> 2
    rich -> 3
Mapping for 'Checking account':
    little -> 0
    moderate -> 1
    rich -> 2
Mapping for 'Purpose':
    business -> 0
    car -> 1
    domestic appliances -> 2
    education -> 3
    furniture/equipment -> 4
    radio/TV -> 5
    repairs -> 6
    vacation/others -> 7


<div class="alert alert-block alert-warning">
  Note that although they are numbers, we treat them as categorical variables!
</div>


In [46]:
# Declare the ordinal-encoded columns as categorical so that MissForest does
# not treat their integer codes as continuous values.
imputer = MissForest(categorical=categorical_cols)

In [47]:
# Fit MissForest on the encoded frame and impute every column in one call.
# Despite the variable name, the result is a DataFrame (it supports .head() and
# .info() below), and its column order differs from that of `df`.
df_imputed_array = imputer.fit_transform(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:23<00:00,  4.64s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 80.67it/s]


In [48]:
df_imputed_array.head()

Unnamed: 0,Sex,Job,Housing,Duration,Purpose,Age,Credit amount,Saving accounts,Checking account
0,1.0,1.0,1.0,6,5.0,67.0,1169.0,0.0,0.0
1,0.0,1.0,1.0,48,5.0,22.0,5951.0,0.0,1.0
2,1.0,3.0,1.0,12,3.0,49.0,2096.0,0.0,0.0
3,1.0,1.0,0.0,42,4.0,45.0,7882.0,0.0,0.0
4,1.0,1.0,0.0,24,1.0,53.0,4870.0,0.0,0.0


In [49]:
df_imputed_array.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sex               1000 non-null   float64
 1   Job               1000 non-null   float64
 2   Housing           1000 non-null   float64
 3   Duration          1000 non-null   int64  
 4   Purpose           1000 non-null   float64
 5   Age               1000 non-null   float64
 6   Credit amount     1000 non-null   float64
 7   Saving accounts   1000 non-null   float64
 8   Checking account  1000 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 70.4 KB


<div class="alert alert-info">
  <strong>Assignment 2:</strong> How well did missForest imputation perform? Compare the imputed values to the original (before introducing missing values) to measure the true performance.
</div>

<div class="alert alert-info">
  <strong>Assignment 3:</strong> Identify variables with a natural order, convert them into numeric format reflecting that order, and compare how the imputation performs before and after this transformation.
</div>

<div class="alert alert-info">
  Further reading: useful library that we are not using here: <a href=https://github.com/iskandr/fancyimpute>https://github.com/iskandr/fancyimpute</a>.
</div>