In [35]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import micegradient as mg
from missing_generator import mcar_sampling, mar_sampling, mnar_sampling
# the micegradientlibrary is based on https://github.com/AnotherSamWilson/miceforest v2. 
# They already released v5 which is vastly different so things will differ quite a lot. But still you can check their documentation to get some ideas
# to use the library you need to install micegradient. use below command - 
# pip install -e micegradient

## Setup Variables

In [36]:
# gradient boosting is the most reliable, deep regressor take a lot of time
available_estimators = ['LinearRegression', 'GradientBoosting', 'DecisionTree', 'RandomForest', 'DeeRegressor']

# initially impute the value using median or random value (selected from the value range) 
initialization_options = ['median', 'random']

# when 0 is mean meatching off, 1 is on (please search predictive mean matching to understand how it works)
# it doesn't improve imputation greatly so default is 0 (off)
mean_match = [0,1]

# number of mice iterations ranges between 1 - 100 (higher numbers can increase runtime drastically)  
mice_iterations = 20
no_of_clusters = 3

In [37]:
# training_data_missing should be a pandas dataframe
training_data_missing = pd.DataFrame()
initialization = 'median'
estimator_name = 'GradientBoosting'

# we can create multiple versions of the imputed datasets by increasing no_of_generated_datasets
# if you generate multiple versions you have take the mean from multiple generated version 
no_of_generated_datasets = 2

## Load dataset and introduce missing values

In [38]:
dataset = pd.read_csv('dummy_dataset.csv',dtype=np.float32)

In [39]:
def missing_generator(missing_type, original_dataset, percentage=0.5, batch_size=None):
    # normalization happens inside the each sampling function
    # the functions are called sampling because they sample the batch from the whole dataset
    # then apply the desired type of missingness
    if missing_type == 'mar':
        return mar_sampling(original_dataset, percentage, batch_size)
    elif missing_type == 'mnar':
        return mnar_sampling(original_dataset, percentage, batch_size)
    elif missing_type == 'mcar':
        return mcar_sampling(original_dataset, percentage, batch_size)

In [40]:
dbName = 'dummy'
missing_type = 'mcar'
percentage = 0.5
# original dataset is returned in case you select a size smaller than the actual dataset
original_dataset, missing_dataset = missing_generator(missing_type, dataset, percentage=percentage)
original_dataset.to_csv('original_dataset_'+dbName+'_'+missing_type+'_'+str(percentage)+'.csv', index = False)
missing_dataset.to_csv('missing_dataset_'+dbName+'_'+missing_type+'_'+str(percentage)+'.csv', index = False)

In [41]:
original_dataset

Unnamed: 0,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
0,1997.0,1024.0,640.0,0.0,38.0,114.0,70.0,40.0,4.0,420.0,95.0,179.0
1,1998.0,1280.0,640.0,1.0,38.0,114.0,50.0,0.0,4.0,420.0,158.0,179.0
2,2000.0,640.0,0.0,0.0,45.0,45.0,0.0,0.0,2.0,0.0,0.0,179.0
3,1999.0,1152.0,640.0,0.0,35.0,35.0,0.0,0.0,4.0,0.0,0.0,269.0
4,1999.0,1152.0,640.0,0.0,43.0,43.0,50.0,0.0,40.0,300.0,128.0,1299.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1033,2001.0,2048.0,1024.0,3.0,38.0,114.0,10.0,10.0,8.0,320.0,120.0,62.0
1034,2000.0,2048.0,1024.0,3.0,35.0,105.0,80.0,9.0,16.0,390.0,116.0,62.0
1035,2001.0,2048.0,1024.0,3.0,35.0,98.0,80.0,10.0,8.0,340.0,107.0,62.0
1036,2001.0,2400.0,1200.0,3.0,35.0,98.0,80.0,10.0,16.0,340.0,107.0,62.0


In [42]:
missing_dataset

Unnamed: 0,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
0,1997.0,,640.0,0.0,38.0,114.0,,40.0,,,95.0,179.0
1,1998.0,1280.0,,,,114.0,50.0,0.0,4.0,,158.0,179.0
2,,640.0,,,,45.0,,0.0,2.0,0.0,,
3,1999.0,1152.0,,,35.0,35.0,,0.0,,0.0,,
4,,1152.0,640.0,0.0,43.0,43.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1033,2001.0,,1024.0,3.0,,114.0,10.0,,,320.0,120.0,62.0
1034,2000.0,2048.0,1024.0,3.0,35.0,105.0,,,16.0,,116.0,
1035,,,,3.0,35.0,98.0,80.0,10.0,8.0,340.0,107.0,62.0
1036,,,,3.0,35.0,98.0,,,16.0,340.0,,62.0


## Usage example with cluster labels

### clustering & imputing function

In [43]:
def proposed_clustering(missing_dataset, initialization):
    
    # initialization with median values for clustering
    copy_missing_dataset = missing_dataset.copy()
    copy_missing_dataset = copy_missing_dataset.fillna(copy_missing_dataset.median())

    # initialize kmeans
    clustering = KMeans(n_clusters=no_of_clusters).fit(copy_missing_dataset)

    copy_missing_dataset['labels']=clustering.labels_
    all_imputed_dataframes = []
    
    all_complete_imputed_datasets = {}

    # predict missing data for each cluster separately
    for i in range(0, no_of_clusters):
        
        clustered_dataset = missing_dataset[copy_missing_dataset['labels'] == i]
        kernel = mg.MultipleImputedKernel(
            missing_dataset,
            datasets=no_of_generated_datasets,
            save_all_iterations=False,
            mean_match_candidates=0,
            initialization=initialization
        )
        for j in range(0, no_of_generated_datasets):
            all_imputed_dataframes.append(kernel.complete_data(j))
        

    for i in range(0, no_of_generated_datasets):

        clustered_dataframe_iterations = {}

        for j in range(0, no_of_clusters):

            clustered_dataframe_iterations[j] = all_imputed_dataframes[i+(j*no_of_generated_datasets)]
            
        completed_data = pd.concat([clustered_dataframe_iterations[0], clustered_dataframe_iterations[1], clustered_dataframe_iterations[2]] , ignore_index=False)
        completed_data = completed_data.sort_index()
        all_complete_imputed_datasets[i] = completed_data


    return all_complete_imputed_datasets

#### combining multiple imputed results to get final results

In [44]:
all_complete_imputed_datasets = proposed_clustering(missing_dataset, initialization)
combined_dataframes = pd.concat(all_complete_imputed_datasets)
mean_imputed_df = combined_dataframes.groupby(level=1).mean()


In [45]:
mean_imputed_df

Unnamed: 0,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
0,1997.0,2560.0,640.0,0.0,38.0,114.0,50.0,40.0,16.0,225.0,95.0,179.0
1,1998.0,1280.0,2048.0,4.0,36.0,114.0,50.0,0.0,4.0,225.0,158.0,179.0
2,2004.0,640.0,2048.0,4.0,36.0,45.0,50.0,0.0,2.0,0.0,102.0,199.0
3,1999.0,1152.0,2048.0,4.0,35.0,35.0,50.0,0.0,16.0,0.0,102.0,199.0
4,2004.0,1152.0,640.0,0.0,43.0,43.0,50.0,6.0,16.0,225.0,102.0,199.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1033,2001.0,2560.0,1024.0,3.0,36.0,114.0,10.0,6.0,16.0,320.0,120.0,62.0
1034,2000.0,2048.0,1024.0,3.0,35.0,105.0,50.0,6.0,16.0,225.0,116.0,199.0
1035,2004.0,2560.0,2048.0,3.0,35.0,98.0,80.0,10.0,8.0,340.0,107.0,62.0
1036,2004.0,2560.0,2048.0,3.0,35.0,98.0,50.0,6.0,16.0,340.0,102.0,62.0
