In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from models.mida import Mida
from models.mice import Mice

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from fancyimpute import KNN, SoftImpute

from preprocessing.data_loading import DataSet
from preprocessing.missing_value_generation import mcar_generator, mar_generator, mnar_generator
from models.imputation_wrapper import SingleImputationWrapper, MultiImputationWrapper
from analysis.evaluation import ClassificationEvaluation, LnormEvaluation, TimeEvaluation
from analysis.method_wrapper import ClassificationMethodWrapper

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read the dataset catalogue and keep the names of the rows whose 'type'
# column equals 'classification'.
datasets_df = pd.read_csv('data/datasets.csv')
datasets = (
    datasets_df
    .loc[datasets_df['type'].eq('classification'), 'dataset']
    .tolist()
)
print(datasets)

['ionosphere', 'iris', 'ecoli', 'satellite', 'wdbc']


In [4]:
# Missing-value generators to benchmark, in the order they will be run.
# The argument 2 is forwarded as-is; its meaning is defined in
# preprocessing.missing_value_generation (not visible here).
amputations = [
    mcar_generator(),
    mar_generator(2),
    mnar_generator(2),
]

In [5]:
# Rates p handed to dataset.ampute_values(), swept in ascending order.
p_range = [
    0.2,
    0.4,
    0.6,
    0.8,
]

In [6]:
# Benchmark driver: for every dataset, run each amputation generator at each
# rate in p_range, complete the amputed data with each imputation method, and
# record classification, L-norm and timing results.
for dataset_name in datasets:
    print(dataset_name, ' started-------------------------------------------', sep='')

    # Load the dataset; 'class' is forwarded as DataSet's second argument.
    dataset = DataSet(dataset_name, 'class')

    # One evaluator per metric. The classification evaluator wraps a
    # KNeighborsClassifier(5).
    clf_evaluation = ClassificationEvaluation(
        ClassificationMethodWrapper(KNeighborsClassifier(5)),
        dataset,
    )
    lnorm_evaluation = LnormEvaluation(dataset)
    time_evaluation = TimeEvaluation(dataset)

    # Baseline on the complete data, recorded with rate 0 and 'None' as both
    # the amputation and the imputation label (no timing entry is recorded).
    for evaluator in (clf_evaluation, lnorm_evaluation):
        evaluator.evaluate_result([dataset.complete_data()], 0, 'None', 'None')

    # Imputation methods under test; the list is rebuilt for every dataset.
    imputations = [
        SingleImputationWrapper(KNN(3, verbose=False)),
        SingleImputationWrapper(SoftImpute(verbose=False)),
        MultiImputationWrapper(Mice(40, verbose=False)),
        MultiImputationWrapper(Mida(40, verbose=False)),
    ]

    # Outer sweep: amputation generators.
    for amputation in amputations:
        dataset.init_missing_data(amputation)

        # Middle sweep: increasing rate p.
        for p in p_range:
            dataset.ampute_values(p)

            # Inner sweep: complete the amputed data with each method and
            # hand the outcome to all three evaluators.
            for imputation in imputations:
                results, exec_time = imputation.complete(dataset.missing_data())
                clf_evaluation.evaluate_result(
                    results, p, amputation.name(), imputation.name()
                )
                lnorm_evaluation.evaluate_result(
                    results, p, amputation.name(), imputation.name()
                )
                time_evaluation.evaluate_result(
                    exec_time, p, amputation.name(), imputation.name(), imputation.number()
                )
            print(amputation.name(), ': ', p, ' done', sep='')

    # Dump the collected results once the whole dataset has been processed.
    for evaluator in (clf_evaluation, lnorm_evaluation, time_evaluation):
        evaluator.dump_results()
    print(dataset_name, ' done-------------------------------------------', sep='')

ionosphere started-------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  result[c][rows] = np.nan


mcar: 0.2 done
mcar: 0.4 done
mcar: 0.6 done
mcar: 0.8 done


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  result[c][rows] = np.nan


mar: 0.2 done
mar: 0.4 done
mar: 0.6 done
mar: 0.8 done


  return (a - mns) / sstd
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  result[c][rows] = np.nan


mnar: 0.2 done
mnar: 0.4 done
mnar: 0.6 done
mnar: 0.8 done
ionosphere done-------------------------------------------
iris started-------------------------------------------
mcar: 0.2 done
mcar: 0.4 done
mcar: 0.6 done
mcar: 0.8 done
mar: 0.2 done
mar: 0.4 done
mar: 0.6 done
mar: 0.8 done
mnar: 0.2 done
mnar: 0.4 done
mnar: 0.6 done
mnar: 0.8 done
iris done-------------------------------------------
ecoli started-------------------------------------------
mcar: 0.2 done
mcar: 0.4 done
mcar: 0.6 done
mcar: 0.8 done
mar: 0.2 done
mar: 0.4 done
mar: 0.6 done
mar: 0.8 done
mnar: 0.2 done
mnar: 0.4 done
mnar: 0.6 done
mnar: 0.8 done
ecoli done-------------------------------------------
satellite started-------------------------------------------
mcar: 0.2 done
mcar: 0.4 done
mcar: 0.6 done
mcar: 0.8 done
mar: 0.2 done
mar: 0.4 done
mar: 0.6 done
mar: 0.8 done
mnar: 0.2 done
mnar: 0.4 done
mnar: 0.6 done
mnar: 0.8 done
satellite done-------------------------------------------
wdbc started--