# Missing data imputation

In [1]:
import random
import pandas as pd
import numpy as np

## Removing observations with missing data
- CCA (complete case analysis): odstranimo vrstice, kjer manjka katera izmed značilk
- lahko apliciramo na numerical ali categorical data
- ohrani porazdelitev spremenljivk
- tak postopek je primeren, če ni preveč manjkajočih vrednosti

In [4]:
# Load the raw credit-approval file; it is read without a header row.
data = pd.read_csv('data/crx.data', header=None)

# Name the 16 columns A1 .. A16.
varnames = ['A'+str(s) for s in range(1,17)]
data.columns = varnames

# '?' is treated as the missing-value marker and converted to NaN.
data = data.replace('?', np.nan)

# Cast A2 and A14 to float now that the '?' markers are gone.
data['A2'] = data['A2'].astype('float')
data['A14'] = data['A14'].astype('float')

# Encode the target: '+' -> 1, '-' -> 0.
data['A16'] = data['A16'].map({'+':1, '-':0})

# Simulate missing data:
# add some missing values at random positions in four variables.
random.seed(9001)
# randrange() excludes the upper bound. The previous randint(0, len(data)) was
# inclusive and could return len(data), which is not a valid row label for .loc.
values = list({random.randrange(len(data)) for _ in range(100)})
for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan

# Save the prepared data
data.to_csv('data/creditApprovalUCI.csv', index=False)

# Read it back so the rest of the notebook works from the file on disk.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [8]:
# Peek at 10 randomly chosen rows (no random_state is passed, so the rows shown can differ between runs).
data.sample(10)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
286,a,,1.5,u,g,ff,ff,0.0,f,t,2,t,g,200.0,105,0
99,a,28.5,1.0,u,g,q,v,1.0,t,t,2,t,g,167.0,500,0
307,b,18.58,10.29,u,g,ff,ff,0.415,f,f,0,f,g,80.0,0,0
637,b,19.5,9.585,u,g,aa,v,0.79,f,f,0,f,g,80.0,350,0
234,a,58.42,21.0,u,g,i,bb,10.0,t,t,13,f,g,0.0,6700,1
70,b,32.33,7.5,u,g,e,bb,1.585,t,f,0,t,s,420.0,0,0
31,b,42.0,9.79,u,g,x,h,7.96,t,t,8,f,g,0.0,0,1
519,b,39.17,1.71,u,g,x,v,0.125,t,t,5,t,g,480.0,0,1
125,b,34.92,5.0,u,g,x,h,7.5,t,t,6,t,g,0.0,1000,1
329,b,,4.0,y,p,i,v,0.085,f,f,0,t,g,411.0,0,0


In [12]:
# Share of missing values per column, in percent, sorted from least to most missing.
data.isnull().mean().sort_values(ascending=True) * 100

A11     0.000000
A12     0.000000
A13     0.000000
A15     0.000000
A16     0.000000
A4      0.869565
A5      0.869565
A6      1.304348
A7      1.304348
A1      1.739130
A2      1.739130
A14     1.884058
A3     13.333333
A8     13.333333
A9     13.333333
A10    13.333333
dtype: float64

In [13]:
# Complete-case analysis: keep only the rows that have no missing value in any column.
data_cca = data.dropna()

In [14]:
# Compare the dataset size before and after dropping incomplete rows.
print(
    f'Number of total observations: {len(data)}',
    f'Number of observations with complete cases: {len(data_cca)}',
    sep='\n',
)

Number of total observations: 690
Number of observations with complete cases: 564


## Performing mean or median imputation

**Mean or median imputation consists of replacing missing values with the variable mean or
median**. 

- Lahko izvajamo na numeričnih podatkih.

- Pomembno je, da najprej splitamo -> mediano ali mean računamo na training setu. To vrednost potem uporabimo za test set. Pomembno je, da se test del ne uporabi pri izračunu povprečja, ker lahko na ta način že prejudiciramo rezultat.

- mean imputacijo uporabimo za normalno distribuirane spremenljivke
- median imputacijo uporabimo, če **ne gre** za normalno distribucijo


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.imputation import MeanMedianImputer # feature-engine knjižnica

##### Primer imputacije s pandasom

In [None]:
# Reload the prepared dataset from disk so this section starts from a clean frame.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),  # predictors: every column except the target
    data['A16'],               # target
    test_size=0.3,
    random_state=0,
)

In [None]:
# Percentage of missing values per column in the training set.
X_train.isnull().mean() * 100

In [None]:
# Median imputation with plain pandas.
# The medians are computed on the training set only and reused for the test set,
# so no information from the test data leaks into the imputation values.
median_values = X_train[['A2', 'A3', 'A8', 'A11', 'A15']].median()

# fillna() with a Series fills each column with the value stored under that
# column's name; columns absent from the Series are left untouched.
# This returns new frames instead of assigning column by column into the
# train_test_split output, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.fillna(median_values)
X_test = X_test.fillna(median_values)

##### Primer imputacije s sklearn
- komentar: od tu dalje si poglej posnetek predavanj

In [None]:
# Split again, this time keeping only the numerical predictors that will be imputed.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11', 'A15']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Imputer configured with the 'median' strategy.
imputer = SimpleImputer(strategy='median')
# Alternative for mean imputation:
# imputer = SimpleImputer(strategy = 'mean')

In [None]:
# Learn the imputation values from the training set only.
imputer.fit(X_train)

In [None]:
# Let's inspect the learned median values (stored on the fitted imputer as statistics_):
imputer.statistics_

In [None]:
# Fresh split on all predictors; the imputer itself is told which columns to handle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# feature-engine imputer restricted to the listed numerical variables.
# For mean imputation use MeanMedianImputer(imputation_method='mean') instead.
median_imputer = MeanMedianImputer(
    variables=['A2', 'A3', 'A8', 'A11', 'A15'],
    imputation_method='median',
)

In [None]:
# Learn the imputation values from the training set only.
median_imputer.fit(X_train)

In [None]:
# Inspect the values learned during fit (presumably a variable -> imputation value mapping).
median_imputer.imputer_dict_

In [None]:
# Apply the fitted imputer to both partitions.
X_train, X_test = median_imputer.transform(X_train), median_imputer.transform(X_test)

In [None]:
# First rows of the imputed training set.
X_train.head()

In [None]:
# Fraction of missing values remaining in the imputed columns (expected to be 0 after imputation).
X_train[['A2','A3', 'A8', 'A11', 'A15']].isnull().mean()

### Mean / median imputation with Sklearn selecting features to impute

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Start from the prepared dataset and hold out 30% of it for testing.
data = pd.read_csv('data/creditApprovalUCI.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Numerical variables that receive mean imputation.
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']

# Single-step pipeline wrapping the mean imputer.
numeric_mean_imputer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Route the listed columns through the imputer; all other columns are passed through.
preprocessor = ColumnTransformer(
    [('mean_imputer', numeric_mean_imputer, numeric_features_mean)],
    remainder='passthrough',
)

# Learn the column means on the training set only.
preprocessor.fit(X_train)

In [None]:
# Impute both partitions with the fitted preprocessor.
X_train, X_test = preprocessor.transform(X_train), preprocessor.transform(X_test)

In [None]:
# Note that Scikit-Learn transformers return NumPy arrays!!
# The column names are gone; per the ColumnTransformer documentation the transformed
# columns come first and the passthrough columns are appended after them.
X_train

## Implementing mode or frequent category imputation

**Mode imputation consists of replacing missing values with the mode.** 


In [None]:
# Reload the prepared dataset from disk so this section starts from a clean frame.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [None]:
# 70/30 train/test split on all predictors, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Mode (most frequent category) imputation with plain pandas.
# DataFrame.mode() returns one row per tied mode; row 0 holds the first mode of
# every column, i.e. the same value Series.mode()[0] yields column by column.
# The modes come from the training set only and are reused for the test set.
mode_values = X_train[['A4', 'A5', 'A6', 'A7']].mode().iloc[0]

# fillna() with a Series fills each column with the value stored under that
# column's name; columns absent from the Series are left untouched.
# This returns new frames instead of assigning column by column into the
# train_test_split output, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.fillna(mode_values)
X_test = X_test.fillna(mode_values)

In [None]:
# Split again, keeping only the categorical predictors that will be imputed.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Imputer configured with the 'most_frequent' strategy (mode imputation).
imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# Learn the imputation values from the training set only.
imputer.fit(X_train)

In [None]:
# Inspect the values learned during fit (stored on the fitted imputer as statistics_).
imputer.statistics_

In [None]:
# Apply the fitted mode imputer to both partitions.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

In [None]:
from feature_engine.imputation import CategoricalImputer

# Fresh split on all predictors; the imputer itself is told which columns to handle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# feature-engine imputer using the "frequent" method on the listed categorical variables.
mode_imputer = CategoricalImputer(
    variables=['A4', 'A5', 'A6','A7'],
    imputation_method="frequent",
)

In [None]:
# Learn the imputation values from the training set only.
mode_imputer.fit(X_train)

In [None]:
# Inspect the values learned during fit (presumably a variable -> imputation value mapping).
mode_imputer.imputer_dict_

In [None]:
# Apply the fitted imputer to both partitions.
X_train, X_test = mode_imputer.transform(X_train), mode_imputer.transform(X_test)

In [None]:
# First rows of the imputed training set.
X_train.head()

## Replacing missing values with an arbitrary number

In [None]:
# Arbitrary-number imputation on four numerical predictors.
data = pd.read_csv('data/creditApprovalUCI.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Constant strategy: missing values are replaced with the fixed number 99.
imputer = SimpleImputer(fill_value=99, strategy='constant')
imputer.fit(X_train)

In [None]:
# Apply the fitted constant imputer to both partitions.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

## Capturing missing values in a bespoke category

In [None]:
# Bespoke-category imputation on four categorical predictors.
data = pd.read_csv('data/creditApprovalUCI.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Constant strategy: missing values are replaced with the label 'Missing'.
imputer = SimpleImputer(fill_value='Missing', strategy='constant')
imputer.fit(X_train)

In [None]:
# Apply the fitted 'Missing'-category imputer to both partitions.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

In [None]:
# Show rows 35 through 44 of the imputed training data.
X_train[35:45]

> https://feature-engine.trainindata.com/en/latest/api_doc/imputation/CategoricalImputer.html: The CategoricalImputer() replaces missing data in categorical variables by an arbitrary value or by the most frequent category.

## Replacing missing values with a value at the end of the distribution

In [None]:
from feature_engine.imputation import EndTailImputer

# Reload the prepared dataset and hold out 30% of it for testing.
data = pd.read_csv('data/creditApprovalUCI.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# End-of-tail imputer configured with the 'iqr' method on the right tail,
# restricted to the listed numerical variables.
imputer = EndTailImputer(imputation_method='iqr', tail='right', variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [None]:
# Learn the imputation values from the training set only.
imputer.fit(X_train)

In [None]:
# Inspect the values learned during fit (presumably a variable -> imputation value mapping).
imputer.imputer_dict_

In [None]:
# Apply the fitted end-of-tail imputer to both partitions.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

## Implementing random sample imputation

In [None]:
from feature_engine.imputation import RandomSampleImputer

In [None]:
# Reload the prepared dataset and hold out 30% of it for testing.
data = pd.read_csv('data/creditApprovalUCI.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Random sample imputation; no variables are listed, so the imputer chooses them itself.
# random_state pins the sampling seed so that re-running the notebook produces
# the same imputed values; without it every run draws different samples.
imputer = RandomSampleImputer(random_state=0)
imputer.fit(X_train)

In [None]:
# Apply the fitted random-sample imputer to both partitions.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

## Performing multivariate imputation by chained equations

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split

In [None]:
# Load only the columns listed below (six predictors plus the target A16).
variables = ['A2','A3','A8', 'A11', 'A14', 'A15', 'A16']
data = pd.read_csv('data/creditApprovalUCI.csv', usecols=variables)

In [None]:
# 70/30 train/test split with a fixed seed; A16 is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Iterative (chained-equations) imputer that uses a Bayesian ridge regression as its
# estimator, capped at 10 iterations, with a fixed random_state for reproducibility.
imputer = IterativeImputer(estimator = BayesianRidge(), max_iter=10, random_state=0)

In [None]:
# Fit the imputer on the training set only.
imputer.fit(X_train)

In [None]:
# Apply the fitted iterative imputer to both partitions.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

## Assembling an imputation pipeline with scikit-learn

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Reload the prepared dataset from disk so this section starts from a clean frame.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [None]:
# 70/30 train/test split on all predictors, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Group the variables by the imputation strategy each group will receive.
# Numerical variables filled with an arbitrary constant.
features_num_arbitrary = ['A3', 'A8']
# Numerical variables filled with the median.
features_num_median = ['A2', 'A14']
# Categorical variables filled with the most frequent category.
features_cat_frequent = ['A4', 'A5', 'A6', 'A7']
# Categorical variables filled with a dedicated 'Missing' label.
features_cat_missing = ['A1', 'A9', 'A10']

In [None]:
# One single-step pipeline per imputation strategy.

# Numerical: replace missing values with the arbitrary constant 99.
imputer_num_arbitrary = Pipeline([
    ('imputer', SimpleImputer(fill_value=99, strategy='constant')),
])

# Numerical: replace missing values with the median.
imputer_num_median = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical: replace missing values with the most frequent category.
imputer_cat_frequent = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Categorical: replace missing values with the label 'Missing'.
imputer_cat_missing = Pipeline([
    ('imputer', SimpleImputer(fill_value='Missing', strategy='constant')),
])

In [None]:
# Attach each imputation pipeline to its group of columns;
# columns not listed in any group are passed through.
preprocessor = ColumnTransformer(
    [
        ('imp_num_arbitrary', imputer_num_arbitrary, features_num_arbitrary),
        ('imp_num_median', imputer_num_median, features_num_median),
        ('imp_cat_frequent', imputer_cat_frequent, features_cat_frequent),
        ('imp_cat_missing', imputer_cat_missing, features_cat_missing),
    ],
    remainder='passthrough',
)

In [None]:
# Fit every imputer on the training set only.
preprocessor.fit(X_train)

In [None]:
# Impute both partitions with the fitted preprocessor.
X_train, X_test = preprocessor.transform(X_train), preprocessor.transform(X_test)

In [None]:
# Display the transformed training data.
X_train