# Missing data imputation

In [1]:
import random
import pandas as pd
import numpy as np

## Removing observations with missing data
- CCA (complete case analysis): odstranimo vrstice, kjer manjka katera izmed značilk
- lahko apliciramo na numerical ali categorical data
- ohrani porazdelitev spremenljivk
- tak postopek je primeren, če ni preveč manjkajočih vrednosti

In [2]:
# Load the raw credit-approval data; the file has no header row.
data = pd.read_csv('data/crx.data', header=None)
# Name the 16 columns A1 ... A16.
varnames = ['A'+str(s) for s in range(1,17)]
data.columns = varnames
# The raw file marks missing entries with '?'; turn them into NaN.
data = data.replace('?', np.nan)
# Cast A2 and A14 to float (presumably they were read as strings because of the '?' markers).
data['A2'] = data['A2'].astype('float')
data['A14'] = data['A14'].astype('float')
# Encode the target column: '+' -> 1, '-' -> 0.
data['A16'] = data['A16'].map({'+':1, '-':0})

# Simulate missing data: blank out the same randomly drawn rows in four variables.
random.seed(9001)
# Draw 100 row labels and de-duplicate them. randrange() excludes its upper
# bound, so every label lies in 0 .. len(data) - 1. The previous
# randint(0, len(data)) could also return len(data), which is not a valid
# row label for the .loc assignment below.
values = list({random.randrange(len(data)) for _ in range(100)})
for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan

# Save the prepared data ...
data.to_csv('data/creditApprovalUCI.csv', index=False)

# ... and read it back, so the following cells work with the file as stored on disk.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [3]:
# Show 10 randomly chosen rows; no random_state is passed, so the selection differs between runs.
data.sample(10)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
631,a,27.25,0.29,u,g,m,h,0.125,f,t,1,t,g,272.0,108,0
424,b,33.67,2.165,u,g,c,v,1.5,f,f,0,f,p,120.0,0,0
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100.0,1,0
326,b,30.17,1.085,y,p,c,v,0.04,f,f,0,f,g,170.0,179,0
599,b,20.5,2.415,u,g,c,v,2.0,t,t,11,t,g,200.0,3000,1
146,b,23.25,1.5,u,g,q,v,2.375,t,t,3,t,g,0.0,582,1
522,b,22.83,3.0,u,g,m,v,1.29,t,t,1,f,g,260.0,800,1
134,b,32.67,5.5,u,g,q,h,5.5,t,t,12,t,g,408.0,1000,1
413,b,40.58,1.5,u,g,i,bb,0.0,f,f,0,f,s,300.0,0,0
273,b,38.25,10.125,y,p,k,v,0.125,f,f,0,f,g,160.0,0,0


In [4]:
# Percentage of missing values per column, sorted from the least to the most affected column.
data.isnull().mean().sort_values(ascending=True) * 100

A11     0.000000
A12     0.000000
A13     0.000000
A15     0.000000
A16     0.000000
A4      0.869565
A5      0.869565
A6      1.304348
A7      1.304348
A1      1.739130
A2      1.739130
A14     1.884058
A3     13.333333
A8     13.333333
A9     13.333333
A10    13.333333
dtype: float64

In [5]:
# Complete case analysis: keep only the rows that have no missing value in any column.
data_cca = data.dropna()

In [6]:
# Compare the number of rows before and after dropping incomplete cases.
n_total = len(data)
n_complete = len(data_cca)
print(f'Number of total observations: {n_total}')
print(f'Number of observations with complete cases: {n_complete}')

Number of total observations: 690
Number of observations with complete cases: 564


## Performing mean or median imputation

**Mean or median imputation consists of replacing missing values with the variable mean or
median**. 

- Lahko izvajamo na numeričnih podatkih.

- Pomembno je da najprej splitamo -> mediano ali mean računamo na training setu. To vrednost potem uporabimo za test set. Pomembno je da se test del ne uporabi pri izračunu povprečja, ker lahko na ta način že prejudiciramo na rezultat.

- mean imputacijo uporabimo za normalno distribuirane spremenljivke
- median imputacijo uporabimo če **ne gre** za normalno distribucijo


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.imputation import MeanMedianImputer # feature-engine knjižnica

##### Primer imputacije s pandasom

In [10]:
# Reload the prepared dataset from disk.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [11]:
# Hold out 30% of the rows as a test set; A16 is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [12]:
X_train.isnull().mean() * 100
# percentage of missing values per column in the training set

A1      0.828157
A2      2.277433
A3     14.078675
A4      0.828157
A5      0.828157
A6      0.828157
A7      0.828157
A8     14.078675
A9     14.078675
A10    14.078675
A11     0.000000
A12     0.000000
A13     0.000000
A14     1.449275
A15     0.000000
dtype: float64

In [13]:
# Walk over the selected columns and fill their gaps with the median.
# This is not how it is done in practice; it is shown here for illustration only.
median_columns = ['A2', 'A3', 'A8', 'A11', 'A15']
# The medians come from the training set only and are reused for the test set.
train_medians = {column: X_train[column].median() for column in median_columns}
for column, median_value in train_medians.items():
    X_train[column] = X_train[column].fillna(median_value)
    X_test[column] = X_test[column].fillna(median_value)

##### Primer imputacije z sklearn

In [15]:

# Split again, this time keeping only the five columns that will be passed to the imputer.
X_train, X_test, y_train, y_test = train_test_split(data[['A2', 'A3', 'A8', 'A11', 'A15']], data['A16'], test_size=0.3, random_state=0)

In [16]:
# Initialise the imputer; swap in the commented-out line below for mean imputation.
imputer = SimpleImputer(strategy='median')
# imputer = SimpleImputer(strategy = 'mean')

In [17]:
imputer.fit(X_train) # note: X_train must contain only numeric data (no categorical values, strings, ...)

In [18]:
# Let's inspect the learned median values:
# one mean/median value per feature, in the column order used for fitting
imputer.statistics_

array([28.835,  2.75 ,  1.   ,  0.   ,  6.   ])

In [22]:
# SimpleImputer returns a NumPy array. To continue the analysis in pandas we first have to rebuild a DataFrame.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
pd.DataFrame(X_train, columns = ['A2', 'A3', 'A8', 'A11', 'A15'])



Unnamed: 0,A2,A3,A8,A11,A15
0,46.08,3.000,2.375,8.0,4159.0
1,15.92,2.875,0.085,0.0,0.0
2,36.33,2.125,0.085,1.0,1187.0
3,22.17,0.585,0.000,0.0,0.0
4,57.83,7.040,14.000,6.0,1332.0
...,...,...,...,...,...
478,36.75,4.710,0.000,0.0,0.0
479,41.75,0.960,2.500,0.0,600.0
480,19.58,0.665,1.665,0.0,5.0
481,22.83,2.290,2.290,7.0,2384.0


##### Uporaba knjižnice feature-engine
- imputer iz knjižnice feature-engine zna razbrati, kje so numerične vrednosti, tako da lahko podamo vse značilke (lahko pa eksplicitno še povemo, nad katerimi naj izvede imputacijo)

In [24]:
# Split the full feature set (all columns except the target A16) into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

# To perform mean imputation, change the imputation method, as follows: MeanMedianImputer(imputation_method='mean').
median_imputer = MeanMedianImputer(imputation_method='median', variables=['A2', 'A3', 'A8', 'A11', 'A15']) # the whole dataset is passed in; the variables to impute are listed explicitly

In [25]:
# Learn the median of each selected variable from the training set only.
median_imputer.fit(X_train)

In [26]:
# Replacement values learned during fit, keyed by variable name.
median_imputer.imputer_dict_

{'A2': 28.835, 'A3': 2.75, 'A8': 1.0, 'A11': 0.0, 'A15': 6.0}

In [27]:
# Fill the gaps in both sets with the values learned from the training set.
X_train = median_imputer.transform(X_train)
X_test = median_imputer.transform(X_test)

In [30]:
X_train.head()
# returns a DataFrame directly (unlike scikit-learn)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


In [32]:
X_train[['A2','A3', 'A8', 'A11', 'A15']].isnull().mean()
# check that all missing values were really replaced

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

### Mean / median imputation with Sklearn selecting features to impute

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

- Primer kako uporabimo pipeline, da imputamo samo numerične podatke, ostale pa pošljemo mimo "passthrough".  
- **Pipeline-i nam omogočajo avtomatizacijo celotnega ML postopka.**
- feature-engine je zelo združljiv s scikitlearn-om, tako da ga lahko vključimo tudi v pipeline

In [35]:
# Reload the prepared data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Split features and target (A16) into a training and a testing set.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Numerical variables whose gaps are filled with the mean.
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']

# One-step pipeline wrapping the mean imputer.
numeric_mean_imputer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Apply the pipeline to the listed columns only; all other columns pass through unchanged.
preprocessor = ColumnTransformer(
    [('mean_imputer', numeric_mean_imputer, numeric_features_mean)],
    remainder='passthrough',
)

# Fit on the training set only.
preprocessor.fit(X_train)

# In a Jupyter notebook you can click on the rendered diagram to see which pipeline each feature goes through.

In [36]:
# and now we impute the data: both sets are transformed with the preprocessor fitted on the training set
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [37]:
# Note that Scikit-Learn transformers return NumPy arrays!!
# In the output the imputed columns come first, followed by the passthrough columns.
X_train

array([[46.08, 3.0, 2.375, ..., 't', 'g', 396.0],
       [15.92, 2.875, 0.085, ..., 'f', 'g', 120.0],
       [36.33, 2.125, 0.085, ..., 'f', 'g', 50.0],
       ...,
       [19.58, 0.665, 1.665, ..., 'f', 'g', 220.0],
       [22.83, 2.29, 2.29, ..., 't', 'g', 140.0],
       [40.58, 3.29, 3.5, ..., 't', 's', 400.0]], dtype=object)

## Implementing mode or frequent category imputation

**Mode imputation consists of replacing missing values with the mode.** 


- **using the train set and then use to impute values in train and test**
- če je število manjkajočih vrednosti preveliko, lahko ta imputacija spremeni distribucijo, kar ni želeno -> pogoj je torej, da ni preveč manjkajočih vrednosti

In [39]:
# Reload the prepared dataset and show 5 randomly chosen rows (unseeded, so they differ between runs).
data = pd.read_csv('data/creditApprovalUCI.csv')
data.sample(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
336,b,47.33,6.5,u,g,c,v,1.0,f,f,0,t,g,0.0,228,0
242,b,28.33,5.0,u,g,w,v,11.0,t,f,0,t,g,70.0,0,1
57,b,44.33,0.5,u,g,i,h,5.0,t,f,0,t,g,320.0,0,1
529,b,43.25,25.21,u,g,q,h,0.21,t,t,1,f,g,760.0,90,0
558,b,51.42,0.04,u,g,x,h,0.04,t,f,0,f,g,0.0,3000,1


##### Uporaba pandas (za ponazoritev)

In [40]:
# Split all features (everything except the target A16) into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [41]:
# Fill the gaps in the selected columns with the most frequent value.
mode_columns = ['A4', 'A5', 'A6', 'A7']
# The modes come from the training set only; mode() returns a Series, so take its first entry.
train_modes = {column: X_train[column].mode()[0] for column in mode_columns}
for column, mode_value in train_modes.items():
    X_train[column] = X_train[column].fillna(mode_value)
    X_test[column] = X_test[column].fillna(mode_value)

In [62]:
# the mode is the value that occurs most frequently
# X_train['A6'].value_counts()
# X_train['A6'].mode()

##### Scikit learn primer:

In [46]:
# pass only the categorical columns to the most_frequent imputer
X_train, X_test, y_train, y_test = train_test_split(data[['A4', 'A5', 'A6', 'A7']], data['A16'], test_size=0.3, random_state=0)

In [47]:
# Imputer that replaces missing values with the most frequent value of each column.
imputer = SimpleImputer(strategy='most_frequent')

In [48]:
# Learn the replacement values from the training set only.
imputer.fit(X_train)

In [49]:
# Replacement value learned for each column, in the column order used for fitting.
imputer.statistics_

array(['u', 'g', 'c', 'v'], dtype=object)

In [50]:
# Fill the gaps in both sets with the values learned from the training set.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

##### feature-engine primer:

In [51]:
from feature_engine.imputation import CategoricalImputer

# Split all features (everything except the target A16) into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Replace missing values in the listed variables with their most frequent category.
mode_imputer = CategoricalImputer(
    imputation_method="frequent",
    variables=['A4', 'A5', 'A6','A7'],
)

In [52]:
# Learn the most frequent category of each selected variable from the training set.
mode_imputer.fit(X_train)

In [53]:
# Replacement categories learned during fit, keyed by variable name.
mode_imputer.imputer_dict_

{'A4': 'u', 'A5': 'g', 'A6': 'c', 'A7': 'v'}

In [54]:
# Fill the gaps in both sets with the categories learned from the training set.
X_train = mode_imputer.transform(X_train)
X_test = mode_imputer.transform(X_test)

In [55]:
# Preview the first rows of the imputed training set.
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


##### Pipeline primer
Pipeline primer od zgoraj dopolnimo tako, da imputira tudi kategorične podatke.

In [56]:
# Reload the prepared data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Split features and target (A16) into a training and a testing set.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# One list of variables per imputation strategy.
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']
category_features_mode = ['A4', 'A5', 'A6','A7']

# One single-step pipeline per strategy.
numeric_mean_imputer = Pipeline([
    ('imputer_n_mean', SimpleImputer(strategy='mean')),
])
category_features_mode_imputer = Pipeline([
    ('imputer_c_mode', SimpleImputer(strategy='most_frequent')),
])

# Route each variable list to its imputer; all remaining columns pass through unchanged.
preprocessor = ColumnTransformer(
    [
        ('mean_imputer', numeric_mean_imputer, numeric_features_mean),
        ('mode_c_imputer', category_features_mode_imputer, category_features_mode),
    ],
    remainder='passthrough',
)

# Fit on the training set only.
preprocessor.fit(X_train)

## Replacing missing values with an arbitrary number
- manjkajoče vrednosti nadomestimo z neko poljubno (arbitrarno) izbrano vrednostjo
- ponavadi gre za vrednosti, ki ne pašejo v dejansko porazdelitev vrednosti (outlier): npr 999999,-1 ...
- ne smemo izbrati vrednosti, ki so blizu mean, median ali drugih vrednosti v porazdelitvi

#### Kdaj lahko uporabimo tak pristop?
- tak pristop je uporaben kadar ne gre za **naključno manjkajoče podatke**
- uporabljamo nelinearne modele
- odstotek manjkajočih podatkov je visok
- tovrstna imputacija spremeni variable-distribution

In [63]:
# Reload the prepared data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Keep only the four columns to be imputed; A16 is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Replace every missing value with the arbitrary constant 99.
imputer = SimpleImputer(strategy='constant', fill_value=99)

imputer.fit(X_train)

In [64]:
# Apply the constant imputation to both sets.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Capturing missing values in a bespoke category
- včasih pri kategoričnih vrednostih za manjkajoče vrednosti uvedemo novo kategorijo npr: "Missing"

In [65]:
# Reload the prepared data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Keep only the four categorical columns; A16 is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Replace every missing value with the dedicated category 'Missing'.
imputer = SimpleImputer(strategy='constant', fill_value='Missing')
imputer.fit(X_train)

In [66]:
# Apply the 'Missing' category imputation to both sets.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [67]:
# Show a slice of ten rows of the imputed array (one of them was filled with 'Missing').
X_train[35:45]

array([['u', 'g', 'c', 'bb'],
       ['y', 'p', 'ff', 'ff'],
       ['y', 'p', 'ff', 'ff'],
       ['u', 'g', 'q', 'v'],
       ['Missing', 'Missing', 'Missing', 'Missing'],
       ['y', 'p', 'c', 'h'],
       ['u', 'g', 'd', 'v'],
       ['y', 'p', 'aa', 'v'],
       ['y', 'p', 'j', 'v'],
       ['u', 'g', 'k', 'v']], dtype=object)

> https://feature-engine.trainindata.com/en/latest/api_doc/imputation/CategoricalImputer.html: The CategoricalImputer() replaces missing data in categorical variables by an arbitrary value or by the most frequent category.

## Replacing missing values with a value at the end of the distribution
- EndTailImputer avtomatsko določi vrednost na repu porazdelitve (outlier), tako da je ni treba ročno izbirati (kot v prejšnjem poglavju)
- prav tako spremeni distribucijo originalnih značilk, torej ni primerno za linearne modele
- po defaultu izvede impute nad vsemi numeričnimi vrednostmi

In [68]:
from feature_engine.imputation import EndTailImputer


# Reload the prepared data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Split all features (everything except the target A16) into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [69]:
# Configure the imputer to use the IQR method on the right tail of the listed variables.
imputer = EndTailImputer(imputation_method='iqr', tail='right', variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [70]:
# Learn the replacement value of each variable from the training set only.
imputer.fit(X_train)

In [71]:
# Replacement values learned during fit, keyed by variable name.
imputer.imputer_dict_

{'A2': 88.18,
 'A3': 27.31,
 'A8': 11.504999999999999,
 'A11': 12.0,
 'A15': 1800.0}

In [72]:
# Fill the gaps in both sets with the end-of-tail values learned from the training set.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Implementing random sample imputation
- ohranja originalno distribucijo
- primerno za numerične in kategorične vrednosti

In [73]:
from feature_engine.imputation import RandomSampleImputer

In [74]:
# Reload the prepared dataset from disk.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Split all features (everything except the target A16) into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [75]:
# Random sample imputation draws the replacement values at random, so fix the
# seed to make the imputed data reproducible across runs (the rest of the
# notebook uses random_state=0 for every other stochastic step).
imputer = RandomSampleImputer(random_state=0)
# Fit on the training set only.
imputer.fit(X_train)

In [76]:
# Apply the imputer fitted on the training set to both sets.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Performing multivariate imputation by chained equations (MICE)
- prejšnje imputacije so bile univariatne - gleda se samo ena značilka
- pri multivariatnih se za dopolnitev manjkajočih vrednosti gleda celoten dataset
- dodatna razlaga : https://www.numpyninja.com/post/mice-algorithm-to-impute-missing-values-in-a-dataset
- iterativna metoda
- na nek način postane značilka -> target column, in jo napovedujemo z nekim ML modelom

In [83]:
from sklearn.experimental import enable_iterative_imputer # moramo enablat, da lahko importamo IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split

In [78]:
# Load only the listed columns: the variables used for MICE plus the target A16.
variables = ['A2','A3','A8', 'A11', 'A14', 'A15', 'A16']
data = pd.read_csv('data/creditApprovalUCI.csv', usecols=variables)

In [79]:
# Hold out 30% of the rows as a test set; A16 is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [84]:
imputer = IterativeImputer(estimator = BayesianRidge(), max_iter=10, random_state=0) # estimator -> the ML model used to predict (impute) the missing values

In [81]:
# Fit the iterative imputer on the training set only.
imputer.fit(X_train)

In [82]:
# Impute both sets with the imputer fitted on the training set.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Assembling an imputation pipeline with scikit-learn

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Reload the prepared dataset from disk.
data = pd.read_csv('data/creditApprovalUCI.csv')

In [None]:
# Split all features (everything except the target A16) into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [None]:
# Group the variables by the imputation strategy that will be applied to them.
features_num_arbitrary = ['A3', 'A8']  # numerical: arbitrary number
features_num_median = ['A2', 'A14']  # numerical: median
features_cat_frequent = ['A4', 'A5', 'A6', 'A7']  # categorical: most frequent category
features_cat_missing = ['A1', 'A9', 'A10']  # categorical: dedicated 'Missing' category

In [None]:
# One single-step pipeline per imputation strategy.

# Numerical variables: replace missing values with the arbitrary constant 99.
imputer_num_arbitrary = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=99)),
])

# Numerical variables: replace missing values with the median.
imputer_num_median = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical variables: replace missing values with the most frequent category.
imputer_cat_frequent = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Categorical variables: replace missing values with the category 'Missing'.
imputer_cat_missing = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
])

In [None]:
# Route each group of variables to its imputer; columns not listed pass through unchanged.
imputation_steps = [
    ('imp_num_arbitrary', imputer_num_arbitrary, features_num_arbitrary),
    ('imp_num_median', imputer_num_median, features_num_median),
    ('imp_cat_frequent', imputer_cat_frequent, features_cat_frequent),
    ('imp_cat_missing', imputer_cat_missing, features_cat_missing),
]
preprocessor = ColumnTransformer(imputation_steps, remainder='passthrough')

In [None]:
# Fit all imputers on the training set only.
preprocessor.fit(X_train)

In [None]:
# Impute both sets with the preprocessor fitted on the training set.
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Display the imputed training data.
X_train