In [1]:
# Detect a Google Colab session by looking for 'google.colab' in the string
# form of the active IPython shell object.
using_colab = 'google.colab' in str(get_ipython())
if using_colab:
  # On Colab: clone the repository into the working directory, then install
  # the packages listed in its requirements.txt from inside the clone.
  !git clone https://github.com/mahynski/chemometrics.git
  !cd chemometrics; pip install -r requirements.txt
else:
  # Elsewhere: extend the import path two directories up, presumably so that
  # the `import chemometrics` below resolves against a local checkout.
  import sys
  sys.path.append('../../')

import chemometrics

import matplotlib.pyplot as plt
%matplotlib notebook

import watermark
%load_ext watermark

%load_ext autoreload
%autoreload 2

Cloning into 'chemometrics'...
remote: Enumerating objects: 638, done.[K
remote: Counting objects: 100% (638/638), done.[K
remote: Compressing objects: 100% (464/464), done.[K
remote: Total 638 (delta 377), reused 371 (delta 161), pack-reused 0[K
Receiving objects: 100% (638/638), 8.34 MiB | 31.27 MiB/s, done.
Resolving deltas: 100% (377/377), done.
Collecting umap-learn
  Downloading umap-learn-0.5.2.tar.gz (86 kB)
[K     |████████████████████████████████| 86 kB 1.9 MB/s 
[?25hCollecting watermark
  Downloading watermark-2.3.0-py2.py3-none-any.whl (7.2 kB)
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.6.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 52.2 MB/s 
Collecting importlib-metadata<3.0
  Downloading importlib_metadata-2.1.3-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.2-py3-n

In [2]:
import imblearn
import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

Overview
--------
These are some example ways to impute missing data. scikit-learn also has a [library](https://scikit-learn.org/stable/modules/impute.html#univariate-vs-multivariate-imputation) of simple imputation methods, which is very useful.

In [3]:
# Record environment details (Python/IPython, machine, and the versions of the
# imported packages) next to the results.
# NOTE(review): the saved output lists sklearn and imblearn as 0.0, so those two
# entries are presumably not the real library versions -- confirm before relying on them.
%watermark -t -m -v --iversions

Python implementation: CPython
Python version       : 3.7.12
IPython version      : 5.5.0

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy     : 1.19.5
pandas    : 1.3.5
watermark : 2.3.0
matplotlib: 3.2.2
sklearn   : 0.0
imblearn  : 0.0
IPython   : 5.5.0



# Load some Data

In [4]:
# Choose the data source: the hosted copy on Colab, the local checkout otherwise.
loc = (
    'https://raw.githubusercontent.com/mahynski/chemometrics/master/tests/data/pls_train.csv'
    if using_colab
    else '../tests/data/pls_train.csv'
)
df = pd.read_csv(loc)

# Features are every column from the fourth onward; the target is the 'Water' column.
raw_X = np.array(df.values[:, 3:], dtype=float)
raw_y = np.array(df['Water'].values, dtype=float)

# Blank out a handful of entries at reproducible random positions.
n_delete = 10
np.random.seed(0)
n_rows, n_cols = raw_X.shape

# All row indices are drawn first, then all column indices, one scalar draw at
# a time, so the sequence of random numbers consumed is unchanged.
a = [np.random.randint(low=0, high=n_rows) for _ in range(n_delete)]
b = [np.random.randint(low=0, high=n_cols) for _ in range(n_delete)]

# Work on a copy so the complete matrix stays available for comparison.
missing_X = raw_X.copy()
missing_X[a, b] = np.nan
    
def compare(raw_X, reconstructed_X, rows=None, cols=None):
    """
    Print a table comparing reconstructed entries with their original values.

    Parameters
    ----------
    raw_X : array indexable as ``raw_X[i, j]``
        Matrix holding the original (complete) values.
    reconstructed_X : array indexable as ``reconstructed_X[i, j]``
        Matrix holding the imputed values.
    rows, cols : sequence of int, optional
        Row and column indices of the entries to compare, paired by position.
        When omitted, the notebook-level lists ``a`` and ``b`` (the positions
        that were deleted) are used, which matches the original behavior.

    Returns
    -------
    None
        A header line followed by one tab-separated line per entry is printed:
        reconstructed value, original value, signed difference, and the
        absolute difference relative to the original value.
    """
    # Fall back to the notebook globals only when no explicit positions are given.
    if rows is None:
        rows = a
    if cols is None:
        cols = b

    print('Reconstructed\tOriginal\tDifference\tRelative Err')
    for i, j in zip(rows, cols):
        original = raw_X[i, j]
        estimate = reconstructed_X[i, j]
        delta = estimate - original
        print('%.3e\t%.3e\t%.3e\t%.3f'
              % (estimate, original, delta, np.abs(delta / original)))

# Iterative PCA (Missing X values)

## Fixed n_components

If you know the number of components to use you can just perform this directly.

In [5]:
from chemometrics.preprocessing.missing import PCA_IA

In [6]:
# PCA-based iterative imputer with a fixed, known number of components.
# NaN marks the entries that should be filled in.
itim = PCA_IA(
    n_components=3,
    missing_values=np.nan,
    scale_x=True,
    tol=1.0e-6,
    max_iters=5000,
)

In [7]:
# Fit the imputer on the incomplete matrix and fill in its gaps, then tabulate
# the filled-in entries against the values that were deleted.
reconstructed_X = itim.fit_transform(missing_X)
compare(raw_X, reconstructed_X)

Reconstructed	Original	Difference	Relative Err
5.814e-01	5.629e-01	1.848e-02	0.033
-1.458e+00	-1.457e+00	-9.806e-04	0.001
6.187e-01	6.290e-01	-1.027e-02	0.016
6.521e-01	6.713e-01	-1.927e-02	0.029
1.000e+00	9.980e-01	2.025e-03	0.002
-1.540e+00	-1.542e+00	1.949e-03	0.001
-1.608e+00	-1.609e+00	2.426e-04	0.000
1.104e+00	1.107e+00	-3.625e-03	0.003
-5.570e-01	-5.565e-01	-5.697e-04	0.001
4.703e-01	4.465e-01	2.377e-02	0.053


## Unknown n_components

Usually, we need to figure out what a good n_components value is. We can use cross-validation for this.

In [8]:
# Wrap the imputer in a single-step pipeline so its settings can be tuned with
# the usual step__parameter notation. The values given here are only starting
# points; the grid search below overrides them.
pipeline = Pipeline(steps=[
    ("pca_ia", PCA_IA(n_components=1, scale_x=True)),
])

# Hyperparameters of pipeline steps are given in standard notation: step__parameter_name
param_grid = [{
    'pca_ia__n_components': np.arange(1, 10, 2),
    'pca_ia__scale_x': [True, False],
}]

# NOTE(review): error_score=0 assigns a score of 0 to any configuration whose
# fit raises. Whether that can outrank a valid configuration depends on the
# sign convention of PCA_IA.score, which is not visible here -- confirm.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=KFold(n_splits=3, shuffle=True, random_state=0),
    error_score=0,
    refit=True
)

# The target is passed as a single-column 2-D array.
_ = gs.fit(missing_X, raw_y.reshape(-1,1))

In [9]:
# Show the hyperparameter combination selected by the grid search.
gs.best_params_

{'pca_ia__n_components': 9, 'pca_ia__scale_x': False}

In [10]:
# Rebuild the imputer with the settings reported by the grid search and apply
# it to the incomplete matrix (the target is passed as a single column).
filler = PCA_IA(scale_x=False, n_components=9)
reconstructed_X = filler.fit_transform(missing_X, raw_y.reshape(-1, 1))

In [11]:
# Tabulate the imputed values against the originals at the deleted positions.
compare(raw_X, reconstructed_X)

Reconstructed	Original	Difference	Relative Err
5.175e-01	5.629e-01	-4.544e-02	0.081
-1.456e+00	-1.457e+00	3.158e-04	0.000
6.290e-01	6.290e-01	5.983e-05	0.000
6.120e-01	6.713e-01	-5.932e-02	0.088
1.012e+00	9.980e-01	1.429e-02	0.014
-1.542e+00	-1.542e+00	9.013e-05	0.000
-1.609e+00	-1.609e+00	2.404e-05	0.000
1.108e+00	1.107e+00	5.621e-05	0.000
-5.563e-01	-5.565e-01	1.776e-04	0.000
5.139e-01	4.465e-01	6.736e-02	0.151


You can then use this imputer in other pipelines. For example, you can fix the imputer's settings and leave them out of the hyperparameter search, so that only the downstream model is tuned.
Below is an example of how you might do that. Of course, you can also include the imputer's hyperparameters as part of the CV, too.

```

pipeline = imblearn.pipeline.Pipeline(steps=[
    # Insert other preprocessing steps here...
    ("pca_ia", PCA_IA(n_components=9, scale_x=False)),
    ("plsda", PLSDA(n_components=5, 
                    alpha=0.05,
                    scale_x=True, 
                    not_assigned='UNKNOWN',
                    style='soft', 
                   )
    )
])

# NO HYPERPARAMETERS ASSOCIATED WITH THE IMPUTER
param_grid = [{
    'plsda__n_components':np.arange(1, 10, 2),
    'plsda__alpha': [0.07, 0.05, 0.03, 0.01],
}]

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    error_score=0,
    refit=True
)

_ = gs.fit(x_train, y_train)
```

# Iterative PLS (Missing X values)

## Fixed n_components

In [12]:
from chemometrics.preprocessing.missing import PLS_IA

In [13]:
# PLS-based iterative imputer with a fixed number of components.
# NaN marks the entries that should be filled in.
itim = PLS_IA(n_components=3, scale_x=True, missing_values=np.nan,
              tol=1.0e-6, max_iters=5000)

In [14]:
# Fit on the incomplete matrix together with the target (passed as a
# single-column 2-D array) and fill in the missing entries.
reconstructed_X = itim.fit_transform(missing_X, raw_y.reshape(-1,1))

In [15]:
# Tabulate the imputed values against the originals at the deleted positions.
compare(raw_X, reconstructed_X)

Reconstructed	Original	Difference	Relative Err
5.646e-01	5.629e-01	1.679e-03	0.003
-1.455e+00	-1.457e+00	1.376e-03	0.001
6.299e-01	6.290e-01	9.861e-04	0.002
6.705e-01	6.713e-01	-7.987e-04	0.001
9.934e-01	9.980e-01	-4.562e-03	0.005
-1.541e+00	-1.542e+00	1.084e-03	0.001
-1.607e+00	-1.609e+00	1.668e-03	0.001
1.106e+00	1.107e+00	-1.266e-03	0.001
-5.569e-01	-5.565e-01	-4.453e-04	0.001
4.477e-01	4.465e-01	1.213e-03	0.003


## Unknown n_components

In [16]:
# Wrap the imputer in a single-step pipeline so its settings can be tuned with
# the usual step__parameter notation. The values given here are only starting
# points; the grid search below overrides them.
pipeline = Pipeline(steps=[
    ("pls_ia", PLS_IA(n_components=1, scale_x=True)),
])

# Hyperparameters of pipeline steps are given in standard notation: step__parameter_name
param_grid = [{
    'pls_ia__n_components': np.arange(1, 10, 2),
    'pls_ia__scale_x': [True, False],
}]

# NOTE(review): error_score=0 assigns a score of 0 to any configuration whose
# fit raises. Whether that can outrank a valid configuration depends on the
# sign convention of PLS_IA.score, which is not visible here -- confirm.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=KFold(n_splits=3, shuffle=True, random_state=0),
    error_score=0,
    refit=True
)

# The target is passed as a single-column 2-D array.
_ = gs.fit(missing_X, raw_y.reshape(-1,1))

In [17]:
# Show the hyperparameter combination selected by the grid search.
gs.best_params_

{'pls_ia__n_components': 9, 'pls_ia__scale_x': False}

# Below LOD

In [18]:
from chemometrics.preprocessing.missing import LOD

## Missing values < LOD only

In [19]:
# Small example matrix in which NaN marks the entries to be replaced.
missing = np.nan
X = np.array([
    [1.0, 2.0, 3.0, 4.0],
    [missing, 3.0, 2.0, missing],
    [5.0, 1.0, missing, 5.0],
    [2.0, 3.0, 4.0, 5.0],
])

# One detection limit per column of X.
lod = np.array([0.15, 0.15, 0.25, 0.15])

In [20]:
# Replace the NaN-marked entries using the per-column limits in `lod`.
# In the recorded output each replacement is a value below its column's limit;
# the fixed seed presumably makes those random draws repeatable.
imputer = LOD(lod, missing_values=np.nan, seed=0)
imputer.fit_transform(X)

array([[1.        , 2.        , 3.        , 4.        ],
       [0.09554425, 3.        , 2.        , 0.04046801],
       [5.        , 1.        , 0.01024338, 5.        ],
       [2.        , 3.        , 4.        , 5.        ]])

## Missing values and < LOD

In [21]:
# Convention for this example: -1 flags a measurement below the LOD, whereas
# NaN flags a corrupted data entry.
below_lod_flag = -1.0
corrupted = np.nan
X = np.array([
    [1.0, corrupted, 3.0, 4.0],
    [below_lod_flag, 3.0, 2.0, below_lod_flag],
    [5.0, 1.0, below_lod_flag, 5.0],
    [2.0, 3.0, corrupted, 5.0],
])

# One detection limit per column of X.
lod = np.array([0.15, 0.15, 0.25, 0.15])

In [22]:
# If "-1" is left in place, the subsequent imputation treats it as a
# "real" measurement, which is (probably) not what you want.

# Step 1: Replace the entries flagged with the numeric sentinel (-1) using the
# LOD imputer; the NaN entries are left untouched at this stage (see output).
imputer = LOD(lod, missing_values=-1, seed=0)
X_lod = imputer.fit_transform(X)
X_lod

array([[1.        ,        nan, 3.        , 4.        ],
       [0.09554425, 3.        , 2.        , 0.04046801],
       [5.        , 1.        , 0.01024338, 5.        ],
       [2.        , 3.        ,        nan, 5.        ]])

In [23]:
# Step 2: Fill the remaining NaNs with the PLS-based iterative imputer.
# The row index is used as the response here, presumably as a stand-in because
# this toy example has no real target.
row_index_target = np.arange(X.shape[0]).reshape(-1, 1)
itim = PLS_IA(n_components=2, scale_x=True, missing_values=np.nan,
              tol=1.0e-6, max_iters=5000)
X_recon = itim.fit_transform(X_lod, row_index_target)
X_recon

array([[ 1.        ,  0.45966465,  3.        ,  4.        ],
       [ 0.09554425,  3.        ,  2.        ,  0.04046801],
       [ 5.        ,  1.        ,  0.01024338,  5.        ],
       [ 2.        ,  3.        , -0.908703  ,  5.        ]])

In [24]:
# Note how some imputed values are now < 0.  This may, or may
# not, be sensible. If you want, you can re-apply the LOD imputer, since a
# negative value lies below the detection limit.
# NOTE(review): missing_values=-1 is passed here although the negative imputed
# value is not exactly -1; the recorded output still shows it replaced, so LOD
# apparently also handles values under the limit -- confirm in the LOD docs.

imputer = LOD(lod, missing_values=-1, seed=0)
X_lod = imputer.fit_transform(X_recon)
X_lod

array([[1.00000000e+00, 4.59664648e-01, 3.00000000e+00, 4.00000000e+00],
       [9.55442531e-02, 3.00000000e+00, 2.00000000e+00, 4.04680071e-02],
       [5.00000000e+00, 1.00000000e+00, 1.02433810e-02, 5.00000000e+00],
       [2.00000000e+00, 3.00000000e+00, 4.13190888e-03, 5.00000000e+00]])

In [25]:
# Lesson: Be careful when combining preprocessing!