
### Using Imputations and EM in the Iris Dataset

In [None]:
# import relevant packages
import numpy as np
import pandas as pd
import plotly.express as px

### Data and visualization without missing data

In [None]:
# Load the Iris dataset that ships with plotly express into a DataFrame
df = px.data.iris()
# Show the frame as the cell output
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


In [None]:
# Scatter-matrix of the first five columns; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title=' Original Pair Plot', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view: sepal_width on x, petal_length on y, species as colour and marker
fig = px.scatter(
    df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length -- Original')
fig.show()

In [None]:
# 3D scatter: three measurements on the axes, petal_length as the colour scale,
# species as the marker shape
fig = px.scatter_3d(
    df,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='petal_length',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(
    title='3D Iris -- Original',
    legend={'x': 0, 'y': 0},
    width=1000,
    height=800,
)
fig.show()

# Create about 40% missing data and visualize it
#### 40% is chosen because it makes the differences between the methods easy to compare

In [None]:
# Blank out 60 distinct, randomly chosen rows (40% of the 150) in each measurement column
import random

# fixed seed so the missing-value pattern, and every result derived from it, is reproducible
random.seed(42)

# load a fresh frame so the complete `df` is left untouched
missing_data_df = px.data.iris()
# columns that receive missing values (species / species_id stay complete)
missing_data_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# number of rows to blank out in every column
n_missing_per_column = 60

for column in missing_data_columns:
    # random.sample draws WITHOUT replacement, so exactly n_missing_per_column rows are hit;
    # repeated random.choice calls can return the same row more than once
    missing_rows = random.sample(range(len(missing_data_df)), n_missing_per_column)
    # one .loc assignment instead of chained indexing (no SettingWithCopyWarning,
    # and the write is guaranteed to land in missing_data_df itself)
    missing_data_df.loc[missing_rows, column] = np.nan

# summary with the non-null count of every column
missing_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  99 non-null     float64
 1   sepal_width   98 non-null     float64
 2   petal_length  108 non-null    float64
 3   petal_width   96 non-null     float64
 4   species       150 non-null    object 
 5   species_id    150 non-null    int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Scatter-matrix of the incomplete data; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    missing_data_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- Missing Data', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view of the incomplete data: sepal_width on x, petal_length on y
fig = px.scatter(
    missing_data_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length -- Missing Data')
fig.show()

In [None]:
# Print the per-column non-null counts of the incomplete frame
missing_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  99 non-null     float64
 1   sepal_width   98 non-null     float64
 2   petal_length  108 non-null    float64
 3   petal_width   96 non-null     float64
 4   species       150 non-null    object 
 5   species_id    150 non-null    int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


##### In 3D, the number of plotted points drops significantly: a point that is missing any one of its plotted dimensions may disappear or turn black

In [None]:
# 3D scatter of the incomplete data: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(
    missing_data_df,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='petal_length',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(
    title='3D Iris -- Missing data',
    legend={'x': 0, 'y': 0},
    width=1000,
    height=800,
)
fig.show()

#### Copy the missing data DataFrame

## Imputation by a single value: default value, mean, median

# Default value 0

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default),
# so filling it in leaves missing_data_df unchanged
imputation_defaultValue_df = missing_data_df.copy()

In [None]:
# Default-value imputation: every NaN becomes 0
imputation_defaultValue_df = imputation_defaultValue_df.fillna(value=0)
imputation_defaultValue_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,0.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,0.0,3.1,1.5,0.0,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,0.0,0.0,5.2,2.3,virginica,3
146,0.0,2.5,5.0,1.9,virginica,3
147,6.5,3.0,0.0,0.0,virginica,3
148,0.0,3.4,0.0,2.3,virginica,3


In [None]:
# Scatter-matrix after filling NaN with 0; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_defaultValue_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- Imputation by 0', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after filling NaN with 0: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_defaultValue_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length -- Imputation by 0')
fig.show()

In [None]:
# 3D scatter after filling NaN with 0: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(
    imputation_defaultValue_df,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='petal_length',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(
    title='3D Iris -- Imputation by 0',
    legend={'x': 0, 'y': 0},
    width=1000,
    height=800,
)
fig.show()

# Mean

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_mean_df = missing_data_df.copy()

In [None]:
# SimpleImputer replaces every NaN with a per-column statistic
from sklearn.impute import SimpleImputer
# strategy='mean': each NaN becomes the mean of the observed values in its column
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit on the four measurement columns, read straight from missing_data_df.
# This cell rebinds imputation_mean_df to a NumPy array, so reading the input from
# that same name made a second run of the cell fail (an ndarray has no .iloc).
imputation_mean_df = imp_mean.fit_transform(missing_data_df.iloc[:, 0:4])


In [None]:
# fit_transform handed back a bare array: put the column labels back on it
a1 = pd.DataFrame(
    data=imputation_mean_df,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# species labels are taken from the complete frame
a2 = df['species']
# measurements and labels side by side
imputation_mean_df = pd.concat(objs=[a1, a2], axis=1)
imputation_mean_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.100000,3.500000,1.400000,0.200000,setosa
1,4.900000,3.046535,1.400000,0.200000,setosa
2,4.700000,3.200000,1.300000,0.200000,setosa
3,5.809901,3.100000,1.500000,1.175258,setosa
4,5.000000,3.600000,1.400000,0.200000,setosa
...,...,...,...,...,...
145,5.809901,3.046535,5.200000,2.300000,virginica
146,5.809901,2.500000,5.000000,1.900000,virginica
147,6.500000,3.000000,3.714141,1.175258,virginica
148,5.809901,3.400000,3.714141,2.300000,virginica


In [None]:
# Scatter-matrix after mean imputation; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_mean_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- Mean Imputation', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after mean imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_mean_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length -- Mean Imputation')
fig.show()

In [None]:
# 3D scatter after mean imputation: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(imputation_mean_df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', symbol_sequence=['circle', 'cross', 'square'])
# figure title spelling corrected ("Impuation" -> "Imputation")
fig.update_layout(legend=dict(y=0, x=0), title='3D Iris -- Mean Imputation', width=1000, height=800)
fig.show()

#Median

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_median_df = missing_data_df.copy()

In [None]:
# SimpleImputer replaces every NaN with a per-column statistic
from sklearn.impute import SimpleImputer
# strategy='median': each NaN becomes the median of the observed values in its column
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Fit on the four measurement columns, read straight from missing_data_df.
# This cell rebinds imputation_median_df to a NumPy array, so reading the input from
# that same name made a second run of the cell fail (an ndarray has no .iloc).
imputation_median_df = imp_median.fit_transform(missing_data_df.iloc[:, 0:4])

In [None]:
# fit_transform handed back a bare array: put the column labels back on it
b1 = pd.DataFrame(
    data=imputation_median_df,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# species labels are taken from the complete frame
b2 = df['species']
# measurements and labels side by side
imputation_median_df = pd.concat(objs=[b1, b2], axis=1)
imputation_median_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,5.7,3.1,1.5,1.3,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,5.7,3.0,5.2,2.3,virginica
146,5.7,2.5,5.0,1.9,virginica
147,6.5,3.0,4.2,1.3,virginica
148,5.7,3.4,4.2,2.3,virginica


In [None]:
# Scatter-matrix after median imputation; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_median_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- Median Imputation', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after median imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_median_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length  -- Median Imputation')
fig.show()

In [None]:
# 3D scatter after median imputation: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(imputation_median_df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', symbol_sequence=['circle', 'cross', 'square'])
# figure title spelling corrected ("Imputaion" -> "Imputation")
fig.update_layout(legend=dict(y=0, x=0), title='3D Iris -- Median Imputation', width=1000, height=800)
fig.show()

# Imputation by Center of the Group

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_centerGroup_df = missing_data_df.copy()

#### With n_neighbors=50, each missing value is replaced by the average over its 50 nearest rows, which we treat as approximately the center of the group for that type of iris

In [None]:
# KNNImputer fills each NaN from the rows nearest to the incomplete row
from sklearn.impute import KNNImputer
# n_neighbors=50 uses a wide neighbourhood, which this notebook treats as the
# "center of the group"
imp_CoG = KNNImputer(n_neighbors=50)
# Fit on the four measurement columns, read straight from missing_data_df.
# This cell rebinds imputation_centerGroup_df to a NumPy array, so reading the input
# from that same name made a second run of the cell fail (an ndarray has no .iloc).
imputation_centerGroup_df = imp_CoG.fit_transform(missing_data_df.iloc[:, 0:4])

In [None]:
# fit_transform handed back a bare array: put the column labels back on it
c1 = pd.DataFrame(
    data=imputation_centerGroup_df,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# species labels are taken from the complete frame
c2 = df['species']
# measurements and labels side by side
imputation_centerGroup_df = pd.concat(objs=[c1, c2], axis=1)
imputation_centerGroup_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.100,3.500,1.400,0.200,setosa
1,4.900,3.258,1.400,0.200,setosa
2,4.700,3.200,1.300,0.200,setosa
3,5.436,3.100,1.500,0.800,setosa
4,5.000,3.600,1.400,0.200,setosa
...,...,...,...,...,...
145,6.282,2.846,5.200,2.300,virginica
146,6.236,2.500,5.000,1.900,virginica
147,6.500,3.000,4.512,1.584,virginica
148,6.276,3.400,4.432,2.300,virginica


In [None]:
# Scatter-matrix after center-of-group imputation; species is encoded by colour and marker shape
fig = px.scatter_matrix(
    imputation_centerGroup_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- Center of Group Imputation', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after center-of-group imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_centerGroup_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length  -- Center of Group Imputation')
fig.show()

In [None]:
# 3D scatter after center-of-group imputation: petal_length as the colour scale,
# species as marker shape
fig = px.scatter_3d(imputation_centerGroup_df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', symbol_sequence=['circle', 'cross', 'square'])
# figure title spelling corrected ("Imputaion" -> "Imputation")
fig.update_layout(legend=dict(y=0, x=0), title='3D Iris -- CenterGroup Imputation', width=1000, height=800)
fig.show()

# Imputation by KNNImputer

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_KNN_df = missing_data_df.copy()

In [None]:
# KNNImputer fills each NaN from the rows nearest to the incomplete row
from sklearn.impute import KNNImputer
# use the 5 nearest neighbours of every incomplete row
imp_KNN = KNNImputer(n_neighbors=5)
# Fit on the four measurement columns, read straight from missing_data_df.
# This cell rebinds imputation_KNN_df to a NumPy array, so reading the input from
# that same name made a second run of the cell fail (an ndarray has no .iloc).
imputation_KNN_df = imp_KNN.fit_transform(missing_data_df.iloc[:, 0:4])

In [None]:
# fit_transform handed back a bare array: put the column labels back on it
k1 = pd.DataFrame(
    data=imputation_KNN_df,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# species labels are taken from the complete frame
k2 = df['species']
# measurements and labels side by side
imputation_KNN_df = pd.concat(objs=[k1, k2], axis=1)
imputation_KNN_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.10,3.50,1.40,0.20,setosa
1,4.90,3.30,1.40,0.20,setosa
2,4.70,3.20,1.30,0.20,setosa
3,5.60,3.10,1.50,0.94,setosa
4,5.00,3.60,1.40,0.20,setosa
...,...,...,...,...,...
145,6.66,3.34,5.20,2.30,virginica
146,6.10,2.50,5.00,1.90,virginica
147,6.50,3.00,4.92,1.78,virginica
148,6.34,3.40,3.86,2.30,virginica


In [None]:
# Scatter-matrix after KNN imputation; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_KNN_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- KNN Imputation', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after KNN imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_KNN_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length  -- KNN Imputation')
fig.show()

In [None]:
# 3D scatter after KNN imputation: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(imputation_KNN_df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', symbol_sequence=['circle', 'cross', 'square'])
# figure title spelling corrected ("Imputaion" -> "Imputation")
fig.update_layout(legend=dict(y=0, x=0), title='3D Iris -- KNN Imputation', width=1000, height=800)
fig.show()

# Partial Mean (Predictive Mean Matching, PMM)
In this method we use the autoimpute library's MultipleImputer with the `pmm` (predictive mean matching) strategy.




Main Features:

Utility functions and basic visualizations to explore missingness patterns
Missingness classifier and automatic missing data test set generator
Native handling for categorical variables (as predictors and targets of imputation)
Single and multiple imputation classes for pandas DataFrames

Autoimpute is designed to be user friendly and flexible. When performing imputation, Autoimpute fits directly into scikit-learn machine learning projects. Imputers inherit from sklearn’s BaseEstimator and TransformerMixin and implement fit and transform methods, making them valid Transformers in an sklearn pipeline.

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_partialMean_df = missing_data_df.copy()

In [None]:
# install the autoimpute
!pip install autoimpute

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autoimpute
  Downloading autoimpute-0.13.0-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 3.0 MB/s 
Installing collected packages: autoimpute
Successfully installed autoimpute-0.13.0


In [None]:
from autoimpute.imputations import MultipleImputer
# a single imputation (n=1) with the 'pmm' strategy; return_list=True yields a list
# of (imputation number, DataFrame) pairs instead of a generator
imp_pm = MultipleImputer(n=1, strategy = 'pmm', return_list = True)
# Fit on the four measurement columns, read straight from missing_data_df.
# This cell rebinds imputation_partialMean_df to a list, so reading the input from
# that same name made a second run of the cell fail (a list has no .iloc).
imputation_partialMean_df = imp_pm.fit_transform(missing_data_df.iloc[:, 0:4])
imputation_partialMean_df



ERROR:pymc:There were 11 divergences after tuning. Increase `target_accept` or reparameterize.
ERROR:pymc:There were 11 divergences after tuning. Increase `target_accept` or reparameterize.


ERROR:pymc:There was 1 divergence after tuning. Increase `target_accept` or reparameterize.




[(1,      sepal_length  sepal_width  petal_length  petal_width
  0             5.1          3.5           1.4          0.2
  1             4.9          3.4           1.4          0.2
  2             4.7          3.2           1.3          0.2
  3             5.2          3.1           1.5          1.1
  4             5.0          3.6           1.4          0.2
  ..            ...          ...           ...          ...
  145           5.9          3.2           5.2          2.3
  146           7.4          2.5           5.0          1.9
  147           6.5          3.0           3.9          1.8
  148           7.4          3.4           6.1          2.3
  149           5.9          3.0           5.1          1.8
  
  [150 rows x 4 columns])]

In [None]:
# The imputer returned a list of (imputation number, DataFrame) pairs;
# take the frame of the only imputation and attach the species labels
pmm_imputed = imputation_partialMean_df[0][1]
imputation_partialMean_df = pd.concat(objs=[pmm_imputed, df['species']], axis=1)
imputation_partialMean_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.4,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,5.2,3.1,1.5,1.1,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,5.9,3.2,5.2,2.3,virginica
146,7.4,2.5,5.0,1.9,virginica
147,6.5,3.0,3.9,1.8,virginica
148,7.4,3.4,6.1,2.3,virginica


In [None]:
# Scatter-matrix after PMM imputation; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_partialMean_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- Partial Mean Imputation', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after PMM imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_partialMean_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length  -- Partial Mean Imputation')
fig.show()

In [None]:
# 3D scatter after PMM imputation: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(imputation_partialMean_df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', symbol_sequence=['circle', 'cross', 'square'])
# figure title spelling corrected ("Imputaion" -> "Imputation")
fig.update_layout(legend=dict(y=0, x=0), title='3D Iris -- Partial Mean Imputation', width=1000, height=800)
fig.show()

# SVD -- singular value decomposition

In this method we have implemented Matrix Completion and Low-Rank SVD via
Fast Alternating Least Squares algorithm.

SoftImpute fits a low-rank matrix approximation to a matrix with missing values via nuclear-norm regularization. The algorithm works like EM: it fills in the missing values with the current guess, and then solves the optimization problem on the complete matrix using a soft-thresholded SVD. Special sparse-matrix classes are available for very large matrices.

An svd object is returned, with components "u", "d", and "v". If the solution has zeros in "d", the solution is truncated to rank one more than the number of zeros (so the zero is visible). If the input matrix had been centered and scaled by biScale, the scaling details are assigned as attributes inherited from the input matrix.

Source Paper: https://www.jmlr.org/papers/volume16/hastie15a/hastie15a.pdf

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_SVD_df = missing_data_df.copy()

In [None]:
#SVD Impute
from __future__ import print_function
import numpy as np


def frob(Uold, Dsqold, Vold, U, Dsq, V):
    """Relative squared Frobenius change between two factorizations.

    For Xold = Uold diag(Dsqold) Vold^T and X = U diag(Dsq) V^T this returns
    ||Xold - X||_F^2 / max(sum(Dsqold^2), 1e-9), expanded as
    sum(Dsqold^2) + sum(Dsq^2) - 2 <Xold, X> so that neither full matrix has to
    be formed. The two squared norms are taken from the singular values, which
    is exact when the U/V factors have orthonormal columns.

    Parameters
    ----------
    Uold, Vold, U, V : ndarray
        Left/right factors of the previous and the current iterate.
    Dsqold, Dsq : ndarray of shape (J, 1)
        Singular values of the previous and the current iterate, as columns.

    Returns
    -------
    float
        The relative change used as the stopping criterion in SoftImpute.fit.
    """
    denom = (Dsqold ** 2).sum()
    # the (J, 1) columns broadcast over rows: diag(Dsq) U^T Uold and diag(Dsqold) Vold^T V
    utu = Dsq * (U.T.dot(Uold))
    vtv = Dsqold * (Vold.T.dot(V))
    # trace of the product = inner product <Xold, X>
    uvprod = utu.dot(vtv).diagonal().sum()
    # ||Xold||^2 + ||X||^2 - 2<Xold, X>; the middle term must use the NEW singular
    # values (it previously repeated Dsqold, which could drive the ratio negative
    # and stop the iteration early)
    num = denom + (Dsq ** 2).sum() - 2 * uvprod
    return num / max(denom, 1e-9)


class SoftImpute:
    """Rank-J matrix completion by alternating least squares (SoftImpute-ALS).

    After fit(), the attributes u (n, J), d (J, 1) and v (m, J) hold the factors
    of the fitted approximation u diag(d) v^T.
    """

    def __init__(self, J=2, thresh=1e-05, lambda_=0, maxit=100, random_state=None, verbose=False):
        """Store the hyper-parameters.

        Parameters
        ----------
        J : int
            Rank of the approximation.
        thresh : float
            Iteration stops once the relative change returned by frob() is <= thresh.
        lambda_ : float
            Regularisation strength; 0 disables the shrinkage step.
        maxit : int
            Maximum number of iterations.
        random_state : int or None
            Seed for the random starting point.
        verbose : bool
            Print the convergence ratio after every iteration.
        """
        self.J = J
        self.thresh = thresh
        self.lambda_ = lambda_
        self.maxit = maxit
        self.rs = np.random.RandomState(random_state)
        self.verbose = verbose
        # factors of the fitted approximation; populated by fit()
        self.u = None
        self.d = None
        self.v = None

    def fit(self, X):
        """Fit the rank-J factorization to X, where NaN marks missing entries.

        X itself is not modified; the work is done on a copy whose missing
        entries start at their column mean and are refreshed every half-step.

        Parameters
        ----------
        X : ndarray of shape (n, m), float
            Data matrix with np.nan for missing values.

        Returns
        -------
        SoftImpute
            self, with u, d and v set.

        Raises
        ------
        ValueError
            If a column of X contains no observed value.
        """
        n, m = X.shape
        xnas = np.isnan(X)
        xfill = X.copy()

        # start from a random orthonormal U, zero V and unit singular values
        V = np.zeros((m, self.J))
        U = self.rs.normal(0.0, 1.0, (n, self.J))
        U, _, _ = np.linalg.svd(U, full_matrices=False)
        Dsq = np.ones((self.J, 1))

        # initialise the missing entries with their column mean
        observed_per_column = (~xnas).sum(axis=0)
        if (observed_per_column == 0).any():
            raise ValueError("every column of X needs at least one observed value")
        col_means = np.nanmean(xfill, axis=0)
        np.copyto(xfill, col_means, where=xnas)

        ratio = 1.0
        iters = 0
        while ratio > self.thresh and iters < self.maxit:
            iters += 1
            U_old, V_old, Dsq_old = U, V, Dsq

            # --- half-step 1: regress the filled matrix on the current U ---
            B = U.T.dot(xfill)
            if self.lambda_ > 0:
                B = B * (Dsq / (Dsq + self.lambda_))
            # np.linalg.svd returns (u, s, vh) with B.T = u diag(s) vh, hence
            # U B = (U vh^T) diag(s) u^T: the rotation applied to U is vh TRANSPOSED
            Bsvd = np.linalg.svd(B.T, full_matrices=False)
            V = Bsvd[0]
            Dsq = Bsvd[1][:, np.newaxis]
            U = U.dot(Bsvd[2].T)
            xhat = U.dot(Dsq * V.T)
            xfill[xnas] = xhat[xnas]

            # --- half-step 2: regress the filled matrix on the current V ---
            # NOTE(review): no lambda_ shrinkage is applied in this half-step, unlike
            # half-step 1 -- confirm against the reference algorithm before using lambda_ > 0
            A = xfill.dot(V).T
            # A.T = u diag(s) vh, hence xfill V V^T = u diag(s) (V vh^T)^T:
            # again the rotation applied to V is vh TRANSPOSED
            Asvd = np.linalg.svd(A.T, full_matrices=False)
            U = Asvd[0]
            Dsq = Asvd[1][:, np.newaxis]
            V = V.dot(Asvd[2].T)
            xhat = U.dot(Dsq * V.T)
            xfill[xnas] = xhat[xnas]

            ratio = frob(U_old, Dsq_old, V_old, U, Dsq, V)
            if self.verbose:
                print('iter: %4d ratio = %.5f' % (iters, ratio))

        self.u = U[:, :self.J]
        self.d = Dsq[:self.J]
        self.v = V[:, :self.J]
        return self

    def suv(self, vd):
        """Return u.dot(vd.T), i.e. the reconstruction for vd = v diag(d)."""
        res = self.u.dot(vd.T)
        return res

    def predict(self, X, copyto=False):
        """Evaluate the fitted approximation u diag(d) v^T.

        Parameters
        ----------
        X : ndarray of shape (n, m)
            Only read when copyto is True, where its NaN pattern selects the
            entries to fill.
        copyto : bool
            False (default): return the full approximation; note that it also
            replaces the observed entries by their low-rank estimate.
            True: write the approximation into the NaN entries of X in place,
            leave all other entries untouched, and return None.
        """
        # scale column j of v by d[j]  ->  v diag(d)
        vd = self.v * self.d.ravel()
        X_imp = self.suv(vd)
        if copyto:
            np.copyto(X, X_imp, where=np.isnan(X))
        else:
            return X_imp

In [None]:
# Matrix of the four measurement columns, NaN marking the missing entries.
# Read from missing_data_df (as an explicit copy) so re-running the cell always
# starts from the incomplete data, not from an already imputed frame.
svd_array = missing_data_df.iloc[:, 0:4].to_numpy(copy=True)
# rank-2 model; the fixed random_state makes the random starting point reproducible
clf = SoftImpute(J=2, lambda_=0.0, random_state=0)
# fit the low-rank model to the incomplete matrix
fit = clf.fit(svd_array)
# Fill ONLY the missing entries, in place. Assigning the plain predict() result
# would replace the observed measurements by their rank-2 approximation as well.
clf.predict(svd_array, copyto=True)

In [None]:
# svd_array

In [None]:
# Wrap the imputed array in a labelled frame
s1 = pd.DataFrame(
    data=svd_array,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# species labels are taken from the complete frame
s2 = df['species']
# measurements and labels side by side
imputation_SVD_df = pd.concat(objs=[s1, s2], axis=1)
imputation_SVD_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.275192,2.905930,1.669905,0.834629,setosa
1,4.423062,2.376133,1.744760,0.849304,setosa
2,4.850062,2.668424,1.554249,0.775575,setosa
3,2.797172,1.443972,1.438425,0.682448,setosa
4,5.246648,2.892054,1.650322,0.825537,setosa
...,...,...,...,...,...
145,0.680465,-0.508872,5.258398,2.295434,virginica
146,2.474095,0.644225,4.884345,2.170624,virginica
147,6.084381,3.372314,1.808346,0.911590,virginica
148,2.869432,1.406098,1.904581,0.886188,virginica


In [None]:
# Scatter-matrix after SVD imputation; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_SVD_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(
    title='Pair Plot -- singular value decomposition Imputation',
    width=1000,
    height=800,
)
fig.show()

In [None]:
# Single 2D view after SVD imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_SVD_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length  -- singular value decomposition Imputation')
fig.show()

In [None]:
# 3D scatter after SVD imputation: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(imputation_SVD_df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', symbol_sequence=['circle', 'cross', 'square'])
# figure title spelling corrected ("Imputaion" -> "Imputation")
fig.update_layout(legend=dict(y=0, x=0), title='3D Iris -- Singular Value Decomposition Imputation', width=1000, height=800)
fig.show()

# EM
we use impyute<br>
source code: https://impyute.readthedocs.io/en/master/_modules/impyute/imputation/cs/em.html?highlight=Expectation%20Maximization

In [None]:
# Independent copy of the incomplete frame (DataFrame.copy is deep by default)
imputation_EM_df = missing_data_df.copy()

In [None]:
# install the impyute
! pip install impyute

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting impyute
  Downloading impyute-0.0.8-py2.py3-none-any.whl (31 kB)
Installing collected packages: impyute
Successfully installed impyute-0.0.8


In [None]:
# impyute supplies the EM (expectation-maximisation) imputer
import impyute as impy

# EM imputation of the four measurement columns, called with loops=200
em_array = impy.em(imputation_EM_df.iloc[:, :4].to_numpy(), loops=200)

In [None]:
# Wrap the EM result in a labelled frame
e1 = pd.DataFrame(
    data=em_array,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# species labels are taken from the complete frame
e2 = df['species']
# measurements and labels side by side
imputation_EM_df = pd.concat(objs=[e1, e2], axis=1)
imputation_EM_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.100000,3.500000,1.400000,0.200000,setosa
1,4.900000,2.954785,1.400000,0.200000,setosa
2,4.700000,3.200000,1.300000,0.200000,setosa
3,4.548149,3.100000,1.500000,0.038501,setosa
4,5.000000,3.600000,1.400000,0.200000,setosa
...,...,...,...,...,...
145,4.758424,3.182758,5.200000,2.300000,virginica
146,4.771757,2.500000,5.000000,1.900000,virginica
147,6.500000,3.000000,3.479755,1.309835,virginica
148,4.286505,3.400000,1.739146,2.300000,virginica


In [None]:
# Scatter-matrix after EM imputation; species is encoded by both colour and marker shape
fig = px.scatter_matrix(
    imputation_EM_df.iloc[:, :5],
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Pair Plot -- EM', width=1000, height=800)
fig.show()

In [None]:
# Single 2D view after EM imputation: sepal_width on x, petal_length on y
fig = px.scatter(
    imputation_EM_df,
    x='sepal_width',
    y='petal_length',
    color='species',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(title='Sepal_width vs Petal_length  -- EM')
fig.show()

In [None]:
# 3D scatter after EM imputation: petal_length as the colour scale, species as marker shape
fig = px.scatter_3d(
    imputation_EM_df,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='petal_length',
    symbol='species',
    symbol_sequence=['circle', 'cross', 'square'],
)
fig.update_layout(
    title='3D Iris -- EM',
    legend={'x': 0, 'y': 0},
    width=1000,
    height=800,
)
fig.show()