In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


class PcaInputter:
    """Impute missing entries of a numeric matrix by iterated low-rank PCA.

    The missing cells (NaN) are first filled with their column means; the
    filled matrix is then repeatedly approximated by its first ``M`` principal
    components and the missing cells are overwritten with the approximation,
    until the fit on the observed cells stops improving.

    Attributes
    ----------
    na_data : numpy.ndarray
        Float copy of the input, with NaN marking the missing cells.
    r_idx, c_idx : numpy.ndarray
        Row and column positions of the missing cells.
    scores, components : numpy.ndarray
        PCA scores and loadings of the most recent ``run_pca`` call.
    """

    def __init__(self, data):
        """Validate ``data`` and record where its missing entries are.

        Parameters
        ----------
        data : pandas.DataFrame or numpy.ndarray
            Two-dimensional numeric table; missing entries are NaN.

        Raises
        ------
        TypeError
            If ``data`` is neither a DataFrame nor an ndarray, or holds
            non-numeric values.
        ValueError
            If ``data`` is not two-dimensional, or has a row or a column that
            is entirely NaN (nothing to impute from).
        """
        if isinstance(data, pd.DataFrame):
            label = "DataFrame"
            # Remembered so iterfill can hand back a DataFrame with the same labels.
            self._index, self._columns = data.index, data.columns
        elif isinstance(data, np.ndarray):
            label = "numpy array"
            self._index = self._columns = None
        else:
            raise TypeError("Input data must be a pandas DataFrame or a numpy array.")

        values = self._to_float_matrix(data, label)
        missing = np.isnan(values)

        if missing.all(axis=1).any():
            raise ValueError(f"Found a row where all columns are NaN in {label}.")
        if missing.all(axis=0).any():
            raise ValueError(f"Found a column where all rows are NaN in {label}.")

        self.r_idx, self.c_idx = np.where(missing)
        # Always a float ndarray, so the positional indexing in iterfill works
        # for DataFrame input as well.
        self.na_data = values

    @staticmethod
    def _to_float_matrix(data, label):
        """Return ``data`` as a new 2-D float ndarray, or raise if it is not numeric."""
        not_numeric = f"All values in the {label} must be numerical."
        raw = data.to_numpy() if isinstance(data, pd.DataFrame) else data
        kind = raw.dtype.kind

        if kind in "USMmV":  # text, bytes, datetime, timedelta, structured
            raise TypeError(not_numeric)
        if kind == "O":
            if any(isinstance(cell, (str, bytes)) for cell in raw.ravel()):
                raise TypeError(not_numeric)
            # None / pd.NA mark missing cells in object data; treat them as NaN.
            raw = np.where(pd.isna(raw), np.nan, raw)
        elif kind == "c":
            if not np.isreal(raw).all():
                raise TypeError(not_numeric)
            raw = raw.real

        try:
            values = np.array(raw, dtype=float)
        except (TypeError, ValueError):
            raise TypeError(not_numeric) from None

        if values.ndim != 2:
            raise ValueError("Input data must be two-dimensional (rows x columns).")
        return values

    def _restore_container(self, values):
        """Wrap ``values`` in a DataFrame when the input was a DataFrame."""
        columns = getattr(self, "_columns", None)
        if columns is None:
            return values
        return pd.DataFrame(values, index=getattr(self, "_index", None), columns=columns)

    def run_pca(self, X, M):
        """Return the rank-``M`` PCA approximation of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Complete matrix (no NaN).
        M : int
            Number of leading principal components to keep.

        Returns
        -------
        numpy.ndarray
            Matrix with the same shape as ``X``.

        Raises
        ------
        ValueError
            If ``M`` is not between 1 and the number of fitted components.

        Notes
        -----
        Stores the full score matrix in ``self.scores`` and the loadings in
        ``self.components``.
        """
        pca_obj = PCA().fit(X)
        self.scores = pca_obj.transform(X)
        self.components = pca_obj.components_

        n_components = self.components.shape[0]
        if not 1 <= M <= n_components:
            raise ValueError(f"M must be between 1 and {n_components}, got {M}.")

        # PCA centers X before projecting, so the column means have to be added
        # back to express the approximation in the coordinates of X.
        return self.scores[:, :M] @ self.components[:M] + pca_obj.mean_

    def iterfill(self, M=1, thresh=1e-7, max_iter=None, verbose=True):
        """Fill the missing cells by iterating rank-``M`` PCA approximations.

        Parameters
        ----------
        M : int, default 1
            Number of principal components used for each approximation.
        thresh : float, default 1e-7
            Stop once the relative decrease of the mean squared error on the
            observed cells is no larger than this.
        max_iter : int or None, default None
            Optional cap on the number of iterations; ``None`` means no cap.
        verbose : bool, default True
            Print one progress line per iteration.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame
            Copy of the data with every missing cell filled in. A DataFrame
            (same index and columns) is returned when the imputer was built
            from a DataFrame.
        """
        ismiss = np.isnan(self.na_data)
        observed = ~ismiss

        # Starting point: every missing cell takes the mean of its column.
        hat_data = self.na_data.copy()
        bar_data = np.nanmean(hat_data, axis=0)
        hat_data[self.r_idx, self.c_idx] = bar_data[self.c_idx]

        mssold = np.mean(hat_data[observed] ** 2)
        mss0 = np.mean(self.na_data[observed] ** 2)
        rel_err = 1
        count = 0

        while rel_err > thresh and (max_iter is None or count < max_iter):
            count += 1
            app_data = self.run_pca(hat_data, M)
            # Only the missing cells are overwritten; observed values are kept.
            hat_data[ismiss] = app_data[ismiss]
            mss = np.mean(((self.na_data - app_data)[observed]) ** 2)
            rel_err = (mssold - mss) / mss0
            mssold = mss
            if verbose:
                print("Iteration: {0}, MSS:{1:.3f}, Rel.Err {2:.2e}".format(count, mss, rel_err))

        return self._restore_container(hat_data)
 

## Testing

In [None]:
# helper function to insert NA values into the dataset and standardize it

def make_nas(n, X, seed=15):
    """Blank out ``n`` cells of ``X`` and standardize the result.

    Parameters
    ----------
    n : int
        Number of cells to set to NaN. Each one lands in a different row, so
        ``n`` cannot exceed the number of rows.
    X : array-like of shape (n_samples, n_features)
        Numeric data; it is not modified.
    seed : int, default 15
        Seed of the private random generator that picks the cells.

    Returns
    -------
    ridx, cidx : numpy.ndarray
        Row and column positions of the blanked cells.
    Xna : numpy.ndarray
        Float copy of ``X`` with NaN at those positions.
    Xna_scaled : numpy.ndarray
        ``Xna`` after column-wise standardization with ``StandardScaler``.
    """
    # A private RandomState draws the same indices as seeding the global
    # generator would, without resetting the caller's random state.
    rng = np.random.RandomState(seed)
    ridx = rng.choice(np.arange(X.shape[0]), n, replace=False)
    cidx = rng.choice(np.arange(X.shape[1]), n, replace=True)

    # Float copy: NaN cannot be stored in an integer array.
    Xna = np.array(X, dtype=float)
    Xna[ridx, cidx] = np.nan

    scaler = StandardScaler(with_std=True, with_mean=True)
    Xna_scaled = scaler.fit_transform(Xna)
    return ridx, cidx, Xna, Xna_scaled

#### Filling the NA values on the Hitters dataset

In [None]:
# Hitters: keep the numeric columns, blank out 50 cells, standardize, impute.
url = "https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Hitters.csv"
Hitters = (
    pd.read_csv(url)
    .set_index("Unnamed: 0")
    .select_dtypes(include=["number"])
)
X = Hitters.to_numpy()

ridx, cidx, Xna, Xna_scaled = make_nas(50, X)
inputter_object = PcaInputter(Xna_scaled)
Xinputted = inputter_object.iterfill()

# Standardized data with the gaps, followed by the imputed version.
print("\n")
print(Xna_scaled)
print("\n")
print(Xinputted)

In [None]:
# Correlation between the true (standardized) values at the blanked cells and
# the values the imputer filled in.
# The raw Hitters data already contains NaN, so some blanked cells have no
# ground truth; those pairs are dropped, otherwise the correlation is NaN.
scaler = StandardScaler(with_std=True, with_mean=True)
original_data = scaler.fit(Xna).transform(X)
truth_at_gaps = original_data[ridx, cidx]
has_truth = ~np.isnan(truth_at_gaps)
np.corrcoef(truth_at_gaps[has_truth], Xinputted[ridx, cidx][has_truth])[0, 1]

#### Filling the NA values on the Auto dataset

In [None]:
# Auto: keep the numeric columns, blank out 50 cells, standardize, impute.
url = "https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Auto.csv"
Auto = (
    pd.read_csv(url)
    .set_index("name")
    .select_dtypes(include=["number"])
)
X = Auto.to_numpy()

ridx, cidx, Xna, Xna_scaled = make_nas(50, X)
inputter_object = PcaInputter(Xna_scaled)
Xinputted = inputter_object.iterfill()

# Standardized data with the gaps, followed by the imputed version.
print("\n")
print(Xna_scaled)
print("\n")
print(Xinputted)

In [None]:
# Score the imputation: correlate the true standardized values at the blanked
# cells with the values that were filled in.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(Xna)
original_data = scaler.transform(X)
np.corrcoef(original_data[ridx, cidx], Xinputted[ridx, cidx])[0, 1]

#### Filling the NA values on the Boston dataset

In [None]:
# Boston: blank out 10 cells, standardize, impute.
url = "https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Boston.csv"
Boston = pd.read_csv(url)
X = Boston.to_numpy()

ridx, cidx, Xna, Xna_scaled = make_nas(10, X)
inputter_object = PcaInputter(Xna_scaled)
Xinputted = inputter_object.iterfill()

# Standardized data with the gaps, followed by the imputed version.
print("\n")
print(Xna_scaled)
print("\n")
print(Xinputted)

In [None]:
# Score the imputation: correlate the true standardized values at the blanked
# cells with the values that were filled in.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(Xna)
original_data = scaler.transform(X)
np.corrcoef(original_data[ridx, cidx], Xinputted[ridx, cidx])[0, 1]

#### Filling the NA values on the US arrests dataset

In [None]:

# USArrests: blank out 20 cells, standardize, impute.
url = "https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/USArrests.csv"
USArrests = pd.read_csv(url)
USArrests = USArrests.set_index('Unnamed: 0')

X = USArrests.values

ridx, cidx, Xna, Xna_scaled = make_nas(20, X)

inputter_object = PcaInputter(Xna_scaled)
Xinputted = inputter_object.iterfill()

# Standardized data with the gaps, a blank separator, then the imputed version.
print(Xna_scaled)
print("\n")  # newline escape; "/n" would print the two characters literally
print(Xinputted)




In [None]:
# Score the imputation: correlate the true standardized values at the blanked
# cells with the values that were filled in.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(Xna)
original_data = scaler.transform(X)
np.corrcoef(original_data[ridx, cidx], Xinputted[ridx, cidx])[0, 1]

In [None]:
# Prints an empty string (a single blank line of output); no other effect.
print("")