In [None]:
import numpy as np
import part010_splitting_the_data as sd
from part010_splitting_the_data import Axis

Retrieve the training and test data from the `part010_splitting_the_data` module (imported as `sd`).

In [None]:
# unpack the four values returned by sd.main(); the training features are
# filtered for outliers further below
outlier_trainX, train_y, testX, test_y = sd.main()

To find outliers, we first need the mean and standard deviation of each feature (column) of the training data.

In [None]:
def findColParameters(features):
    r'''
     Finds the column means and standard deviations of the given array.
     @syntax (means, stdev) = findColParameters(features)
     @param features : np.ndarray = from which to get parameters
     @return tuple of the means and the (population) standard deviations
     '''
    # find the means of the features
    means = features.mean(axis=Axis.COLS.value)
    # find the population standard deviation sqrt(E[(X - E[X])^2]) from the
    # centered data; the shortcut Var[X] = E[X^2] - E^2[X] cancels
    # catastrophically for large-magnitude features and may even come out
    # negative, which would turn the square root into NaN
    stdev = features.std(axis=Axis.COLS.value)
    return (means, stdev)
# def findColParameters(features)

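With those statistics we can flag outliers: a value is an outlier when its Z-score has a magnitude greater than 3. The rows containing outliers are removed further below.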

In [None]:
def findNotOutliers(features, means, stdev, allOnAxis = True, threshold = 3.0):
    r'''
     Returns a mask of which features (or feature rows) are NOT outliers.
     A value is an outlier when the magnitude of its Z-score exceeds
     `threshold`.
     @param features : np.ndarray = matrix to search
     @param means : np.ndarray = vector of column means
     @param stdev : np.ndarray = vector of column standard deviations
     @param allOnAxis : bool = flag to aggregate each row with `all`, so
         that a row passes only if every one of its features passes
     @param threshold : float = largest Z-score magnitude still accepted
     @return boolean array that is True where no outlier was found: one
         entry per row if `allOnAxis`, otherwise one entry per element
     '''
    # center by the mean
    center = features - means
    # |Z| = |center|/stdev <= threshold, rearranged so that nothing is
    # divided by stdev: a constant column (stdev == 0) would otherwise give
    # 0/0 = NaN and flag every single row as an outlier
    not_outlier = (np.abs(center) <= threshold * np.asarray(stdev))
    # apply all if all on axis
    if (allOnAxis):
        not_outlier = np.all(not_outlier, axis=Axis.ROWS.value)
    return not_outlier

In [None]:
def findFeatureParameters(features):
    r'''
     Computes, for each column, the mean and the largest absolute
     deviation from that mean.
     @syntax (means, absmaxa) = findFeatureParameters(features)
     @param features : np.ndarray = from which to get parameters
     @return tuple of the means and the absolute maxima
     '''
    col_axis = Axis.COLS.value
    # column means
    means = features.mean(axis=col_axis)
    # magnitude of each entry's offset from its column mean
    deviations = np.abs(features - means)
    # largest such magnitude per column
    absmaxa = deviations.max(axis=col_axis)
    return (means, absmaxa)
# def findFeatureParameters(features)

# sanity check: show the column means and absolute maxima of the
# training features (outliers still included)
if __name__ == "__main__":
    means, absmaxa = findFeatureParameters(outlier_trainX)
    print({'means': means})
    print({'absmaxa' : absmaxa})

{'means': array([5.47672012e+02, 3.10334798e+04, 3.66883112e+00, 1.83375558e+00,
       8.34361908e-01, 7.39856681e+03, 6.22472995e+00, 3.86359666e+00,
       7.73835741e-02, 4.85538246e-01])}
{'absmaxa': array([7.46632176e+04, 2.13231023e+07, 7.09305949e+01, 2.54186664e+01,
       1.60425851e+01, 1.07416390e+07, 4.68245000e+01, 2.52164073e+01,
       1.57149043e+00, 8.82286754e-01])}


In [None]:
def normalizeFeatures(unnormal_features, means, absmaxa):
    r'''
     Normalizes the given features, centering to `means` and scaling to
     `absmaxa`, then prepends a column of ones.
     @param unnormal_features : np.ndarray = to normalize, one example
         per row
     @param means : np.ndarray = to which to center features
     @param absmaxa : np.ndarray = by which to scale features; an entry
         of 0 (constant feature) leaves that column unscaled
     @return the given features normalized, with a leading column of 1
     '''
    # center the data
    centered = (unnormal_features - means)
    # a constant feature has absmaxa == 0; dividing by it would produce
    # 0/0 = NaN, so such columns are divided by 1 instead
    scale = np.where(np.asarray(absmaxa) == 0, 1.0, absmaxa)
    # scale the data
    scaled = (centered / scale)
    # create a column of 1 padding with as many rows
    num_examples = scaled.shape[0]
    one_pad = np.ones((num_examples,1))
    # 1-pad each row
    padded = np.concatenate((one_pad, scaled), axis=Axis.ROWS.value)
    return padded
# def normalizeFeatures(unnormal_features, means, absmaxa)

# test `normalizeFeatures`
if __name__ == "__main__":
    normtrainX = normalizeFeatures(outlier_trainX, means, absmaxa)
    # expected for column 0 (the 1-padding): mean 1 and absolute maximum 0,
    #   because every entry equals the column mean (1 - 1 = 0)
    # expected for the remaining columns: mean ~0 since `normtrainX` is
    #   0-centered, and absolute maximum 1 since it is scaled to 1
    normmeans, normabsmaxa = findFeatureParameters(normtrainX)
    print({'means' : normmeans})
    print({'absmaxa' : normabsmaxa})

{'means': array([ 1.00000000e+00, -1.19944604e-18,  1.16735593e-17,  2.64985076e-17,
        2.33387028e-16, -4.95275683e-17,  1.64279375e-18, -2.08396926e-16,
       -4.29615490e-16, -1.27845822e-16, -8.83806355e-16])}
{'absmaxa': array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}


In [None]:
# per-feature mean and standard deviation of the unfiltered training data
means, stdev = findColParameters(outlier_trainX)
print(stdev)
# boolean mask over the rows: True where no feature is an outlier
not_outliers = findNotOutliers(outlier_trainX, means, stdev)
# keep only those rows, in the features and in the labels alike
dist_trainX, dist_train_y = outlier_trainX[not_outliers], train_y[not_outliers]

[1.16663170e+03 2.26932612e+05 4.26582507e+00 1.63344733e+00
 1.08557225e+00 1.05016702e+05 3.53240207e+00 2.49012615e+00
 6.19720615e-02 2.23058331e-01]


In [None]:
# number of rows dropped by the per-feature Z-score filter
print('removed', outlier_trainX.shape[0] - dist_trainX.shape[0], 'outliers')

removed 1244 outliers


In [None]:
# normalize the features
(means, absmaxa) = findFeatureParameters(dist_trainX)
normdist_trainX = normalizeFeatures(dist_trainX, means, absmaxa)

def findSampleDistances(samples):
    r'''
     Finds the Euclidean distance of each sample (row) from the origin.
     @param samples : np.ndarray = matrix with one sample per row
     @return vector of the distances, one per sample
     '''
    # r = sqrt(x.x), summed along each row of the matrix passed in
    # (not of a global array, so the function works for any input)
    r = np.sqrt(np.sum(np.asarray(samples)**2, axis=Axis.ROWS.value))
    return r
# def findSampleDistances(samples)

# distance of every remaining sample from the origin
# NOTE(review): this uses the un-normalized `dist_trainX`, not
# `normdist_trainX`, so the largest-scale features dominate the distance
# -- confirm that this is intended
dists = findSampleDistances(dist_trainX)

# find outliers in the distances
means, stdev = findColParameters(dists)
# element-wise mask: `dists` is passed without row aggregation
not_outliers = findNotOutliers(dists, means, stdev, allOnAxis=False)
# remove these rows too
trainX = normdist_trainX[not_outliers]
undist_train_y = dist_train_y[not_outliers]

In [None]:
# number of rows dropped by the distance-based filter
print('removed', dist_trainX.shape[0] - trainX.shape[0], 'outliers')

removed 357 outliers


In [None]:
# fraction of the original training rows dropped by both outlier passes
rRemoved = (len(outlier_trainX) - len(trainX)) / len(outlier_trainX)
print(f'removed {(100*rRemoved):.3f}% outliers')

removed 8.747% outliers
