In [106]:
%load_ext autoreload
%autoreload 2

import numpy as np
import utils
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import scipy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
# Number of clusters (excl. outliers)
k = 4

# Points per cluster (excl. outliers)
n = [500] * k

# Number of dimensions
d = 100

# Number of outliers
o = 10

# Cluster sizes
cluster_sizes = n + [o]

# Total clusters
k_total = k + (1 if o > 0 else 0)

In [108]:
"""
Generate cluster centers
"""

alpha = 5

x = np.empty((0, d))
# C1 = first half 0, second half 1
half = np.zeros((1, ((3 * d) // 5))) + 0.1
half2 = np.zeros((1, d - ((3 * d) // 5))) + 1.1
x = np.vstack((x, np.hstack((half, half2))))
# C2 = first half 0.25, second half 0.75
half = np.zeros((1, d // 2)) + 0.25
half2 = np.zeros((1, d // 2)) + 0.75
x = np.vstack((x, np.hstack((half, half2))))
# C3 = first half 1, second half 0
half = np.zeros((1, d // 2)) + 1
half2 = np.zeros((1, d // 2))
x = np.vstack((x, np.hstack((half, half2))))
# C4 = first third 0.8, second third 0.2, last third 0.6
one = np.zeros((1, d//3)) + 0.8 
two = np.zeros((1, d//3)) + 0.2
three = np.zeros((1, (d - 2*(d//3)))) + 0.6
x = np.vstack((x, np.hstack((one, two, three))))

x = x * alpha

assert x.shape == (k, d)
assert np.linalg.matrix_rank(x) == k

In [109]:
"""
Generate non-outlier points
"""

p = 0.3
Y = np.zeros((0, d))
noise_arr = []
for i in range(k): 
    noise_idx = np.random.randint(0, n[i])
    for j in range(n[i]):    
        m = np.random.uniform(2, 6)
        noise = np.random.choice([m, 0, -m], size=(1, d), p=[(1-p)/2., p, (1-p)/2.])
        point = x[i] + noise
        if j == noise_idx: 
            noise_arr.append(noise)
        Y = np.vstack((Y, point))

assert Y.shape == (sum(n), d)

In [110]:
"""
Generate outliers
Weighted average of two random points in different clusters
"""

for i in range(o): 
    clusters = np.random.choice(range(len(cluster_sizes) - 1), size=2, replace=False)
    weight = np.random.uniform(0.5, 0.7)
    sample = [
        Y[np.random.randint(sum(cluster_sizes[:clusters[0]]), sum(cluster_sizes[:clusters[0]+1]))],
        Y[np.random.randint(sum(cluster_sizes[:clusters[1]]), sum(cluster_sizes[:clusters[1]+1]))]
    ]
    point = (sample[0] * weight) + (sample[1] * (1 - weight))
    Y = np.vstack((Y, point))


assert Y.shape == (sum(cluster_sizes), d)

In [111]:
"""
Distance matrix and intra and inter cluster distances
"""

D_pre = utils.get_distance_matrix(Y)
avg_intercluster_distance, avg_intracluster_distance = utils.get_average_compression(D_pre, cluster_sizes, k_total)
avg_intercluster_distance, avg_intracluster_distance

(array([57.64824614, 54.57225423, 61.07115823, 53.78785418, 46.87968081]),
 array([47.70252613, 48.17834849, 48.42863124, 47.82211141, 37.50688938]))

In [112]:
"""
Compare norms of outliers with sample of cluster points
"""
l2 = []
for i in range(len(cluster_sizes) - 1): 
    point_idx = np.random.randint(sum(cluster_sizes[:i]), sum(cluster_sizes[:i+1]))
    l2.append(np.linalg.norm(Y[point_idx]))

print("Non-outliers:")
print(l2)

l2_outliers = []
for i in range(o): 
    idx = -1 * (i + 1)
    dist = round(np.linalg.norm(Y[idx]), 3)
    l2_outliers.append(dist)

print("Outliers:")
print(l2_outliers)

Non-outliers:
[42.548497197830514, 39.57013559048663, 58.03756596163801, 40.62889145051015]
Outliers:
[37.071, 47.7, 36.998, 35.895, 41.288, 36.359, 34.919, 39.76, 37.243, 40.697]


In [113]:
"""
Compute PCA
"""
components = 4
pca = PCA(n_components=components)
pca.fit(Y)
Y_pca = pca.transform(Y)


assert Y_pca.shape == (sum(cluster_sizes), components)

In [114]:
"""
Compute post PCA distances and compression matrix
"""
D_post = utils.get_distance_matrix(Y_pca)
C = D_pre / D_post
np.nan_to_num(C, copy=False, nan=0.0)
avg_intercluster_compression, avg_intracluster_compression = utils.get_average_compression(C, cluster_sizes, k_total)
avg_intercluster_compression, avg_intracluster_compression

  C = D_pre / D_post


(array([2.10314121, 2.33271973, 1.69959938, 2.17387779, 2.59786747]),
 array([5.78809616, 5.81003083, 5.73523232, 5.84847506, 2.95552659]))

In [118]:
"""
Variance of compressibility for each point and whether it is an outlier 
"""
np.nan_to_num(C, copy=False, nan=0.0)
comp_var = [0] * len(C)
for i in range(len(C)): 
    comp_var[i] = np.var(C[i])
combined_var = [(True if i >= sum(n) else False, comp_var[i], i) for i in range(len(C))]
combined_var.sort(key=lambda x: x[1])
print(combined_var)

[(False, 0.4261481170532655, 767), (False, 0.47356767436770086, 1057), (False, 0.4976252273591256, 588), (False, 0.5303639376319687, 614), (False, 0.578385186859189, 598), (False, 0.5898922956140393, 707), (False, 0.6023918818159316, 673), (False, 0.6065005588836336, 1920), (False, 0.61262011132361, 1570), (False, 0.6221458289703234, 1219), (False, 0.6318594319508163, 1857), (False, 0.6354259918269985, 1186), (False, 0.6395768278688587, 779), (False, 0.6431876882507641, 532), (False, 0.6516269145867052, 1302), (False, 0.6581421410302946, 1898), (False, 0.6663348508033801, 1265), (False, 0.6738619862938924, 1293), (False, 0.6850374686039713, 753), (True, 0.6979683981703912, 2005), (False, 0.7149428804693022, 1210), (False, 0.7176564412283578, 287), (False, 0.7325243320402725, 585), (False, 0.7415873744749637, 501), (False, 0.7425642198921223, 305), (True, 0.7443699072885956, 2004), (False, 0.7581089504094337, 1403), (False, 0.7617192844227029, 161), (False, 0.7654237639914426, 1558), (T

In [119]:
"""
Median variance of compressibility for outliers
"""
outlier_var = [np.var(C[i]) for i in range(sum(cluster_sizes) - o, sum(cluster_sizes))]
np.median(outlier_var)

1.1503598066420209

In [122]:
"""
Median variance of compressibility for non-outliers
"""
non_outlier_var = [np.var(C[i]) for i in range(sum(cluster_sizes) - o)]
np.median(non_outlier_var)

4.612057112138631

In [128]:
"""
Discretized compressibility ratio counts for sample non-outlier point with low variance
"""
print(f"Variance: {np.var(C[1265])}")
point = C[1265] // 1
np.unique(point, return_counts=True)

Variance: 0.6663348508033801


(array([0., 1., 2., 3., 4., 5., 6., 7.]),
 array([   1, 1182,  540,  225,   38,   10,   12,    2]))

In [129]:
""" 
Discretized compressibility ratio couts for sample outlier point
"""
print(f"Variance: {np.var(C[2004])}")
point_out = C[2004] // 1
np.unique(point_out, return_counts=True)

Variance: 0.7443699072885956


(array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]),
 array([  1, 828, 865, 239,  45,  16,   7,   1,   7,   1]))