In [1]:
# Learning material:
# DM-3-HDData.ClusteringHighDimensionalDataPart1.pdf, pages 70-75
# https://blog.xa0.de/post/PreDeCon%20---%20Density-based-Projected-Subspace-Clustering/#fn:predecon
import numpy as np

# Toy 2-D data set. Rows are labelled A..L in order, so Points[3] is D and
# Points[7] is H.
Points = np.array([
    [1, 1], [1, 2], [2, 2], [2, 1],  # A, B, C, D
    [3, 1], [4, 1], [4, 2],          # E, F, G
    [5, 1], [5, 2], [5, 3], [5, 0],  # H, I, J, K
    [6, 1],                          # L
])

# Algorithm parameters.
minpts = 4   # minimum size of the preference-weighted neighbourhood of a core point
eps = 1      # neighbourhood radius (inclusive bound)
delta = 0.3  # variance threshold: an attribute with variance <= delta is "preferred"
lmbda = 1    # lambda; in PreDeCon, the maximum preference dimensionality of a core point
k = 100      # weight given to a preferred attribute (non-preferred attributes get 1)

In [2]:
# calculate list of euclidean neighbors
def euclidean_neighbor(candidate):
    """Return the rows of ``Points`` that lie within ``eps`` of ``candidate``.

    The Euclidean distance is used and the bound is inclusive, so a
    ``candidate`` that is itself a row of ``Points`` is part of the result.

    Parameters
    ----------
    candidate : array-like
        Coordinates of the query point.

    Returns
    -------
    list
        The matching rows, in the order in which they appear in ``Points``.
    """
    return [
        point
        for point in Points
        if np.linalg.norm(point - candidate) <= eps
    ]

In [3]:
# w_p: subspace preference vector of a point p
# (lecture slides, page 71)
def subspace_prefrence_vector(candidate):
    """Compute the subspace preference vector (one weight per attribute) of a point.

    For every attribute, the variance of the eps-neighbourhood of
    ``candidate`` is the mean *squared* distance, measured along that
    attribute only, between ``candidate`` and each of its Euclidean
    neighbours.  An attribute whose variance exceeds ``delta`` gets weight 1;
    otherwise it gets weight ``k``.

    Parameters
    ----------
    candidate : array-like
        Coordinates of the point; must have as many entries as a row of
        ``Points``.

    Returns
    -------
    tuple
        One weight (``1`` or ``k``) per attribute, in attribute order.  For
        2-D data this is ``(w_x, w_y)``.

    Raises
    ------
    ValueError
        If ``candidate`` has no Euclidean neighbours, because the variance is
        undefined for an empty neighbourhood.
    """
    neighbors = np.asarray(euclidean_neighbor(candidate), dtype=float)
    if neighbors.size == 0:
        raise ValueError(
            "candidate has an empty eps-neighbourhood; attribute variance is undefined"
        )

    # The variance is built from squared projected distances; the mean of the
    # plain absolute distances only coincides with it when every difference
    # happens to be 0 or 1.
    offsets = neighbors - np.asarray(candidate, dtype=float)
    variances = np.mean(offsets ** 2, axis=0)

    return tuple(1 if variance > delta else k for variance in variances)

In [4]:
# page 72
def pref_weighted_dist(neighbor, candidate):
    """Preference-weighted distance from ``neighbor`` to ``candidate``.

    This is a weighted Euclidean distance,
    ``sqrt(sum_i w_i * (neighbor_i - candidate_i) ** 2)``, where the weights
    ``w`` are the subspace preference vector of the *first* argument.  The
    measure is therefore not symmetric: swapping the arguments uses the other
    point's weights.

    Parameters
    ----------
    neighbor : array-like
        Point whose preference vector supplies the weights.
    candidate : array-like
        Point to which the distance is measured.

    Returns
    -------
    numpy.float64
        The weighted distance.
    """
    weights = np.asarray(subspace_prefrence_vector(neighbor), dtype=float)
    offsets = np.asarray(neighbor, dtype=float) - np.asarray(candidate, dtype=float)

    # Each per-attribute difference is squared before weighting; omitting the
    # square gives a different value whenever a difference is not 0 or 1.
    return np.sqrt(np.sum(weights * offsets ** 2))

In [5]:
# page 74-75
def is_core(candidate):
    """Decide whether ``candidate`` is a preference-weighted core point.

    Two conditions must both hold:

    1. The preference dimensionality of ``candidate`` (the number of
       attributes that received weight ``k`` in its subspace preference
       vector) is at most ``lmbda``.
    2. At least ``minpts`` points lie within ``eps`` of ``candidate`` under
       the symmetric preferred distance, i.e. the larger of the two directed
       preference-weighted distances.

    The preference-weighted neighbourhood is searched among the Euclidean
    eps-neighbours of ``candidate``.

    Parameters
    ----------
    candidate : array-like
        Coordinates of the point to test.

    Returns
    -------
    bool
        ``True`` if both conditions hold, ``False`` otherwise.
    """
    neighbors = euclidean_neighbor(candidate)

    # The preferred neighbourhood is drawn from these neighbours, so fewer than
    # minpts of them can never satisfy condition 2.  Returning here also keeps
    # the variance from being computed over an empty neighbourhood.
    if len(neighbors) < minpts:
        return False

    # Condition 1: preference dimensionality must not exceed lmbda.
    # Counting weights equal to k assumes k != 1, so that preferred and
    # non-preferred attributes are distinguishable.
    weights = subspace_prefrence_vector(candidate)
    preference_dim = sum(1 for weight in weights if weight == k)
    if preference_dim > lmbda:
        return False

    # Condition 2: count neighbours within eps under the preferred distance.
    preferred_count = 0
    for neighbor in neighbors:
        dist = max(
            pref_weighted_dist(neighbor, candidate),
            pref_weighted_dist(candidate, neighbor),
        )
        if dist <= eps:
            preferred_count += 1
    return preferred_count >= minpts

In [6]:
# Show the Euclidean eps-neighbourhoods of D (row 3) and H (row 7).
for label, row in (('D:', 3), ('H:', 7)):
    print(label, euclidean_neighbor(Points[row]))

D: [array([1, 1]), array([2, 2]), array([2, 1]), array([3, 1])]
H: [array([4, 1]), array([5, 1]), array([5, 2]), array([5, 0]), array([6, 1])]


In [7]:
# Report whether D (row 3) and H (row 7) are preference-weighted core points.
for label, row in (('D', 3), ('H', 7)):
    print(label, is_core(Points[row]))

D False
H True
