Declarations: imports, configuration constant and helper functions

In [25]:
import numpy as np
import os
import urllib.request
from sklearn import mixture
from scipy.spatial import distance
import math
import statistics 

# Despite the name this is a fraction, not a percentage (0.02 == 2 %):
# get_cut_off_distance() multiplies it by the number of neighbour distances
# per point to decide how many of the smallest distances to average.
CUT_OFF_IN_PERCENT = 0.02

# Clustering data set
# http://cs.joensuu.fi/sipu/datasets/
def download_data_set(file_url = 'http://cs.joensuu.fi/sipu/datasets/s1.txt'):
  """Fetch a whitespace-separated numeric table and return it as a 2-D array.

  The response body is parsed directly from memory, so no temporary file is
  left behind on disk after the call.

  Args:
    file_url: Location of the text file. Any URL scheme understood by
      ``urllib.request.urlopen`` is accepted.

  Returns:
    ``numpy.ndarray`` of shape ``(rows, columns)``. When the file has more
    than two columns, the column at index 2 is removed (presumably the label
    column of the benchmark files -- TODO confirm against the data source).

  Raises:
    urllib.error.URLError: If the URL cannot be opened.
    ValueError: If the content cannot be parsed as numbers.
  """
  with urllib.request.urlopen(file_url) as response:
    text = response.read().decode('utf-8', errors='replace')
  # ndmin=2 keeps a single-row file two-dimensional, so shape[1] below is
  # always valid.
  data = np.loadtxt(text.splitlines(), ndmin=2)
  if data.shape[1] > 2:
    return np.delete(data, 2, 1)
  return data

def gmm_selection(data_set, random_state=None):
  """Fit a grid of Gaussian mixtures and return the one with the lowest BIC.

  Every covariance type in ('spherical', 'tied', 'diag', 'full') is combined
  with every component count from 1 up to ``min(10, n_samples) - 1``. The
  first model reaching the lowest BIC wins.

  Args:
    data_set: 2-D array of shape (n_samples, n_features) to fit.
    random_state: Forwarded to ``mixture.GaussianMixture`` so that the EM
      initialisation can be made reproducible. ``None`` (the default) keeps
      the previous, unseeded behaviour.

  Returns:
    The fitted ``GaussianMixture`` instance with the lowest BIC.

  Raises:
    ValueError: If the data set has fewer than two samples (no candidate
      model would be fitted) or if no candidate produced a comparable BIC.
  """
  max_components = min(10, data_set.shape[0])
  if max_components < 2:
    raise ValueError(
      'gmm_selection needs at least 2 samples, got %d' % data_set.shape[0])

  best_gmm = None
  # np.inf instead of the np.infty alias, which NumPy 2.0 removed.
  lowest_bic = np.inf
  for cv_type in ('spherical', 'tied', 'diag', 'full'):
    for n_components in range(1, max_components):
      # Fit a Gaussian mixture with EM
      gmm = mixture.GaussianMixture(n_components=n_components,
                                    covariance_type=cv_type,
                                    random_state=random_state)
      gmm.fit(data_set)
      current_bic = gmm.bic(data_set)
      if current_bic < lowest_bic:
        lowest_bic = current_bic
        best_gmm = gmm

  if best_gmm is None:
    raise ValueError('no Gaussian mixture produced a finite BIC')
  return best_gmm

def get_all_distances(data_set):
  """Return, for every point, its Euclidean distance to every other point.

  The original point order is preserved and the zero self-distance is left
  out of each row.

  Args:
    data_set: Array of shape (n_points, n_dims). A 1-D array is treated as
      n_points one-dimensional points.

  Returns:
    ``numpy.ndarray`` of shape (n_points, n_points - 1). Row ``i`` lists the
    distances from point ``i`` to all other points in index order, so column
    ``c`` refers to point ``c`` when ``c < i`` and to point ``c + 1``
    otherwise. An empty data set yields an array of shape (0, 0).
  """
  points = np.asarray(data_set, dtype=float)
  if points.ndim == 1:
    points = points[:, np.newaxis]
  n_points = points.shape[0]
  if n_points == 0:
    return np.empty((0, 0))

  # One vectorised call instead of n^2 Python-level distance.euclidean calls.
  pairwise = distance.cdist(points, points, metric='euclidean')
  # Boolean indexing walks the matrix row by row, so dropping the diagonal
  # keeps the remaining distances of each row in their original order.
  off_diagonal = ~np.eye(n_points, dtype=bool)
  return pairwise[off_diagonal].reshape(n_points, n_points - 1)

def get_cut_off_distance(distances_data, cut_off_fraction=None):
  """Average the nearest-neighbour distances of all points.

  For every row the ``ceil(cut_off_fraction * n_neighbours)`` smallest
  distances are selected; the result is the mean over all selected values.

  Args:
    distances_data: 2-D array of shape (n_points, n_neighbours) where row
      ``i`` holds the distances from point ``i`` to the other points.
    cut_off_fraction: Share of each row to treat as nearest neighbours,
      in the interval (0, 1]. Defaults to the module-level
      ``CUT_OFF_IN_PERCENT``.

  Returns:
    The mean of the selected distances as a NumPy float.

  Raises:
    ValueError: If ``cut_off_fraction`` is outside (0, 1] or
      ``distances_data`` is not a non-empty 2-D array.
  """
  if cut_off_fraction is None:
    cut_off_fraction = CUT_OFF_IN_PERCENT
  if not 0 < cut_off_fraction <= 1:
    raise ValueError('cut_off_fraction must be in the interval (0, 1]')

  distances = np.asarray(distances_data, dtype=float)
  if distances.ndim != 2 or distances.size == 0:
    raise ValueError('distances_data must be a non-empty 2-D array')

  # Get number of neighbors
  n_neighbours = distances.shape[1]
  n_nearest = math.ceil(cut_off_fraction * n_neighbours)
  # kth is zero-based: putting the n_nearest-th smallest value at index
  # n_nearest - 1 leaves exactly the n_nearest smallest values in front of
  # it, and stays in bounds even when n_nearest equals n_neighbours.
  nearest = np.partition(distances, n_nearest - 1, axis=1)[:, :n_nearest]
  return nearest.mean()

def get_density_of_point(point_distance, cut_off_distance):
  """Count the distances of one point that lie strictly below the cut-off.

  Args:
    point_distance: Sequence of distances from one point to the others.
    cut_off_distance: Threshold; only values smaller than it are counted.

  Returns:
    The number of qualifying distances as an ``int`` (0 for an empty
    sequence).
  """
  return sum(1 for gap in point_distance if gap < cut_off_distance)

def get_densities(distances_data, cut_off_distance):
  """Compute the density of every point.

  Args:
    distances_data: Array whose row ``i`` holds the distances from point
      ``i`` to the other points.
    cut_off_distance: Threshold passed on to ``get_density_of_point``.

  Returns:
    ``numpy.ndarray`` with one density (neighbour count below the cut-off)
    per row of ``distances_data``, in row order.
  """
  row_count = distances_data.shape[0]
  per_point = [
    get_density_of_point(distances_data[row], cut_off_distance)
    for row in range(row_count)
  ]
  return np.array(per_point)

def get_distribution(data_set):
  """Tabulate how often each distinct value occurs.

  Args:
    data_set: Array of values (flattened by ``np.unique``).

  Returns:
    ``numpy.ndarray`` of ``[value, count]`` rows, ordered by ascending
    value.
  """
  values, occurrences = np.unique(data_set, return_counts=True)
  pairs = [[value, count] for value, count in zip(values, occurrences)]
  return np.array(pairs)

def density_groups(densities, densities_distribution, groups):
  """Map every density to the group of its row in the distribution table.

  Args:
    densities: Per-point density values.
    densities_distribution: Rows whose first entry is a density value.
    groups: Group label for each row of ``densities_distribution``.

  Returns:
    ``numpy.ndarray`` with the group of the first distribution row whose
    value equals each density, in the order of ``densities``. A density
    without a matching row contributes nothing, so the result can be
    shorter than ``densities``.
  """
  labels = []
  for density in densities:
    # Index of the first distribution row carrying this density, if any.
    hit = next(
      (row for row, entry in enumerate(densities_distribution)
       if density == entry[0]),
      None,
    )
    if hit is not None:
      labels.append(groups[hit])
  return np.array(labels)
  

Download data set

In [28]:
# Fetch the R15 file; download_data_set() removes the column at index 2
# when the file has more than two columns.
raw_data_set = download_data_set(
  file_url='http://cs.joensuu.fi/sipu/datasets/R15.txt'
)

Calculate all relevant variables (distances, cut-off distance, densities and density groups) for the downloaded data

In [29]:
# Seed NumPy's global random generator. scikit-learn's GaussianMixture draws
# from it when no random_state is given, so without a seed every run of this
# cell may select a different model and a different grouping.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Calculate all pairwise distances (row i: distances from point i to the others)
all_distances = get_all_distances(raw_data_set)

# Get the cut-off distance
cut_off_distance = get_cut_off_distance(all_distances)

# Calculate the densities based on the cut-off distance
densities = get_densities(all_distances, cut_off_distance)

# Form the (density, count) distribution used for the GMM selection
densities_distribution = get_distribution(densities)

# Select the GMM model
gmm_model = gmm_selection(densities_distribution)

# Get the group of every distribution row
groups = gmm_model.predict(densities_distribution)

# Vector aligned with `densities` holding the density group of each point.
# NOTE: the misspelled names `desities_groups` / `sorted_desities` are kept
# because later cells may refer to them.
desities_groups = density_groups(densities, densities_distribution, groups)

# Indices into the original array, ordered by descending density, for
# further classification of the points
densities_argsort = densities.argsort()[::-1]

# Sorted copy (the original array stays untouched), in descending order
sorted_desities = np.sort(densities)[::-1]

In [22]:
# Scratch check of the sorting idiom used for the densities:
# argsort()[::-1] gives the indices in descending value order, while
# sorting and reversing gives the values themselves.
abc = np.array([1,2,3,4,5,6,8,7])
xxx = abc.argsort()[::-1]
print(abc)
abc = np.sort(abc)[::-1]
print(abc)
print(xxx)

[1 2 3 4 5 6 8 7]
[8 7 6 5 4 3 2 1]
19
