Declaration

In [8]:
import numpy as np
import os
import urllib.request
from sklearn import mixture
from scipy.spatial import distance
import math

CUT_OFF_IN_PERCENT = 0.02

# Clustering data set
# http://cs.joensuu.fi/sipu/datasets/
def download_data_set(file_url = 'http://cs.joensuu.fi/sipu/datasets/s1.txt'):
  local_filename, headers = urllib.request.urlretrieve(url = file_url)
  return np.loadtxt(local_filename)

def gmm_selection(data_set):
  lowest_bic = np.infty
  bic = []
  n_components_range = range(1, 10)
  cv_types = ['spherical', 'tied', 'diag', 'full']
  for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type)
        gmm.fit(data_set)
        bic.append(gmm.bic(data_set))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
  return best_gmm

# Get all euclid distance between points with keeping their respective order
def get_all_distances(data_set):
  arr_leng, dims = data_set.shape
  all_distances = []
  for i in range(arr_leng):
    current_point_distances = []
    for j in range(arr_leng):
      if i == j:
        continue
      current_point_distances.append(distance.euclidean(data_set[i], data_set[j]))
    all_distances.append(current_point_distances)
  return np.array(all_distances)

def get_cut_off_distance(distances_data):
  arr_leng, dims = distances_data.shape
  all_distances = []
  # Merge all the distances to a single list
  for i in range(arr_leng):
    for j in range(dims):
      all_distances.append(distances_data[i][j])
  # Calculate the number of items to retrieve
  number_of_items = math.ceil(CUT_OFF_IN_PERCENT * len(all_distances))
  total = 0
  for i in range(number_of_items):
    total += all_distances[i]
  return total / number_of_items

def get_density_of_point(point_distance, cut_off_distance):
  arr_leng = len(point_distance)
  result = 0
  for i in range(arr_leng):
    if point_distance[i] < cut_off_distance:
      result += 1
  return result

def get_densities(distances_data, cut_off_distance):
  arr_leng, dims = distances_data.shape
  result = []
  for i in range(arr_leng):
    result.append(get_density_of_point(distances_data[i], cut_off_distance))
  return np.array(result)

Download S1 data set, get all distances arrays and calculate cut off distance

In [9]:
r15 = download_data_set("http://cs.joensuu.fi/sipu/datasets/R15.txt")
all_distances = get_all_distances(r15)
cut_off_distance = get_cut_off_distance(all_distances)
print(all_distances.shape)

(600, 599)


In [125]:
abc = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
vvv = gmm_selection(np.expand_dims(abc,1))
print(vvv)
print(vvv.predict(np.expand_dims(abc,1)))



GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=9, n_init=1, precisions_init=None,
                random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)
[2 6 7 4 0 0 8 3 3 3 5 1]


In [12]:
print(all_distances)
print(cut_off_distance)
densities = get_densities(all_distances, cut_off_distance * 0.5)
print(densities)

[[ 0.65787537  0.32916865  0.2333838  ... 15.32471703 15.72323885
  15.30372007]
 [ 0.65787537  0.33452055  0.63605346 ... 15.08311294 15.44720505
  15.05737746]
 [ 0.32916865  0.33452055  0.37617549 ... 15.20757522 15.58921319
  15.18268619]
 ...
 [15.32471703 15.08311294 15.20757522 ...  0.92856879  0.95651451
   0.55160856]
 [15.72323885 15.44720505 15.58921319 ...  0.11160645  0.95651451
   1.0343423 ]
 [15.30372007 15.05737746 15.18268619 ...  1.06704077  0.55160856
   1.0343423 ]]
8.189126572820593
[159 157 159 159 159 159 158 159 158 160 159 159 159 159 159 159 157 158
 159 159 159 159 159 138 158 159 159 157 159 157 159 159 158 159 159 159
 159 159 159 142 157 157 150 157 148 152 156 141 159 145 159 158 148 143
 148 152 148 153 157 159 142 159 148 157 159 159 157 135 159 148 154 157
 153 159 150 153 157 148 157 157 196 198 198 197 197 198 198 190 198 198
 170 197 197 198 175 198 199 198 188 198 196 199 175 196 195 196 181 199
 198 173 199 162 198 174 197 170 193 195 199 176 205