# Fast and Accurate Kernel Density Estimation using Nearest Neighbor Data Structures

This file contains the code used for (1) generating the data, (2) running the methods, and (3) plotting the final results.

In [1]:
import glob
import time
import os
import sys
import random
import subprocess
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt

In [2]:
# Seed NumPy's legacy global RNG so that every np.random.* call made through
# the global state in this notebook is reproducible across Restart & Run All.
np.random.seed(0)

In [3]:
# Experiment configuration
n_mixtures = 10     # number of Gaussian mixture components
n_points = 10 ** 5  # total number of points
n_digits = 5        # number of decimal digits
dimensions = [2 ** exponent for exponent in range(1, 7)]  # 2, 4, 8, 16, 32, 64

In [4]:
# Locations used throughout the notebook, all relative to the working directory.
data_path = './data'                       # data directory
cpp_source_path = './source/source_C++'    # C++ source directory
output_path = './output'                   # output directory

## Generating the data

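For each dimension in `dimensions`, this section draws $10^5$ points from a mixture of $10$ Gaussians (an equal number of points per component, each with a random mean and a random covariance matrix), rounds them to 5 decimal places, shuffles them, and writes them to `./data/<dimension>d.txt`.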

In [5]:
# NOTE(review): `dists` is not used in this cell; it is kept because a later
# cell may rely on it -- confirm before removing.
dists = []

# Create the data directory once, up front. exist_ok=True replaces the
# exists()-then-makedirs() pair and its check-then-create race.
os.makedirs(data_path, exist_ok=True)

for dimension in dimensions:
    # Draw an equal share of the points from each mixture component.
    # The order of the np.random calls is significant: it determines the
    # generated data for the fixed global seed.
    samples = []
    for mixture_id in range(n_mixtures):
        mean_vector = np.random.rand(dimension)
        rand_matrix = np.random.rand(dimension, dimension)
        cov_matrix = rand_matrix @ rand_matrix.T  # A @ A.T guarantees the PSD condition
        samples.append(
            np.random.multivariate_normal(
                mean=mean_vector, cov=cov_matrix, size=n_points // n_mixtures
            )
        )

    data = np.vstack(samples)
    data = np.round(data, n_digits)  # rounding
    np.random.shuffle(data)  # in-place shuffle along the first axis (rows)

    # Write one point per line, coordinates separated by single spaces.
    # The `with` block closes (and therefore flushes) the file before the
    # next step; the original left the handle open, so the last file written
    # could still hold buffered data when another process reads it.
    filename = '{:}d.txt'.format(dimension)
    with open(os.path.join(data_path, filename), 'w') as f:
        for row in data:
            f.write(" ".join(map(str, row.tolist())) + "\n")

    print("Writing {:} successfully".format(filename))

Writing 2d.txt successfully
Writing 4d.txt successfully
Writing 8d.txt successfully
Writing 16d.txt successfully
Writing 32d.txt successfully
Writing 64d.txt successfully


## Running the proposed method

In [6]:
# Make sure the output directory exists. exist_ok=True makes this a single
# call with no window between checking for the directory and creating it.
os.makedirs(output_path, exist_ok=True)

In [7]:
# Compile the source
# Run `make clean` and then `make` in the C++ source directory.
# check_call raises CalledProcessError when make exits non-zero, so a broken
# build stops the notebook here instead of being silently ignored. On success
# it returns 0, so the cell's displayed value is unchanged.
subprocess.check_call(['make', '-C', cpp_source_path, 'clean'])
subprocess.check_call(['make', '-C', cpp_source_path])

0

In [8]:
# Path of the `experiments` executable inside the C++ source directory.
# It contains a directory separator, so it is resolved relative to the working
# directory rather than looked up on PATH.
experiments_binary = os.path.join(cpp_source_path, 'experiments')

for dimension in dimensions:
    data_filepath = os.path.join(data_path, '{:}d.txt'.format(dimension))
    print("Benchmarking {:}".format(data_filepath))
    start_time = time.perf_counter()
    # Arguments passed to the binary: the data file, a per-dimension path
    # under output_path, and the dimension as a string.
    # check_call raises CalledProcessError on a non-zero exit status, so a
    # failed run is not reported below as if it were a valid timing.
    subprocess.check_call([experiments_binary,
                           data_filepath,
                           os.path.join(output_path, '{:}d'.format(dimension)),
                           str(dimension)])
    end_time = time.perf_counter()
    print("Benchmarking {:} took {:} sec".format(data_filepath, end_time - start_time))

Benchmarking ./data/2d.txt
Benchmarking ./data/2d.txt took 8935.386412563268 sec
Benchmarking ./data/4d.txt
Benchmarking ./data/4d.txt took 5677.770339895971 sec
Benchmarking ./data/8d.txt
Benchmarking ./data/8d.txt took 6712.266199314967 sec
Benchmarking ./data/16d.txt
Benchmarking ./data/16d.txt took 1856.4596596793272 sec
Benchmarking ./data/32d.txt
Benchmarking ./data/32d.txt took 1051.8898305860348 sec
Benchmarking ./data/64d.txt
Benchmarking ./data/64d.txt took 1420.4419205570593 sec


## Running other methods

In [9]:
# NOTE(review): presumably the directory holding the Python implementations of
# the other methods; nothing visible in this part of the notebook reads it yet -- confirm.
python_source_path = './source/source_python'

## Obtaining results