In [177]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import PolynomialFeatures
import datetime
import pytz
from pytz import timezone
import tarfile
from numpy import load
import os
from sklearn.manifold import TSNE
import h5py
from sklearn.decomposition import PCA as sklearnPCA
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn import datasets
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
import scipy as sp
from scipy import spatial
from scipy import stats

In [3]:
h5 = h5py.File('sound_data_improved.hdf5', 'r')

d = h5['sound_data']

In [182]:
sample_nums = np.random.choice(range(d.shape[0]), 10000, replace=False)

index = np.zeros(d.shape[0]).astype('bool')
index[sample_nums] = True


In [5]:
pca_106 = sklearnPCA(106)
projected = pca_106.fit_transform(d['feature_vector'])

In [178]:
def get_cluster_model(num_clusters):
    """

    Parameters
    ----------
    num_clusters

    Returns
    -------
    mbk.cluster_centers_ : array of shape (num_clusters, 45)
        An array of the feature vectors for each centroid in each cluster.

    """
    mbk = MiniBatchKMeans(n_clusters=num_clusters, random_state=0)
    mbk.fit(projected[index])
    return mbk

In [186]:
def cut_file_path(neighbors_file_path):
    """
    Cuts a file name to start with the sensor name.
    """
    return(neighbors_file_path[32:])   

In [26]:
def make_neighbors_dataframe(num_clusters):
    """
    Returns a DataFrame with the information (timestamp, filepath, etc) from five neighbors of each centroid in each
    cluster the data is grouped into.

    Parameters
    ----------
    num_clusters : int
        Number of clusters to group the projected data into.

    Returns
    -------
    df : DataFrame
        pandas DataFrame listing information about each neighbor, including timestamp, filepath, the centroid it is
        associated with, and the number of clusters the projected data is grouped into.
    """
    cluster_assignments = get_cluster_model(num_clusters).predict(projected[index])
    cluster_centers = get_cluster_model(num_clusters).cluster_centers_
    centroid_cluster_assignments = get_cluster_model(num_clusters).predict(cluster_centers)
    
    print(centroid_cluster_assignments)
    print(len(cluster_centers))
    
    nearest_neighbors = []
   
    for cluster_index in range(num_clusters):
        #for each cluster center, query only the cluster it belongs to
        #projected[index] is all the points, how to filter out only the points belonging to one cluster
        cluster_test = []
        for i in cluster_assignments:
            if i == cluster_index:
                cluster_test.append(projected[index][i])
        tree = spatial.KDTree(cluster_test)
        nearest_neighbors = tree.query(cluster_centers, 5)
    
    
    # Creates array of indices of elements in projected_45 that match the neighbors
    # Creating array of centroid feature vectors corresponding to each neighbor, also which centroid the neighbor belongs to

    neighbors_arr = []
    centroids = []
    centroid_num_arr = []
    print(nearest_neighbors[0])
    for centroid_num, x in enumerate(nearest_neighbors[1]):
        for y in x:
            neighbors_idx = np.nonzero(index)[0][y]
            neighbors_arr.append(neighbors_idx)
#             neighbors_arr.append(y)
            centroids.append(cluster_centers[centroid_num])
            centroid_num_arr.append(centroid_num + 1)

    # Mask for elements of d that are neighbors
    index_2 = np.zeros(d.shape[0]).astype('bool')
#     print('unsorted: ' + str(len(neighbors_arr)))
#     print('sorted: ' + str(len(np.sort(neighbors_arr))))
    index_2[np.sort(neighbors_arr)] = True
#     print(np.sort(neighbors_arr))
#     print('i: ' + str(len(index_2[np.sort(neighbors_arr)])))
#     import pdb; pdb.set_trace()
    a_count = 0
    for a in np.sort(neighbors_arr):
        if index_2[a]:
#             print(str(a) + ': ' + str(index_2[a]))
            
            a_count += 1
    
    count = 0
    for b in index_2:
        if b:
#             print(str(b))
            count += 1
    
    print(str(a_count))
    print(str(count))
            

    # Creating array with number of clusters for each entry
    num_clusters_arr = len(neighbors_arr) * [num_clusters]

    # Converting timestamps to datetime format
#     print(d['timestamp'].shape)
    neighbors_timestamps_dt = []
    for i in neighbors_arr:
#         print(i) 
        j = d[i, 'timestamp']
        dt = datetime.datetime.utcfromtimestamp(j)
        dt = pytz.UTC.localize(dt)
        dt = dt.astimezone(pytz.timezone('US/Eastern'))
        neighbors_timestamps_dt.append(dt)

    # Cutting the filepath so it starts with the sensor name
    test_cut_path = cut_file_path(d[index_2]['file_path'])
    print('cut file path size: ' + str(len(test_cut_path)))

    # Making the dataframe
    df = pd.DataFrame(centroids)
    print('centroid: ' + str(len(centroids)))
    df.insert(0, "timestamp_orig", d[index_2]['timestamp_orig'], True)
    df.insert(1, "timestamp_dt", neighbors_timestamps_dt, True)
    df.insert(2, "sensor_id", d[index_2]['sensor_id'], True)
    df.insert(3, "file_path", test_cut_path, True)
    df.insert(4, "centroid_num", centroid_num_arr, True)
    df.insert(5, "num_clusters", num_clusters_arr, True)

    return df


In [25]:
df_2 = make_neighbors_dataframe(2 ** 6)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63]
64


KeyboardInterrupt: 

In [255]:
cluster_assignments = get_cluster_model(64).predict(projected[index])
cluster_centers = get_cluster_model(64).cluster_centers_
centroid_cluster_assignments = get_cluster_model(64).predict(cluster_centers)
    
# print(centroid_cluster_assignments)
# print(len(cluster_centers))

centroids = []
centroid_num_arr = []
num_centroids = 64*5*[64]
neighbor_timestamps = np.empty((64,5))
neighbor_timestamps_orig = np.empty((64,5))
neighbor_file_path = np.empty((64,5), dtype='S92')
# neighbor_file_path = []
neighbor_sensor_id = np.empty((64,5), dtype='S60')
# neighbor_timestamps_dt = np.empty((64*5), dtype = datetime.datetime)
# print(neighbor_timestamps_dt.dtype)
   
for i,cluster_index in enumerate(range(64)):
    #for each cluster center, query only the cluster it belongs to
    
    #Filter out only the points belonging to one cluster
    cluster_mask = (cluster_assignments==cluster_index)
    cluster_test = projected[index][cluster_mask]
    
    #Makes a list of the centroid of the cluster with length of the number of the points in the cluster
    centroid_list = 5*[cluster_centers[cluster_index]]
    centroids += centroid_list
    
    #Makes a list of the cluster index with length of the number of the points in the cluster
    centroid_num_list = 5*[cluster_index]
    centroid_num_arr += centroid_num_list
    
    print(len(cluster_test))
    nearest_neighbors = []
    tree = spatial.KDTree(cluster_test)
#     print(cluster_centers[cluster_index])
    nearest_neighbors = tree.query(cluster_centers[cluster_index], 5)[1]
    
    #from only the points corresponding to a certain cluster in the 10000 subset of projected, apply the nearest
    #neighbors mask to obtain the other characteristics like file path, timestamp, etc
    
    neighbors_mask = np.zeros(len(cluster_test)).astype('bool')
    neighbors_mask[np.sort(nearest_neighbors)] = True
    
    neighbor_timestamps_prelim = d[index][cluster_mask][neighbors_mask]['timestamp']
#     neighbor_timestamps.append(neighbor_timestamps_prelim)
#     neighbor_timestamps_orig.append(d[index][cluster_mask][neighbors_mask]['timestamp_orig'])
#     neighbor_cut_file_path.append(cut_file_path(d[index][cluster_mask][neighbors_mask]['file_path']))
#     neighbor_sensor_id.append(d[index][cluster_mask][neighbors_mask]['sensor_id'])
    
    neighbor_timestamps[i] = (neighbor_timestamps_prelim)
    neighbor_timestamps_orig[i] = (d[index][cluster_mask][neighbors_mask]['timestamp_orig'])
    neighbor_file_path_inner = np.empty(5, dtype= 'S92')
    neighbor_file_path_inner = (d[index][cluster_mask][neighbors_mask]['file_path'])
#     print(neighbor_file_path_inner)
    neighbor_file_path[i] = neighbor_file_path_inner
    neighbor_sensor_id[i] = (d[index][cluster_mask][neighbors_mask]['sensor_id'])
    
#     for time in range(len(neighbor_timestamps_prelim)):
# #         print(i) 
#         j = neighbor_timestamps_prelim[time]
#         dt = datetime.datetime.utcfromtimestamp(j)
#         dt = pytz.UTC.localize(dt)
#         dt = dt.astimezone(pytz.timezone('US/Eastern'))
#         neighbor_timestamps_dt[i][time] = dt

neighbor_timestamps_dt = [convert_to_dt(x) for x in neighbor_timestamps.flatten()]
neighbor_cut_file_path = [cut_file_path(x) for x in neighbor_file_path.flatten()]

121
152
177
258
108
98
337
212
349
98
364
19
96
236
142
217
204
122
11
41
175
89
96
93
300
308
104
108
137
220
85
184
129
353
151
134
61
182
202
29
170
63
45
139
269
374
12
42
117
112
173
301
93
127
82
175
80
276
114
63
143
131
114
283


In [166]:
def convert_to_dt(timestamp):
    dt = datetime.datetime.utcfromtimestamp(timestamp)
    dt = pytz.UTC.localize(dt)
    dt = dt.astimezone(pytz.timezone('US/Eastern'))
    return dt

In [257]:
df = pd.DataFrame(centroids)
df.insert(0, "timestamp_orig", neighbor_timestamps_orig.flatten(), True)
df.insert(1, "timestamp_dt", neighbor_timestamps_dt, True)
df.insert(2, "sensor_id", neighbor_sensor_id.flatten(), True)
df.insert(3, "file_path", neighbor_cut_file_path, True)
df.insert(4, "centroid_num", centroid_num_arr, True)
df.insert(5, "num_clusters", num_centroids, True)

In [258]:
df.head()

Unnamed: 0,timestamp_orig,timestamp_dt,sensor_id,file_path,centroid_num,num_clusters,0,1,2,3,...,96,97,98,99,100,101,102,103,104,105
0,1559585000.0,2019-06-03 14:09:47-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-03/14/1...,0,64,-2.499462,2.348578,2.582144,0.499978,...,-0.002518,0.073646,0.03729,-0.005156,-0.04802,0.039227,0.006568,-0.006354,0.054845,-0.025968
1,1559610000.0,2019-06-03 20:58:56-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-03/20/1...,0,64,-2.499462,2.348578,2.582144,0.499978,...,-0.002518,0.073646,0.03729,-0.005156,-0.04802,0.039227,0.006568,-0.006354,0.054845,-0.025968
2,1560003000.0,2019-06-08 10:08:55-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-08/10/1...,0,64,-2.499462,2.348578,2.582144,0.499978,...,-0.002518,0.073646,0.03729,-0.005156,-0.04802,0.039227,0.006568,-0.006354,0.054845,-0.025968
3,1560812000.0,2019-06-17 18:51:37-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-17/18/1...,0,64,-2.499462,2.348578,2.582144,0.499978,...,-0.002518,0.073646,0.03729,-0.005156,-0.04802,0.039227,0.006568,-0.006354,0.054845,-0.025968
4,1561846000.0,2019-06-29 18:09:45-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-29/18/1...,0,64,-2.499462,2.348578,2.582144,0.499978,...,-0.002518,0.073646,0.03729,-0.005156,-0.04802,0.039227,0.006568,-0.006354,0.054845,-0.025968


In [246]:
test = []
test.append(d[0, 'file_path'])

In [247]:
test[0]

b'/Users/marin/redhook/embeddings/sonycnode-b827ebc178d2.sonyc/2019-06-01/00/1559361616.89.npz'