In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import PolynomialFeatures
import datetime
import pytz
from pytz import timezone
import tarfile
from numpy import load
import os
from sklearn.manifold import TSNE
import h5py
from sklearn.decomposition import PCA as sklearnPCA
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn import datasets
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
import scipy as sp
from scipy import spatial
from scipy import stats

In [19]:
# TODO: sonycnode-b827eb491436.sonyc and sonycnode-b827ebc178d2.sonyc are the sensors with the most coverage, so
# filter the data down to those two. Also take 5 random samples from each cluster in addition to the nearest neighbors.

In [20]:
# Open the HDF5 file in read mode. The handle is deliberately left open: `d` below is
# indexed again in later cells (presumably a lazy h5py Dataset backed by this file — confirm).
h5 = h5py.File('sound_data_improved.hdf5', 'r')

# Handle to the 'sound_data' entry; later cells read its 'sensor_id' field and mask its rows.
d = h5['sound_data']

In [22]:
# Read the 'sensor_id' column once and reuse it for both comparisons; indexing `d` by
# field name twice fetches the same column from the HDF5 dataset twice.
sensor_ids = d['sensor_id']

# True for rows recorded by either of the two sensors this analysis is restricted to.
middle_sensor_mask = (
    (sensor_ids == b'sonycnode-b827ebc178d2.sonyc')
    | (sensor_ids == b'sonycnode-b827eb491436.sonyc')
)

In [52]:
# Select the rows of `d` flagged by middle_sensor_mask; later cells sample from d_middle
# and run PCA on its 'feature_vector' field.
d_middle = d[middle_sensor_mask]

In [39]:
# Draw a random subset of d_middle rows. `index` is the boolean row mask that the
# clustering and neighbor-lookup code applies to both `projected` and `d_middle`.
# A seeded RandomState makes the subset identical on every Restart & Run All, in line
# with the random_state=0 already used for MiniBatchKMeans.
num_rows = d_middle.shape[0]
# Never request more rows than exist: choice(..., replace=False) raises ValueError otherwise.
sample_nums = np.random.RandomState(0).choice(num_rows, min(10000, num_rows), replace=False)

index = np.zeros(num_rows, dtype=bool)
index[sample_nums] = True

In [24]:
# Project the feature vectors of every d_middle row onto 106 principal components.
# random_state is pinned so that, if scikit-learn selects a randomized SVD solver for this
# data size, the projection (and the clusters built on it) is reproducible across runs.
pca_106 = sklearnPCA(n_components=106, random_state=0)
projected = pca_106.fit_transform(d_middle['feature_vector'])

In [25]:
def get_cluster_model(num_clusters):
    """
    Fit a mini-batch k-means model on the sampled, PCA-projected data.

    Reads the notebook-level globals ``projected`` and ``index``: only the rows of
    ``projected`` selected by ``index`` are used for fitting.

    Parameters
    ----------
    num_clusters : int
        Number of clusters, passed to MiniBatchKMeans as ``n_clusters``.

    Returns
    -------
    MiniBatchKMeans
        The fitted estimator (created with ``random_state=0``). The centroids are
        available through its ``cluster_centers_`` attribute.
    """
    training_rows = projected[index]
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return MiniBatchKMeans(n_clusters=num_clusters, random_state=0).fit(training_rows)

In [26]:
def convert_to_dt(timestamp):
    """
    Converts a float timestamp to a timezone-aware datetime in US/Eastern.

    Parameters
    ----------
    timestamp : float
        POSIX timestamp (seconds since the Unix epoch).

    Returns
    -------
    dt : datetime.datetime
        Timezone-aware datetime for the same instant, expressed in the 'US/Eastern' zone.
    """
    # Build the aware UTC datetime in one step. datetime.utcfromtimestamp() returns a naive
    # value that then has to be localized, and it is deprecated since Python 3.12.
    dt = datetime.datetime.fromtimestamp(timestamp, tz=pytz.UTC)
    return dt.astimezone(pytz.timezone('US/Eastern'))

In [27]:
def cut_file_path(neighbors_file_path, prefix_length=32):
    """
    Cuts a file name to start with the sensor name.

    Parameters
    ----------
    neighbors_file_path : bytes or str
        File path as stored in the dataset.
    prefix_length : int, optional
        Number of leading characters (bytes) to drop. The default of 32 is the value this
        notebook has always used; presumably it is the length of the directory prefix that
        precedes the sensor name in the stored paths -- verify if the data layout changes.

    Returns
    -------
    bytes or str
        ``neighbors_file_path`` without its first ``prefix_length`` characters (same type
        as the input; empty if the input is not longer than the prefix).
    """
    return neighbors_file_path[prefix_length:]

In [60]:
def make_neighbors_dataframe(num_clusters):
    """
    Returns a DataFrame describing, for every cluster, the five sampled points nearest to
    the cluster centroid and five randomly chosen members of the same cluster.

    The function reads the notebook-level globals ``projected`` (PCA-projected features),
    ``index`` (boolean mask of the sampled rows) and ``d_middle`` (the records the features
    were computed from), and fits the clustering through ``get_cluster_model``. The random
    members are drawn with ``np.random.choice``, i.e. from NumPy's global random state.

    Parameters
    ----------
    num_clusters : int
        Number of clusters to group the projected data into.

    Returns
    -------
    df : DataFrame
        pandas DataFrame with ten rows per cluster (five nearest neighbors followed by five
        random members). Columns, in order: ``is_neighbor`` ('Y' for the nearest-neighbor
        rows, 'N' for the random rows), ``timestamp_orig``, ``timestamp_dt`` (the record's
        ``timestamp`` field converted with ``convert_to_dt``), ``sensor_id``, ``file_path``
        (shortened with ``cut_file_path``), ``centroid_num`` (1-based cluster number),
        ``num_clusters``, and then the coordinates of the cluster's centroid.

    Raises
    ------
    ValueError
        If a cluster contains fewer than five sampled points, so that five neighbors and
        five distinct random members cannot be selected.

    Notes
    -----
    The random members are drawn from the whole cluster, so a row flagged 'N' can be the
    same record as one of the cluster's 'Y' rows.
    """
    n_neighbors = 5
    n_random = 5
    rows_per_cluster = n_neighbors + n_random

    # Fit the model once. Every get_cluster_model() call refits MiniBatchKMeans from
    # scratch, so calling it separately for the labels and for the centroids tripled the
    # cost for an identical (random_state=0) result.
    model = get_cluster_model(num_clusters)
    cluster_centers = model.cluster_centers_

    # Loop-invariant selections: apply the sampling mask once instead of once per cluster.
    sampled_projected = projected[index]
    sampled_records = d_middle[index]
    cluster_assignments = model.predict(sampled_projected)

    # One row per cluster; within a row the neighbors come first, then the random members.
    timestamps = np.empty((num_clusters, rows_per_cluster))
    timestamps_orig = np.empty((num_clusters, rows_per_cluster))
    file_path = np.empty((num_clusters, rows_per_cluster), dtype='S92')
    sensor_id = np.empty((num_clusters, rows_per_cluster), dtype='S60')
    field_targets = (
        (timestamps, 'timestamp'),
        (timestamps_orig, 'timestamp_orig'),
        (file_path, 'file_path'),
        (sensor_id, 'sensor_id'),
    )

    for cluster_index in range(num_clusters):
        # Restrict both the projected points and their source records to this cluster.
        cluster_mask = (cluster_assignments == cluster_index)
        cluster_points = sampled_projected[cluster_mask]
        cluster_records = sampled_records[cluster_mask]
        cluster_size = cluster_points.shape[0]

        # With too few members KDTree.query pads its result with an out-of-range index and
        # np.random.choice(..., replace=False) cannot draw enough items; fail clearly instead.
        if cluster_size < max(n_neighbors, n_random):
            raise ValueError(
                'cluster ' + str(cluster_index) + ' has only ' + str(cluster_size)
                + ' sampled points; at least ' + str(max(n_neighbors, n_random))
                + ' are required'
            )

        # Positions (within this cluster) of the points closest to the centroid.
        tree = spatial.KDTree(cluster_points)
        nearest_neighbors = tree.query(cluster_centers[cluster_index], n_neighbors)[1]
        neighbors_mask = np.zeros(cluster_size, dtype=bool)
        neighbors_mask[nearest_neighbors] = True

        # Random members of the same cluster, drawn without replacement.
        random_nums = np.random.choice(cluster_size, n_random, replace=False)
        random_cluster_mask = np.zeros(cluster_size, dtype=bool)
        random_cluster_mask[random_nums] = True

        # Boolean masks return the records in dataset order, not in distance/draw order.
        d_neighbors = cluster_records[neighbors_mask]
        d_random = cluster_records[random_cluster_mask]

        for target, field in field_targets:
            target[cluster_index, :n_neighbors] = d_neighbors[field]
            target[cluster_index, n_neighbors:] = d_random[field]

        print('done with cluster ' + str(cluster_index))

    timestamps_dt = [convert_to_dt(x) for x in timestamps.flatten()]
    file_path_cut = [cut_file_path(x) for x in file_path.flatten()]

    # Per-row bookkeeping columns, laid out to match the flattened (cluster, slot) arrays.
    n_rows = num_clusters * rows_per_cluster
    is_neighbor = (n_neighbors * ['Y'] + n_random * ['N']) * num_clusters
    centroids = [
        cluster_centers[cluster_index]
        for cluster_index in range(num_clusters)
        for _ in range(rows_per_cluster)
    ]
    centroid_num_arr = [
        cluster_index + 1
        for cluster_index in range(num_clusters)
        for _ in range(rows_per_cluster)
    ]
    num_centroids = n_rows * [num_clusters]

    # Making the dataframe: centroid coordinates first, then the metadata columns are
    # inserted in front of them.
    df = pd.DataFrame(centroids)
    df.insert(0, 'is_neighbor', is_neighbor, True)
    df.insert(1, "timestamp_orig", timestamps_orig.flatten(), True)
    df.insert(2, "timestamp_dt", timestamps_dt, True)
    df.insert(3, "sensor_id", sensor_id.flatten(), True)
    df.insert(4, "file_path", file_path_cut, True)
    df.insert(5, "centroid_num", centroid_num_arr, True)
    df.insert(6, "num_clusters", num_centroids, True)

    return df


In [61]:
# Build the neighbor/random-sample table for 2**6 = 64 clusters.
df_2 = make_neighbors_dataframe(2 ** 6)

done with cluster 0
done with cluster 1
done with cluster 2
done with cluster 3
done with cluster 4
done with cluster 5
done with cluster 6
done with cluster 7
done with cluster 8
done with cluster 9
done with cluster 10
done with cluster 11
done with cluster 12
done with cluster 13
done with cluster 14
done with cluster 15
done with cluster 16
done with cluster 17
done with cluster 18
done with cluster 19
done with cluster 20
done with cluster 21
done with cluster 22
done with cluster 23
done with cluster 24
done with cluster 25
done with cluster 26
done with cluster 27
done with cluster 28
done with cluster 29
done with cluster 30
done with cluster 31
done with cluster 32
done with cluster 33
done with cluster 34
done with cluster 35
done with cluster 36
done with cluster 37
done with cluster 38
done with cluster 39
done with cluster 40
done with cluster 41
done with cluster 42
done with cluster 43
done with cluster 44
done with cluster 45
done with cluster 46
done with cluster 47
do

In [63]:
# Preview the first rows of the 64-cluster table built above.
df_2.head()

Unnamed: 0,is_neighbor,timestamp_orig,timestamp_dt,sensor_id,file_path,centroid_num,num_clusters,0,1,2,...,96,97,98,99,100,101,102,103,104,105
0,Y,1559518000.0,2019-06-02 19:31:45-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-02/19/1...,1,64,17.653128,3.403607,-0.457391,...,-0.009764,-0.009314,-0.038994,-0.091355,-0.115825,0.087878,0.086923,-0.06943,0.070162,-0.049835
1,Y,1559693000.0,2019-06-04 19:55:20-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-04/19/1...,1,64,17.653128,3.403607,-0.457391,...,-0.009764,-0.009314,-0.038994,-0.091355,-0.115825,0.087878,0.086923,-0.06943,0.070162,-0.049835
2,Y,1560626000.0,2019-06-15 15:19:10-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-15/15/1...,1,64,17.653128,3.403607,-0.457391,...,-0.009764,-0.009314,-0.038994,-0.091355,-0.115825,0.087878,0.086923,-0.06943,0.070162,-0.049835
3,Y,1561209000.0,2019-06-22 09:17:31-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-22/09/1...,1,64,17.653128,3.403607,-0.457391,...,-0.009764,-0.009314,-0.038994,-0.091355,-0.115825,0.087878,0.086923,-0.06943,0.070162,-0.049835
4,Y,1561616000.0,2019-06-27 02:21:17-04:00,b'sonycnode-b827ebc178d2.sonyc',b'sonycnode-b827ebc178d2.sonyc/2019-06-27/02/1...,1,64,17.653128,3.403607,-0.457391,...,-0.009764,-0.009314,-0.038994,-0.091355,-0.115825,0.087878,0.086923,-0.06943,0.070162,-0.049835


In [33]:
# Scratch check: np.concatenate joins two 1-D arrays end to end.
test, test_2 = np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8])
np.concatenate([test, test_2])

array([1, 2, 3, 4, 5, 6, 7, 8])

In [49]:
# Scratch check: filling a (2, 3) float buffer row by row and flattening it yields the
# two source rows back to back.
flatten_test = np.array([5, 6, 7])
f_2 = np.array([8, 9, 10])
f_1 = np.empty((2, 3))
f_1[0], f_1[1] = flatten_test, f_2
f_1.flatten()

array([ 5.,  6.,  7.,  8.,  9., 10.])