In [163]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import os
import pickle
from sklearn.model_selection import KFold
import sys
sys.path.insert(1, '../modules')
import data

Create numpy array of embeddings

In [2]:
embedding_list = []

In [3]:
timestamps = []

In [4]:
# Load every saved embedding archive and keep each file-name stem (used later
# as the timestamp label) at the same position as its embedding.
# Presumably there is one sub-folder per recording day -- TODO confirm.
EMBEDDINGS_DIR = '/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings'

for folder in os.listdir(EMBEDDINGS_DIR):
    folder_path = os.path.join(EMBEDDINGS_DIR, folder)
    for file in os.listdir(folder_path):
        timestamps.append(file.split('.')[0])
        # Bind the archive to `npz`, not `data`: this notebook imports a module
        # named `data`, and reusing that name here would hide the module from
        # every later cell. The `with` block also closes the archive even if
        # reading the 'embedding' entry raises.
        with np.load(os.path.join(folder_path, file)) as npz:
            emb = npz['embedding']
        embedding_list.append(emb)

In [5]:
len(embedding_list)

1523

In [6]:
embedding_list = np.asarray(embedding_list)

In [7]:
embedding_list.shape

(1523, 19, 512)

In [8]:
expanded_embedding_list = []

In [9]:
# Flatten the (clips, rows, dims) stack into one long list of row vectors:
# extending with a 2-D array appends each of its rows in order.
for clip in embedding_list:
    expanded_embedding_list.extend(clip)

In [10]:
len(expanded_embedding_list)

28937

In [11]:
expanded_embedding_list = np.asarray(expanded_embedding_list)

In [12]:
expanded_embedding_list.shape

(28937, 512)

In [13]:
# Fit a 45-component PCA on the flattened row vectors.
# NOTE(review): no random_state is passed; if the solver chosen by 'auto' is a
# randomized one, repeated runs may differ slightly -- consider pinning a seed.
pca_45 = sklearnPCA(n_components=45)
pca_45.fit(expanded_embedding_list)

PCA(copy=True, iterated_power='auto', n_components=45, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [14]:
embedding_list = np.asarray([pca_45.transform(embedding) for embedding in embedding_list])

In [15]:
embedding_list.shape

(1523, 19, 45)

In [16]:
def get_summary(embedding):
    """Summarise an embedding by per-column statistics over its first axis.

    Parameters
    ----------
    embedding : array-like
        Array whose axis 0 is reduced away.

    Returns
    -------
    numpy.ndarray
        Concatenation, in this order, of the mean, max, min and standard
        deviation taken along axis 0.
    """
    # Order matters: downstream consumers see [mean | max | min | std].
    reducers = (np.mean, np.amax, np.amin, np.std)
    return np.concatenate([reduce(embedding, axis=0) for reduce in reducers])

In [17]:
embedding_summaries = np.asarray([get_summary(embedding) for embedding in embedding_list])

In [18]:
embedding_summaries.shape

(1523, 180)

Joining embeddings with timestamp label
NOTE: 1523 is the correct number of embeddings (one per timestamp).

In [20]:
timestamps = np.asarray(timestamps)

In [32]:
timestamps.shape

(1523,)

In [63]:
embedding_summaries.shape

(1523, 180)

In [73]:
labeled_embedding_summaries = []

In [74]:
# Pair every timestamp with the summary vector stored at the same position.
labeled_embedding_summaries.extend(
    [timestamps[idx], embedding_summaries[idx]] for idx in range(len(timestamps))
)

In [118]:
# Build an (N, 2) object array whose column 0 is the integer timestamp and
# whose column 1 is the summary vector for that timestamp.
# Each row mixes a scalar with an array, so letting np.asarray infer the shape
# relies on ragged-sequence handling that newer NumPy versions reject; fill a
# pre-allocated object array instead.
_pairs = [(int(timestamp), embedding) for timestamp, embedding in labeled_embedding_summaries]
_table = np.empty((len(_pairs), 2), dtype=object)
for _row, (_timestamp, _summary) in enumerate(_pairs):
    _table[_row, 0] = _timestamp
    _table[_row, 1] = _summary
labeled_embedding_summaries = _table

In [119]:
labeled_embedding_summaries.shape

(1523, 2)

In [121]:
labeled_embedding_summaries[0][0]

1573060932

In [89]:
labeled_embedding_summaries[0][1].shape

(180,)

In [143]:
# Reorder the (timestamp, summary) rows by ascending timestamp (column 0).
# A stable sort keeps rows with equal timestamps in their existing order.
_timestamp_order = np.argsort(labeled_embedding_summaries[:, 0].astype(np.int64), kind='stable')
embedding_sorted = labeled_embedding_summaries[_timestamp_order]

Random Forest on day-grouped embeddings

Use K-Fold Cross Validation

In [150]:
embeddings_day = []
timestamps_day = []

In [151]:
# Load the embedding archives again, this time keeping one list per sub-folder
# so that whole folders can be held out together during cross-validation.
EMBEDDINGS_DIR = '/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings'

# os.listdir returns entries in arbitrary order. Sort folders and files so the
# per-folder groups come out in a reproducible, name-ascending order.
# NOTE(review): confirm that folder names sort chronologically, so group k here
# corresponds to the k-th day used when grouping the annotations.
for folder in sorted(os.listdir(EMBEDDINGS_DIR)):
    folder_path = os.path.join(EMBEDDINGS_DIR, folder)
    day_arr = []
    for file in sorted(os.listdir(folder_path)):
        timestamps_day.append(file.split('.')[0])
        # Use `npz` rather than `data` so the imported `data` module is not
        # shadowed; the `with` block closes the archive even on error.
        with np.load(os.path.join(folder_path, file)) as npz:
            emb = npz['embedding']
        day_arr.append(emb)
    embeddings_day.append(day_arr)

In [158]:
# Keep one entry per folder in a 1-D object array so it can be indexed with the
# integer index arrays produced by KFold. Folders may hold different numbers of
# clips, so the nested list can be ragged; np.asarray on ragged input is
# rejected by newer NumPy versions, hence the explicit object array.
_per_day = np.empty(len(embeddings_day), dtype=object)
for _pos, _day_embeddings in enumerate(embeddings_day):
    _per_day[_pos] = _day_embeddings
embeddings_day = _per_day
timestamps_day = np.asarray(timestamps_day)

In [175]:
len(embeddings_day[0])

94

Divide annotations by day

Creating Y list of classifications

In [81]:
# Read the pickled annotation list from the shared project output directory.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/annotation_list.pickle', "rb") as f:
    annotation_list = pickle.load(f)

In [95]:
len(annotation_list)

1538

In [126]:
annotation_list = [(int(timestamp), annotation) for (timestamp, annotation) in annotation_list]

In [127]:
annotation_list[0]

(1573060932, 'n')

Deleting timestamps that aren't in labeled_embedding_summaries, so both arrays have the same size

In [129]:
# Keep only annotations whose timestamp also has an embedding summary.
# Collect the known timestamps into a set once, instead of re-slicing column 0
# and scanning the whole array for every annotation.
known_timestamps = set(labeled_embedding_summaries[:, 0])
annotation_list_cut = [
    annotation for annotation in annotation_list
    if annotation[0] in known_timestamps
]

In [130]:
len(annotation_list_cut)

1523

In [148]:
annotation_list_cut[7]

(1573062134, 'n')

In [138]:
annotation_list_sorted = sorted(annotation_list_cut, key=lambda tup: tup[0])

In [165]:
annotation_list_sorted[0][0]

1573060932

In [193]:
annotation_list_day = []

In [194]:
# Day values compared against the `.day` attribute of each converted
# annotation timestamp when grouping annotations by day.
# NOTE(review): 14 is absent -- presumably there is no data for that day; confirm.
day_list = [6, 7, 8, 9, 10, 11, 12, 13, 15]

In [195]:
# Group the time-sorted annotations into one list per entry of `day_list`.
#
# Fixes relative to the previous version of this cell:
#   * `start_day` was never defined, so the cell raised NameError on a fresh kernel;
#   * the scan index was reset to 0 for every day, so only the first day matched;
#   * a single list was reused, so every group was the same object;
#   * the scan had no bounds check and could run past the end of the list.

# Re-import under a private alias: other cells bind the name `data` to np.load
# results, which would hide the module by the time this cell runs.
import data as redhook_data

# Convert each timestamp once; `.day` is compared against `day_list` values.
annotation_days = [
    redhook_data.convert_timestamps(annotation[0]).day
    for annotation in annotation_list_sorted
]

annotation_list_day = []
current_day_list = []  # stays defined even when day_list is empty
for day in day_list:
    # Fresh list per day; input order (ascending timestamp) is preserved.
    current_day_list = [
        annotation
        for annotation, annotation_day in zip(annotation_list_sorted, annotation_days)
        if annotation_day == day
    ]
    annotation_list_day.append(current_day_list)

In [198]:
len(current_day_list)
len(annotation_list_day)

9

In [202]:
# Store one annotation list per day in a 1-D object array so it can be indexed
# with the integer index arrays produced by KFold. Days can hold different
# numbers of annotations, and np.asarray on such ragged input is rejected by
# newer NumPy versions, so fill an object array explicitly.
_annotations_per_day = np.empty(len(annotation_list_day), dtype=object)
for _pos, _day_annotations in enumerate(annotation_list_day):
    _annotations_per_day[_pos] = _day_annotations
annotation_list_day = _annotations_per_day

Now we have a list of annotations grouped by day!

In [203]:
kf = KFold(n_splits=9)
kf.get_n_splits(embeddings_day)

9

In [204]:
# Walk the KFold splits of the per-day groups, printing each fold's indices and
# indexing the embeddings and the annotations with the same index arrays.
# NOTE(review): X_train/X_test/y_train/y_test are overwritten on every pass, so
# only the last fold's split is left after the loop; fit/evaluate inside it.
for train_index, test_index in kf.split(embeddings_day):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = embeddings_day[train_index], embeddings_day[test_index]
    y_train, y_test = annotation_list_day[train_index], annotation_list_day[test_index]

TRAIN: [1 2 3 4 5 6 7 8] TEST: [0]
TRAIN: [0 2 3 4 5 6 7 8] TEST: [1]
TRAIN: [0 1 3 4 5 6 7 8] TEST: [2]
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3]
TRAIN: [0 1 2 3 5 6 7 8] TEST: [4]
TRAIN: [0 1 2 3 4 6 7 8] TEST: [5]
TRAIN: [0 1 2 3 4 5 7 8] TEST: [6]
TRAIN: [0 1 2 3 4 5 6 8] TEST: [7]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8]
