In [542]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import os
import pickle
from sklearn.model_selection import KFold
import sys
sys.path.insert(1, '../modules')
import data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support

# Manipulating Embeddings

Brief outline of what this notebook should look like eventually:
1. load the annotations
2. filter out maybes
3. load the embeddings into the list based on what's in the annotation list (so both have the same size)
4. cross validation time

In [2]:
embedding_list = []

In [3]:
timestamps = []

In [4]:
for folder in \
    os.listdir('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings'):
        for file in os.listdir\
        ('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/' + folder):
            timestamps.append(file.split('.')[0])
            data = \
            np.load('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/'\
                    + folder + '/' + file)
            emb = data['embedding']
            embedding_list.append(emb)
            data.close()

In [16]:
def get_summary(embedding):
    embedding_mean = np.mean(embedding, axis=0)
    embedding_std = np.std(embedding, axis=0)
    embedding_max = np.amax(embedding, axis=0)
    embedding_min = np.amin(embedding, axis=0)
    embedding_summary = np.concatenate((embedding_mean, embedding_max, embedding_min, embedding_std))
    return embedding_summary

In [442]:
def manipulate_embeddings_train(embedding_list):
#     print(embedding_list.shape)


    expanded_embedding_list = \
    np.reshape(embedding_list, (embedding_list.shape[0]*embedding_list.shape[1], 512))
    
    #should be something like (28937, 512)
#     print(expanded_embedding_list.shape)
    
    pca_45 = sklearnPCA(45)
    pca_45.fit(expanded_embedding_list)
    
    embedding_list = np.asarray([pca_45.transform(embedding) for embedding in embedding_list])
    
    #should be something like (1523, 19, 45)
#     print(embedding_list.shape)
    
    embedding_summaries = np.asarray([get_summary(embedding) for embedding in embedding_list])

#     print(embedding_summaries.shape)
    
    return(embedding_summaries)

In [None]:
def manipulate_embeddings_test(embedding_list):
#     print(embedding_list.shape)


    expanded_embedding_list = \
    np.reshape(embedding_list, (embedding_list.shape[0]*embedding_list.shape[1], 512))
    
    #should be something like (28937, 512)
#     print(expanded_embedding_list.shape)
    
    pca_45.fit(expanded_embedding_list)
    
    embedding_list = np.asarray([pca_45.transform(embedding) for embedding in embedding_list])
    
    #should be something like (1523, 19, 45)
#     print(embedding_list.shape)
    
    embedding_summaries = np.asarray([get_summary(embedding) for embedding in embedding_list])

#     print(embedding_summaries.shape)
    
    return(embedding_summaries)

Joining embeddings with timestamp label
NOTE: 1509 is the accurate number of counts

In [20]:
timestamps = np.asarray(timestamps)

In [32]:
timestamps.shape

(1523,)

In [63]:
embedding_summaries.shape

(1523, 180)

In [73]:
labeled_embedding_summaries = []

In [74]:
for i in range(len(timestamps)):
    labeled_embedding_summaries.append([timestamps[i], embedding_summaries[i]])

In [118]:
labeled_embedding_summaries = np.asarray([[int(timestamp), embedding] for [timestamp, embedding] in labeled_embedding_summaries])

In [119]:
labeled_embedding_summaries.shape

(1523, 2)

In [121]:
labeled_embedding_summaries[0][0]

1573060932

In [89]:
labeled_embedding_summaries[0][1].shape

(180,)

In [143]:
embedding_sorted = np.asarray(sorted(labeled_embedding_summaries, key=lambda tup: tup[0]))

Divide annotations by day

Creating Y list of classifications

In [248]:
with open('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/annotation_list.pickle', "rb") as f:
       annotation_list = pickle.load(f)

In [249]:
len(annotation_list)

1538

In [250]:
annotation_list = [(int(timestamp), annotation) for (timestamp, annotation) in annotation_list]

In [251]:
annotation_list[0]

(1573060932, 'n')

Deleting timestamps that aren't in labeled_embedding_summaries, so both arrays have the same size

In [252]:
annotation_list_cut = []
for annotation in annotation_list:
    if annotation[0] in labeled_embedding_summaries[:,0]:
        annotation_list_cut.append(annotation)

In [253]:
len(annotation_list_cut)

1523

In [254]:
annotation_list_cut[7]

(1573063463, 'n')

Sort annotation list by timestamp, then filter out anything that's not a yes or no

In [306]:
annotation_list_sorted = sorted(annotation_list_cut, key=lambda tup: tup[0])

In [307]:
annotation_list_sorted[0]

(1573060932, 'n')

In [397]:
annotation_list_sorted = \
np.asarray([annotation for annotation in annotation_list_sorted if (annotation[1] == 'y' or annotation[1] == 'n')])

In [398]:
len(annotation_list_sorted)

1509

In [400]:
annotation_list_sorted[:,0]

array(['1573060932', '1573061031', '1573061130', ..., '1573852939',
       '1573853129', '1573853688'], dtype='<U21')

1509 is the number of annotations excluding "maybes"

In [368]:
annotation_list_day = []

In [369]:
day_list = [6, 7, 8, 9, 10, 11, 12, 13, 15]

In [370]:
# current_day = day_list[0]
for day in day_list:
#     print(day)
    i = 0
    current_day_list = []
#     while data.convert_timestamps(annotation_list_sorted[i][0]).day == day:
#         current_day = data.convert_timestamps(annotation_list_sorted[i][0]).day
#         current_day_list.append(annotation_list_sorted[i])
#         i += 1
    for annotation in annotation_list_sorted:
        if data.convert_timestamps(annotation[0]).day == day:
            current_day_list.append(annotation)
        
    annotation_list_day.append(current_day_list)

In [383]:
annotation_list_day = np.asarray(annotation_list_day)

In [384]:
len(current_day_list)
len(annotation_list_day[2])

160

In [388]:
annotation_list_day

(9,)

Now we have a list of annotations grouped by day!

Collecting embeddings by day, filtered to make sure we only have embeddings corresponding to yes/no annotations

In [412]:
embeddings_day = []
timestamps_day = []

In [416]:
for folder in \
    os.listdir('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings'):
    day_arr = []
    day_time_arr = []
    for file in os.listdir\
        ('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/' + folder):
#         print(int(file.split('.')[0]))
        if file.split('.')[0] in annotation_list_sorted[:,0]:
            day_time_arr.append(file.split('.')[0])
            data = \
            np.load('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/'\
                    + folder + '/' + file)
            emb = data['embedding'] 
            day_arr.append(emb)
            data.close()
    timestamps_day.append(np.asarray(day_time_arr))
    embeddings_day.append(np.asarray(day_arr))

In [417]:
embeddings_day = np.asarray(embeddings_day)
timestamps_day = np.asarray(timestamps_day)

In [420]:
total_embeddings = 1509

In [421]:
kf = KFold(n_splits=9)
kf.get_n_splits(embeddings_day)

9

For every train/test split, run PCA, get embedding summaries, use standard scaler

In [458]:
for train_index, test_index in kf.split(embeddings_day):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = embeddings_day[train_index], embeddings_day[test_index]
    y_train, y_test = annotation_list_day[train_index], annotation_list_day[test_index]
    
    total_train= []
    for day in X_train:
        for embedding in day:
            total_train.append(embedding)
     
    total_train = np.asarray(total_train)
    
    total_train_annotations = []
    for day in y_train:
        for annotation in day:
            total_train_annotations.append(annotation[1])
            
    total_train_annotations = np.asarray(total_train_annotations)
    
    #manipulate embeddings for each X_train
    expanded_total_train = \
    np.reshape(total_train, (total_train.shape[0]*total_train.shape[1], 512))
    
    pca_45 = sklearnPCA(45)
    pca_45.fit(expanded_total_train)
    
    total_train = np.asarray([pca_45.transform(embedding) for embedding in total_train])

    total_train_summaries = np.asarray([get_summary(embedding) for embedding in total_train])

    #standard scaler
    scaler = StandardScaler()
    scaler.fit(total_train_summaries)
    total_train_scaler = scaler.transform(total_train_summaries)

    #random forest on training data
    clf = RandomForestClassifier()
    clf.fit(total_train_scaler, total_train_annotations)
    
    #now apply to test data!
    test = np.asarray(X_test[0])
    print(test.shape)
    
    #manipulate embeddings for each X_test
    expanded_test = \
    np.reshape(test, (test.shape[0]*test.shape[1], 512))
    
    test = np.asarray([pca_45.transform(embedding) for embedding in test])

    test_summaries = np.asarray([get_summary(embedding) for embedding in test])

    print(test_summaries.shape)
    
    #standard scaler on test data
    test_scaler = scaler.transform(test_summaries)
    
    #random forest on test data
    predictions = clf.predict(test_scaler)

TRAIN: [1 2 3 4 5 6 7 8] TEST: [0]
(88, 19, 512)
(88, 180)
TRAIN: [0 2 3 4 5 6 7 8] TEST: [1]
(199, 19, 512)
(199, 180)
TRAIN: [0 1 3 4 5 6 7 8] TEST: [2]
(160, 19, 512)
(160, 180)
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3]
(107, 19, 512)
(107, 180)
TRAIN: [0 1 2 3 5 6 7 8] TEST: [4]
(115, 19, 512)
(115, 180)
TRAIN: [0 1 2 3 4 6 7 8] TEST: [5]
(182, 19, 512)
(182, 180)
TRAIN: [0 1 2 3 4 5 7 8] TEST: [6]
(190, 19, 512)
(190, 180)
TRAIN: [0 1 2 3 4 5 6 8] TEST: [7]
(215, 19, 512)
(215, 180)
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8]
(253, 19, 512)
(253, 180)


# Starting over, using Leave One Group Out method instead

In [475]:
#y list is just the annotations
y = annotation_list_sorted[:,1]

In [477]:
y[0]

'n'

In [498]:
embeddings_cut = []

In [499]:
for folder in \
    os.listdir('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings'):
        for file in os.listdir\
        ('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/' + folder):
            if(file.split('.')[0]) in annotation_list_sorted[:,0]:
                data = \
                np.load('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/'\
                    + folder + '/' + file)
                emb = data['embedding']
                embeddings_cut.append(emb)
#             timestamps.append(file.split('.')[0])
#             data = \
#             np.load('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings/'\
#                     + folder + '/' + file)
#             emb = data['embedding']
#             embedding_list.append(emb)
#             data.close()

In [513]:
X = np.asarray(embeddings_cut)

In [514]:
X.shape

(1509, 19, 512)

In [515]:
groups = np.asarray([data.convert_timestamps(int(timestamp)).day for timestamp in annotation_list_sorted[:,0]])

In [516]:
logo = LeaveOneGroupOut()

In [518]:
logo.get_n_splits(X, y, groups)

9

In [519]:
logo.get_n_splits(groups=groups)

9

In [543]:
for train_index, test_index in logo.split(X, groups=groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    expanded_X_train = \
    np.reshape(X_train, (X_train.shape[0]*X_train.shape[1], 512))
  
    pca_45 = sklearnPCA(45)
    pca_45.fit(expanded_X_train)
   
    X_transformed = np.asarray([pca_45.transform(embedding) for embedding in X_train])

    X_summaries = np.asarray([get_summary(embedding) for embedding in X_transformed])
    
    #standard scaler
    scaler = StandardScaler()
    scaler.fit(X_summaries)
    X_scaler = scaler.transform(X_summaries)

    #random forest on training data
    clf = RandomForestClassifier().fit(X_scaler, y_train)
    
    #manipulate embeddings for each X_test
    expanded_X_test = \
    np.reshape(X_test, (X_test.shape[0]*X_test.shape[1], 512))
    
    X_test = np.asarray([pca_45.transform(embedding) for embedding in X_test])

    X_test_summaries = np.asarray([get_summary(embedding) for embedding in X_test])
    
    #standard scaler on test data
    X_test_scaler = scaler.transform(X_test_summaries)
    
    #get cross validation scores
    print('clf score: ', clf.score(X_test_scaler, y_test))
    
    #get f score, precision, recall
    #note: what average should I use???
    y_predicted = clf.predict(X_test_scaler)
    print\
    ('precision, recall, f score: ', precision_recall_fscore_support(y_test, y_predicted, average='macro'))
    
    

clf score:  0.6363636363636364
precision, recall, f score:  (0.813953488372093, 0.5294117647058824, 0.44126984126984126, None)
clf score:  0.5879396984924623
precision, recall, f score:  (0.4260471204188482, 0.4880271243907608, 0.39184555754323197, None)
clf score:  0.6
precision, recall, f score:  (0.4692144373673036, 0.4976272295859925, 0.3891672631830112, None)
clf score:  0.6261682242990654
precision, recall, f score:  (0.46785714285714286, 0.49131274131274133, 0.4278074866310161, None)
clf score:  0.8347826086956521
precision, recall, f score:  (0.43636363636363634, 0.4752475247524752, 0.4549763033175355, None)
clf score:  0.6703296703296703
precision, recall, f score:  (0.4857142857142857, 0.4975438596491228, 0.43124999999999997, None)
clf score:  0.6631578947368421
precision, recall, f score:  (0.6648250460405156, 0.5325809974517656, 0.4736842105263158, None)
clf score:  0.6837209302325581
precision, recall, f score:  (0.5604395604395604, 0.508874546187979, 0.4450349225630124, N