In [61]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.decomposition import PCA as sklearnPCA
#classifications
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import make_classification
import os
import pickle
from sklearn.model_selection import KFold
import sys
sys.path.insert(1, '../modules')
import data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn.utils

Brief outline of what this notebook should look like eventually:
1. load the annotations
2. filter out maybes
3. load the embeddings into the list based on what's in the annotation list (so both have the same size)
4. cross validation time

In [54]:
def get_summary(embedding):
    """Reduce an embedding to a single summary vector.

    Four statistics are computed along axis 0 and concatenated in the
    order mean, max, min, standard deviation.
    """
    reducers = (np.mean, np.amax, np.amin, np.std)
    per_statistic = [reducer(embedding, axis=0) for reducer in reducers]
    return np.concatenate(per_statistic)

# Creating Y list of classifications

Divide annotations by day

In [5]:
# Load the pickled annotation list from the shared project workspace.
# NOTE(review): pickle.load can execute arbitrary code, so this file must come from a trusted source.
with open('/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/annotation_list.pickle', "rb") as f:
       annotation_list = pickle.load(f)

In [6]:
# Number of entries loaded from the annotation pickle.
len(annotation_list)

1538

In [11]:
# Pack every (timestamp, annotation) pair into one 2-D array.
# NOTE: np.asarray must pick a single dtype for the mixed int/str pairs, so the timestamps end up
# as strings again (the recorded output shows dtype '<U21'). The int() call therefore only checks
# that each timestamp parses as an integer; later cells compare the timestamps as strings.
annotation_list = np.asarray([(int(timestamp), annotation) for (timestamp, annotation) in annotation_list])

In [12]:
# Peek at the first row to check the converted layout: [timestamp, annotation].
annotation_list[0]

array(['1573060932', 'n'], dtype='<U21')

Sort annotation list by timestamp, then filter out anything that's not a yes or no

In [17]:
# Order the annotations chronologically. The timestamps are stored as strings, so they are compared
# numerically here: a plain string comparison would misorder timestamps with different digit counts.
annotation_list_sorted = sorted(annotation_list, key=lambda row: int(row[0]))

In [18]:
# First (earliest) annotation after sorting.
annotation_list_sorted[0]

array(['1573060932', 'n'], dtype='<U21')

In [19]:
# Keep only the rows whose annotation is a definite 'y' or 'n'; every other value is dropped.
annotation_list_sorted = np.asarray(
    [row for row in annotation_list_sorted if row[1] in ('y', 'n')]
)

In [20]:
# Number of annotations left after keeping only 'y' / 'n' rows.
len(annotation_list_sorted)

1524

In [400]:
# First column of the filtered annotations: the timestamps (as strings).
annotation_list_sorted[:,0]

array(['1573060932', '1573061031', '1573061130', ..., '1573852939',
       '1573853129', '1573853688'], dtype='<U21')

# Manipulating Embeddings

In [25]:
# Will hold one embedding array per embedding file that matches an annotation.
embeddings = []

In [26]:
# Will hold the timestamp of each loaded embedding, in the same order as `embeddings`.
timestamps = []

In [27]:
# Directory whose sub-folders contain the embedding files.
embeddings_root = '/green-projects/project-sonyc_redhook/workspace/share/redhook-analysis/output/embeddings'

# Start from empty lists so that re-running this cell does not append every embedding a second time.
embeddings = []
timestamps = []

# A set of plain Python strings gives constant-time membership tests instead of scanning the
# annotation array once per file.
annotated_timestamps = set(annotation_list_sorted[:, 0].tolist())

# os.listdir returns entries in arbitrary order; sorting keeps the row order of X reproducible.
for folder in sorted(os.listdir(embeddings_root)):
    folder_path = os.path.join(embeddings_root, folder)
    for file in sorted(os.listdir(folder_path)):
        # The part of the file name before the first '.' is used as the timestamp.
        timestamp = file.split('.')[0]
        if timestamp not in annotated_timestamps:
            continue
        # Deliberately not bound to the name `data`: that would overwrite the imported `data`
        # module, which a later cell still needs for data.convert_timestamps.
        # The `with` block closes the archive once the array has been read.
        with np.load(os.path.join(folder_path, file)) as embedding_file:
            emb = embedding_file['embedding']
        embeddings.append(emb)
        timestamps.append(timestamp)

In [28]:
# Combine the per-file embeddings into one array (shape (1509, 19, 512) in the recorded run).
# NOTE(review): assumes every embedding has the same shape — TODO confirm.
X = np.asarray(embeddings)

In [29]:
# Sanity check on the dimensions of the stacked embeddings.
X.shape

(1509, 19, 512)

Dropping annotations whose timestamps have no loaded embedding, so that the annotation array and `X` have the same length

In [37]:
# Build the annotation array in the same order as `timestamps`, i.e. the order of the rows of X.
# Filtering the annotations in their own order only lines up with X if the embedding files happened
# to be read in that exact order; otherwise embeddings get silently paired with the wrong labels.
annotation_by_timestamp = {str(annotation[0]): annotation for annotation in annotation_list_sorted}
annotation_list_cut = np.asarray([annotation_by_timestamp[timestamp] for timestamp in timestamps])

In [60]:
# Should equal the number of loaded embeddings (first dimension of X).
len(annotation_list_cut)

1509

In [39]:
# Spot-check one row of the reduced annotation array.
annotation_list_cut[7]

array(['1573063463', 'n'], dtype='<U21')

1509 is the number of yes/no annotations that also have an embedding file (1524 annotations remain after excluding "maybes")

# Using Leave One Group Out method 

In [40]:
# The labels are the second column of annotation_list_cut, i.e. the 'y' / 'n' annotation strings.
y = annotation_list_cut[:,1]

In [41]:
# One label per embedding is expected here.
y.shape

(1509,)

Replacing y and n with 1 and 0

In [43]:
# 'n' becomes 0; every other label becomes 1.
y_binary = (y != 'n').astype(int)

In [44]:
# Show the first ten encoded labels above the original strings they were derived from.
print(y_binary[:10], y[:10], sep='\n')

[0 0 1 1 1 0 0 0 0 1]
['n' 'n' 'y' 'y' 'y' 'n' 'n' 'n' 'n' 'y']


In [51]:
# One group label per sample: the `.day` attribute of the converted timestamp.
# NOTE(review): presumably convert_timestamps returns a datetime-like object, making `.day` the
# day of the month; samples from different months would then share a group — verify against the
# `data` module.
groups = np.asarray([
    data.convert_timestamps(int(sample_timestamp)).day
    for sample_timestamp in annotation_list_cut[:, 0]
])

In [52]:
# Cross-validation splitter: each split holds out every sample belonging to one group.
logo = LeaveOneGroupOut()

In [55]:
def _clip_summaries(pca, clips):
    """Project each clip's frames with the fitted PCA and reduce every clip to one summary vector."""
    return np.asarray([get_summary(pca.transform(clip)) for clip in clips])


# NOTE(review): no random_state is set (e.g. on RandomForestClassifier), so the printed scores can
# change from run to run.
for train_index, test_index in logo.split(X, groups=groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # PCA is fitted on individual frames, so flatten (clips, frames, features) into
    # (clips * frames, features). The feature width is read from the data instead of being
    # hard-coded to 512.
    frames_train = X_train.reshape(-1, X_train.shape[-1])
    pca_45 = sklearnPCA(45)
    pca_45.fit(frames_train)

    X_summaries = _clip_summaries(pca_45, X_train)

    # Scaling statistics come from the training fold only and are reused for the test fold.
    scaler = StandardScaler()
    X_scaler = scaler.fit_transform(X_summaries)

    # Random forest on the training fold.
    clf = RandomForestClassifier().fit(X_scaler, y_train)

    # Test fold: reuse the PCA and scaler fitted above; nothing is refitted on held-out data.
    X_test_summaries = _clip_summaries(pca_45, X_test)
    X_test_scaler = scaler.transform(X_test_summaries)

    # Accuracy on the held-out group.
    print('clf score: ', clf.score(X_test_scaler, y_test))

    # Precision, recall and F-score; 'macro' averages the per-class values with equal weight,
    # regardless of how many samples each class has.
    y_predicted = clf.predict(X_test_scaler)
    print('precision, recall, f score: ',
          precision_recall_fscore_support(y_test, y_predicted, average='macro'))

clf score:  0.6136363636363636
precision, recall, f score:  (0.5581395348837209, 0.5054466230936819, 0.4063492063492063, None)
clf score:  0.6130653266331658
precision, recall, f score:  (0.8055555555555556, 0.5064102564102564, 0.3919685726756875, None)
clf score:  0.60625
precision, recall, f score:  (0.303125, 0.5, 0.377431906614786, None)


  _warn_prf(average, modifier, msg_start, len(result))


clf score:  0.6542056074766355
precision, recall, f score:  (0.5800970873786409, 0.5127413127413127, 0.44184407161990696, None)
clf score:  0.8782608695652174
precision, recall, f score:  (0.4391304347826087, 0.5, 0.4675925925925926, None)


  _warn_prf(average, modifier, msg_start, len(result))


clf score:  0.6538461538461539
precision, recall, f score:  (0.3380681818181818, 0.476, 0.39534883720930225, None)
clf score:  0.6368421052631579
precision, recall, f score:  (0.4475806451612903, 0.4952675646159447, 0.4024340216053603, None)
clf score:  0.6883720930232559
precision, recall, f score:  (0.5950704225352113, 0.5040843081887858, 0.4216949696896704, None)
clf score:  0.5849802371541502
precision, recall, f score:  (0.4654218533886584, 0.4934861907243356, 0.4018104438289536, None)


# with pipeline

In [56]:
class pca_with_summaries(BaseEstimator, TransformerMixin):
    """Transformer that applies a PCA frame by frame and summarises each clip.

    Parameters
    ----------
    pca : estimator
        Object providing ``fit`` and ``transform`` (e.g. a scikit-learn PCA). It is fitted in
        place by :meth:`fit`.
    """

    def __init__(self, pca):
        self.pca = pca

    def fit(self, X, y=None):
        """Fit the wrapped PCA on all frames of all clips.

        ``X`` is treated as a stack of clips whose last axis is the feature axis; ``y`` is
        ignored and only present for pipeline compatibility. Returns ``self``.
        """
        X = np.asarray(X)
        # Flatten to one row per frame. The feature width is taken from the input itself, so
        # embeddings that are not 512-dimensional work as well.
        frames = X.reshape(-1, X.shape[-1])
        self.pca.fit(frames)
        return self

    def transform(self, X, y=None):
        """Return one summary vector (see ``get_summary``) per clip in ``X``."""
        return np.asarray([get_summary(self.pca.transform(clip)) for clip in X])

In [57]:
# Stages in the order the data flows through them: PCA + per-clip summary, scaling, classifier.
summary_step = ('pca with summaries', pca_with_summaries(sklearnPCA(45)))
scaling_step = ('standard scaler', StandardScaler())
forest_step = ('random forest', RandomForestClassifier())
pipeline = Pipeline(steps=[summary_step, scaling_step, forest_step])

In [58]:
# Score the pipeline with leave-one-group-out cross-validation on the 0/1 labels.
scoring = ['f1', 'precision', 'recall']
scores = cross_validate(
    pipeline,
    X,
    y_binary,
    groups=groups,
    cv=LeaveOneGroupOut(),
    scoring=scoring,
)

In [59]:
# Raw cross-validation result: fit/score times plus one test_<metric> array with a value per fold.
print(scores)

{'fit_time': array([8.54602718, 6.50821447, 7.07532597, 8.43795085, 7.82813549,
       7.45513868, 6.5654788 , 6.81613398, 6.595891  ]), 'score_time': array([0.23575258, 0.64273477, 0.19839716, 0.40478587, 0.28096747,
       0.22913241, 0.3909905 , 0.57481408, 0.61413431]), 'test_f1': array([0.05405405, 0.02469136, 0.03030303, 0.04878049, 0.        ,
       0.        , 0.02777778, 0.05633803, 0.08403361]), 'test_precision': array([0.33333333, 0.33333333, 0.33333333, 0.25      , 0.        ,
       0.        , 0.2       , 0.5       , 0.27777778]), 'test_recall': array([0.02941176, 0.01282051, 0.01587302, 0.02702703, 0.        ,
       0.        , 0.01492537, 0.02985075, 0.04950495])}


In [63]:
# Candidate final estimators to compare, all with default hyper-parameters.
# NOTE(review): no random_state is set on RandomForestClassifier, so its scores are not reproducible.
classifiers = [RandomForestClassifier(), KNeighborsClassifier(), LogisticRegression()]

In [64]:
for classifier in classifiers:
    # Identical preprocessing for every model; only the final estimator changes.
    steps = [
        ('pca with summaries', pca_with_summaries(sklearnPCA(45))),
        ('standard scaler', StandardScaler()),
        ('classifier', classifier),
    ]
    pipeline = Pipeline(steps=steps)

    scoring = ['f1', 'precision', 'recall']
    scores = cross_validate(
        pipeline, X, y_binary, groups=groups, cv=LeaveOneGroupOut(), scoring=scoring
    )

    # ANSI escape codes print the classifier description in bold above its scores.
    print(f'\033[1m{classifier}\033[0m')
    print(scores)

[1mRandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)[0m
{'fit_time': array([8.12407041, 6.27114987, 6.74722505, 7.01042747, 7.51113319,
       7.1978333 , 6.63905334, 6.51876903, 6.5549562 ]), 'score_time': array([0.18472219, 0.4958334 , 0.32309985, 0.28850818, 0.27084851,
       0.45472217, 0.47264504, 0.58437204, 0.65323162]), 'test_f1': array([0.        , 0.        , 0.        , 0.05263158, 0.        ,
       0.03278689, 0.        , 0.        , 0.05555556]), 'test_precision': array([0.        