In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

import yaml
import collections
from tqdm import tqdm

# Load in all predictions

In [2]:
# Prediction classes handled by this notebook; each one is read from its own
# 'predictions_<classname>' directory and becomes a column in the exported CSVs.
classnames = ['biotic', 'anthrop']

In [4]:
# predictions[classname][file stem] = (sitename, P), where P is column 1 of
# the array stored in that .npy file.
predictions = {}

for classname in classnames:

    # Every class has its own directory tree of saved prediction files.
    base_savedir = '/media/michael/SeagateData/alison_data/predictions_%s/' % classname
    predictions[classname] = {}

    for root, dirnames, filenames in tqdm(os.walk(base_savedir)):

        # The site name depends only on the directory, so work it out once
        # per directory: anything under an 'E29' path is pooled as 'E29RR',
        # otherwise it is the last path component with '/SM2+' stripped.
        if 'E29' in root:
            sitename = 'E29RR'
        else:
            sitename = root.replace('/SM2+', '').split('/')[-1]

        for fname in filenames:

            # splitext removes only the trailing extension, so a '.npy'
            # appearing earlier in the name cannot truncate the key.
            stem, ext = os.path.splitext(fname)
            if ext != '.npy':
                continue

            # Keep column 1 of the saved 2-D array.
            # NOTE(review): presumably the positive-class score - confirm.
            P = np.load(os.path.join(root, fname))[:, 1]

            predictions[classname][stem] = (sitename, P)

151it [00:07,  9.59it/s]
151it [00:07,  9.65it/s]


In [5]:
# sites = set(xx[0] for xx in predictions['biotic'].values())
# to_exclude = set(['Card slot', 'copy', 'Sliced', '250515-010615', 'NOISE'])
# set_sites = set([xx for xx in sites if all(yy not in xx for yy in to_exclude)])


In [6]:
# fname_sites = set()
# for xx in predictions['biotic'].keys():
#     if 'SE154EE' in xx:
#         fname_sites.add('SE154EE')
#     else:
        
#         fname_sites.add(xx.split('-')[0])

In [7]:
# print len(fname_sites), len(set_sites)
# set_sites = set([xx.replace('_', '') for xx in set_sites])

# print fname_sites - set_sites
# print set_sites - fname_sites
# for xx, yy in zip(sorted(list(fname_sites)), sorted(list(set_sites))):
#     print xx, yy

# Filter by site name

In [8]:
# summaries[classname][site]     -> mean prediction for that site
# site_filtered[classname][site] -> list of per-file prediction arrays
summaries = collections.defaultdict(dict)
site_filtered = {}

# A site is dropped when its name contains any of these substrings.
to_exclude = ['Card slot', 'copy', 'Sliced', '250515-010615', 'NOISE']


for classname in classnames:

    # Single-argument print(...) behaves the same on Python 2 and Python 3;
    # the double space reproduces what `print "Class: ", classname` emitted.
    print("Class:  %s" % classname)
    print("Found %d predictions" % len(predictions[classname]))

    # Group the per-file prediction arrays by site, skipping excluded sites.
    site_filtered[classname] = collections.defaultdict(list)

    for fname, (sitename, P) in predictions[classname].items():

        if not any(xx in sitename for xx in to_exclude):
            site_filtered[classname][sitename].append(P)

    print("Found %d unique sites" % len(site_filtered[classname]))

    # Summarising per-site: NaN-ignoring mean over all of a site's predictions
    site_summaries = {}
    for site, preds in site_filtered[classname].items():
        site_summaries[site] = np.nanmean(np.hstack(preds).astype(float))

    # Sorting by increasing mean prediction before storing
    site_summaries_l = sorted(site_summaries.items(), key=lambda x: x[1])

    for site, pred in site_summaries_l:

        summaries[classname][site] = pred

Class:  biotic
Found 32004 predictions
Found 63 unique sites
Class:  anthrop
Found 32004 predictions
Found 63 unique sites


In [9]:
# Sanity check: the classes that were filtered. list(...) keeps the printed
# form a plain list on Python 3 as well as Python 2.
print(list(site_filtered.keys()))

['biotic', 'anthrop']


# Writing the summaries to disk

In [10]:
import pandas as pd

# Build a table from the nested summaries dict (outer keys are class names,
# inner keys are site names) and write it out with its index.
summaries_csv_path = '/home/michael/Dropbox/engage/FairbrassFirmanetal_/data/predictions/massive_dataset/per_site_summaries.csv'
df = pd.DataFrame(summaries)
df.to_csv(summaries_csv_path)

# Writing the raw data to disk

In [11]:
savedir = '/home/michael/Dropbox/engage/FairbrassFirmanetal_/data/predictions/massive_dataset/raw_predictions/'

# One CSV per site, holding a column of raw predictions for each class.
for site in site_filtered['biotic']:

    # Each site maps to a list of per-file arrays; flatten each class's list
    # into a single long vector before tabulating.
    biotic_preds = np.hstack(site_filtered['biotic'][site])
    anthrop_preds = np.hstack(site_filtered['anthrop'][site])
    dic = {'biotic': biotic_preds, 'anthrop': anthrop_preds}

    df = pd.DataFrame(dic)

    savepath = os.path.join(savedir, site + '.csv')
    df.to_csv(savepath, index=False, float_format='%0.5f')

# Doing per-site graphs

In [None]:
def summary_from_fname(fname):
    """For a single results file, return the predicted activity level.

    The level is the mean of the file's predictions, with NaN entries
    counted as zero.

    NOTE(review): the original cell left the right-hand side of the `preds`
    assignment blank, which made the whole cell a syntax error. The load
    below mirrors how this notebook reads prediction files when it first
    loads them (`np.load(path)[:, 1]`); confirm that `fname` is a readable
    path to such a .npy file and that column 1 is the wanted one.
    """
    # astype(float) gives a fresh float array, so the NaN replacement below
    # is well defined and never touches anything shared.
    preds = np.load(fname)[:, 1].astype(float)
    preds[np.isnan(preds)] = 0
    return np.mean(preds)


def fname_to_time(fname):
    """Return the time stamp embedded in a filename.

    The stamp is the first four characters of the second-to-last
    underscore-separated field; the callers in this notebook look these up
    as 'HHMM' keys.
    """
    # Only the last two separators matter, so split from the right.
    fields = fname.rsplit('_', 2)
    time_field = fields[-2]
    return time_field[:4]


def datetime_to_decimal(dt):
    """Convert a datetime object to the time of day as decimal hours.

    Only the hour and minute fields are used; seconds are ignored.
    """
    whole_hours = float(dt.hour)
    fraction_of_hour = dt.minute / 60.0
    return whole_hours + fraction_of_hour


def get_times_and_averages(base, classname):
    """Average the per-file activity summaries into 48 half-hour slots.

    Every name in the module-level `fnames` that contains both `base` and
    `classname` is summarised with `summary_from_fname` and grouped by the
    'HHMM' stamp returned by `fname_to_time`.

    Returns:
        (times, averages): two lists of length 48. `times` holds the centre
        of each half-hour slot in decimal hours (0.25, 0.75, ...);
        `averages` holds the mean summary for that slot, or NaN when no
        file falls in it.
    """
    results = collections.defaultdict(list)
    for fname in fnames:
        if base in fname and classname in fname:
            results[fname_to_time(fname)].append(summary_from_fname(fname))

    averages = []
    times = []

    # Walk the day in half-hour steps; only stamps of exactly HH00 or HH30
    # are picked up.
    for hour in range(24):
        for half in [0, 1]:
            key = '%02d%02d' % (hour, half * 30)
            # .get avoids inserting empty slots into the defaultdict, and the
            # explicit NaN avoids numpy's "Mean of empty slice" warning.
            slot = results.get(key)
            averages.append(np.mean(slot) if slot else np.nan)
            times.append(hour + half * 0.5 + 0.25)
    return times, averages


def plot_results(base, classname):
    """For a filename base, plot all the results summaries.

    Draws the half-hourly activity curve for `classname` on the current
    matplotlib figure, labelled via the module-level `mapper`, and marks
    sunrise and sunset with dashed vertical lines. The sun times come from
    the module-level `s` object, evaluated on the date taken from the first
    name in `fnames` that starts with `base`.

    Raises:
        IndexError: if no name in `fnames` starts with `base`.
    """
    times, averages = get_times_and_averages(base, classname)

    # Plotting activity levels
    plt.plot(times, averages, label=mapper[classname])
    plt.xlabel('Hour of day', fontsize=16)
    plt.ylabel('Level of activity', fontsize=16)
    plt.xlim(0, 24)
    plt.ylim(0, 1)
    plt.xticks([0, 6, 12, 18, 24], ['00:00', '06:00', '12:00', '18:00', '24:00'], fontsize=14)
    #plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14)
    plt.yticks([0, 1.0], fontsize=14)

    # Plotting sunset/sunrise times
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Warning - just using first recording for sunset...")
    # The date is the second underscore-separated field, formatted YYYYMMDD.
    datestr = [xx for xx in fnames if xx.startswith(base)][0].split('_')[1]
    when = datetime.datetime.strptime(datestr, '%Y%m%d')
    sunrise = s.sunrise(when=when)
    sunset = s.sunset(when=when)

    sun_colour = np.array([0.9, 0.3, 0.3])
    # Both markers are drawn identically: a dashed line that starts just
    # above the x axis, with the label placed beneath the line's lower end.
    for label, event in (('Sunrise', sunrise), ('Sunset', sunset)):
        event_hour = datetime_to_decimal(event)
        plt.plot([event_hour, event_hour], [0.075, 1], '--', color=sun_colour)
        plt.text(event_hour - 1.3, .015, label, fontsize=16, color=sun_colour)

    
def set_up_plot():
    """Create a new matplotlib figure with figsize (10, 4.5)."""
    plt.figure(figsize=(10, 4.5))
#     ax = plt.subplot(111)
#     box = ax.get_position()
#     ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])