In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import collections
import numpy as np
import seaborn as sns

In [2]:
# Sound classes grouped by the broad kind of source that produces them.
# NOTE(review): several names here do not match the label strings that the
# loading code actually yields (e.g. 'Mix traffic' / 'Braking' are capitalised
# by `mapper`, and the annotations contain 'barking dog' and 'lawnmower' rather
# than 'dog bark' and 'mower') -- confirm before using these sets for lookups.
human_noises = {
    'mix traffic', 'braking', 'voices', 'electrical',
    'anthropogenic unknown', 'airplane', 'beep',
    'metal', 'bus emitting', 'footsteps', 'mower', 'whistle',
    'siren', 'coughing', 'music', 'horn', 'startthecar', 'bells',
    'applause', 'dog bark',
}

animal_noises = {
    'bird', 'wing beats', 'bat', 'fox',
    'grey squirrel', 'invertebrate',
}

other = {'rain', 'unknown sound'}

## Load in the annotations

I want to load the annotations from both sets of labellers.

In [3]:
# Translation table from label spellings found in the annotation files to the
# canonical label names used in the rest of the notebook.  Keys must match the
# text in the CSV files exactly.
# NOTE(review): 'breaking vehicle' is presumably how the annotators spelled
# "braking vehicle" -- left untouched because it has to match the raw data.
mapper = dict([
    ('road traffic', 'Mix traffic'),
    ('electrical disturbance', 'electrical'),
    ('vehicle', 'Mix traffic'),
    ('breaking vehicle', 'Braking'),
    ('Train doors (beeping)', 'beep'),
])

In [4]:
import pandas as pd

sample_rate = 24000  # label-array entries per second of annotated time
file_len_in_seconds = 60  # every label array spans this many seconds


def read_labels(fname, label_mapper=None):
    """Read one annotation CSV and rasterise it into per-label 0/1 arrays.

    Parameters
    ----------
    fname : str or file-like
        Anything accepted by ``pd.read_csv``.  The file must contain the
        columns 'Label', 'LabelStartTime_Seconds' and 'LabelEndTime_Seconds'.
        Only every other row (positions 1, 3, 5, ...) is used.
    label_mapper : dict, optional
        Maps raw label text to a canonical label name.  Labels that are not
        in the mapping keep their raw name.  Defaults to the module-level
        ``mapper``.

    Returns
    -------
    dict
        ``{label name: array}`` where each array has
        ``sample_rate * file_len_in_seconds`` entries that are 1 inside any
        annotated interval of that label and 0 elsewhere.  Raw labels that map
        to the same canonical name are merged into a single array.
    """
    if label_mapper is None:
        label_mapper = mapper

    df = pd.read_csv(fname)
    df = df.iloc[1::2]  # we need to take every other row

    n_samples = sample_rate * file_len_in_seconds
    labels = {}

    for raw_name in df['Label'].unique():
        # Select rows by the name as it appears in the file; the canonical
        # name is only used as the key of the output dictionary.  (Selecting
        # by the mapped name would silently drop every remapped interval.)
        these_labs = df.loc[df['Label'] == raw_name]
        labelname = label_mapper.get(raw_name, raw_name)

        start_ends = np.array(
            these_labs[['LabelStartTime_Seconds', 'LabelEndTime_Seconds']],
            dtype=float)
        # Rows with a missing start or end time cannot be placed on the
        # timeline, and NaN has no meaningful integer conversion.
        start_ends = start_ends[~np.isnan(start_ends).any(axis=1)]
        # Seconds -> array indices, kept inside the array so that a negative
        # start is not interpreted as an offset from the end.
        start_ends = np.clip(
            (start_ends * sample_rate).astype(int), 0, n_samples)

        # Reuse the existing array when another raw label already mapped to
        # this canonical name, so the intervals accumulate.
        label_array = labels.setdefault(labelname, np.zeros(n_samples))
        for start, end in start_ends:
            label_array[start:end] = 1

    return labels

In [5]:
import os

# Root folder holding one sub-folder of annotation CSVs per labeller.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
base_path = '/media/michael/Seagate/engage/engage_hackathon_data/data/multiple_humans_labelling_same_files/'
humans = ['Ali', 'Jiemin', 'Verity']

# labels[human][filekey] -> dictionary returned by read_labels for that file
labels = {}

for human in humans:
    human_dir = os.path.join(base_path, human)
    labels_for_human = {}

    for fname in os.listdir(human_dir):
        # Strip the labeller-specific filename suffixes so that the same
        # recording gets the same key for every labeller.
        filekey = fname.replace('_below12kHz.csv', '').replace('-sceneRect.csv', '')
        labels_for_human[filekey] = read_labels(os.path.join(human_dir, fname))

    labels[human] = labels_for_human

In [6]:
# Collect every label name used by any labeller in any file.
all_labels = set()
for human in humans:
    for file_labels in labels[human].values():
        all_labels |= set(file_labels.keys())
print(all_labels)

set(['siren', 'barking dog', 'unknown', 'Braking', 'anthropogenic unknown', 'Mix traffic', 'voices', 'beep', 'whistle', 'unknown sound', 'electrical', 'human voice', 'animal', 'lawnmower', 'airplane', 'invertebrate', 'bird', 'wind', 'machinery'])


In [7]:
# Sanity check on one file: label arrays hold 1 inside annotated intervals,
# so the sum is the number of array entries Ali marked as 'bird'.
tmp = labels['Ali']['E105JP-13548_20131009_0717']
print(tmp['bird'].sum())

963364.0


In [17]:
# now compare the labels from the different labelers
classes = ['airplane', 'bird', 'Mix traffic', 'beep', 'electrical']
all_agreements = {cls:[] for cls in classes}

def get_labels(human, fname, cls):
    """Return the label array `human` produced for class `cls` in file `fname`.

    When the labeller never used that class in the file, an all-zero array of
    ``sample_rate * file_len_in_seconds`` entries is returned instead.
    Raises KeyError if `human` or `fname` is not present in ``labels``.
    """
    file_labels = labels[human][fname]
    if cls in file_labels:
        return file_labels[cls]
    # if no labels, just give it an array of zeros
    return np.zeros(sample_rate * file_len_in_seconds)
    
# For every file labelled by Ali and every class, compute the fraction of
# array entries on which all labellers give the same 0/1 value.
# The rows are collected first and the DataFrame is built once at the end:
# this avoids growing the frame row by row and gives the agreement columns a
# numeric dtype instead of `object`.
agreement_rows = []

for fname in labels['Ali'].keys():
    row = {'fname': fname}

    for cls in classes:
        # one row per labeller, one column per array entry
        labs = np.vstack([get_labels(human, fname, cls) for human in humans])
        # all labellers agree exactly where the column max equals the column min
        agreements = labs.max(0) == labs.min(0)
        row[cls] = agreements.mean()

    agreement_rows.append(row)

results = pd.DataFrame(agreement_rows, columns=['fname'] + classes)

In [18]:
# Use the full option key: the bare 'precision' pattern is ambiguous on pandas
# versions that also define 'styler.format.precision' and raises OptionError.
pd.set_option('display.precision', 5)
# Show the per-file, per-class agreement table (a bare last expression is
# rendered as the cell's output).
results

Unnamed: 0,fname,airplane,bird,Mix traffic,beep,electrical
0,E105JP-13548_20131009_0717,0.0007,0.324,1.0,1.0,0.997
1,CR8-13548_20130918_1109,1.0,0.7192,0.5452,1.0,1.0
2,SE23-13527_20130907_1651,0.0013,0.4325,1.0,1.0,1.0
3,HA86RB-13527_20130730_0541,1.0,0.688,0.5603,1.0,1.0
4,SW154LA-3527_20130705_0909,1.0,0.6475,0.0008,0.9888,1.0
5,CR05EF-13527_20130921_1236,0.7847,0.6833,0.1207,1.0,0.997
6,CR05EF-13527_20130916_0823,0.0032,0.5602,1.0,0.8827,0.998
7,HA53AA-13548_20130727_0954,1.0,0.9918,0.5017,1.0,1.0
8,W112NN-13548_20130709_0403,1.0,0.4755,0.7288,1.0,1.0
9,W112NN-13548_20130713_0448,1.0,0.6867,0.3445,1.0,1.0


In [43]:
# For two selected files, report what fraction of the file each labeller
# marked with each class.
for fname in ['E105JP-13548_20131009_0717', 'CR05EF-13527_20130921_1236']:

    print(fname)
    print('-' * 40 + '\n')

    for label in classes:
        for human in humans:
            file_labels = labels[human][fname]
            # a class the labeller never used counts as 0% of the file
            frac = file_labels[label].mean() if label in file_labels else 0

            print("    %8s labelled %02.2f%% as %s" % (human, frac * 100, label))
        print("")

E105JP-13548_20131009_0717
----------------------------------------

         Ali labelled 0.00% as airplane
      Jiemin labelled 93.77% as airplane
      Verity labelled 63.28% as airplane

         Ali labelled 66.90% as bird
      Jiemin labelled 0.00% as bird
      Verity labelled 5.42% as bird

         Ali labelled 0.00% as Mix traffic
      Jiemin labelled 0.00% as Mix traffic
      Verity labelled 0.00% as Mix traffic

         Ali labelled 0.00% as beep
      Jiemin labelled 0.00% as beep
      Verity labelled 0.00% as beep

         Ali labelled 0.30% as electrical
      Jiemin labelled 0.00% as electrical
      Verity labelled 0.00% as electrical

CR05EF-13527_20130921_1236
----------------------------------------

         Ali labelled 0.00% as airplane
      Jiemin labelled 0.00% as airplane
      Verity labelled 21.53% as airplane

         Ali labelled 25.85% as bird
      Jiemin labelled 0.00% as bird
      Verity labelled 16.87% as bird

         Ali labelled 0.00% as

In [32]:
# The assignment was left without a right-hand side (a SyntaxError).  The
# saved output of this cell (e.g. Verity 21.53% airplane, Ali 25.85% bird)
# corresponds to this file key.
fname = 'CR05EF-13527_20130921_1236'

# Report what fraction of the file each labeller marked with each class.
for label in classes:
    for human in humans:

        if label in labels[human][fname]:
            frac = labels[human][fname][label].mean()
        else:
            # a class the labeller never used counts as 0% of the file
            frac = 0

        print("%s labelled %0.2f%% as %s" % (human, frac * 100, label))
    print("")

Ali labelled 0.00% as airplane
Jiemin labelled 0.00% as airplane
Verity labelled 21.53% as airplane

Ali labelled 25.85% as bird
Jiemin labelled 0.00% as bird
Verity labelled 16.87% as bird

Ali labelled 0.00% as Mix traffic
Jiemin labelled 87.93% as Mix traffic
Verity labelled 0.00% as Mix traffic

Ali labelled 0.00% as beep
Jiemin labelled 0.00% as beep
Verity labelled 0.00% as beep

Ali labelled 0.30% as electrical
Jiemin labelled 0.00% as electrical
Verity labelled 0.00% as electrical

