# **Misclassification spotter Part 2: Evaluation**


Evaluation of misclassification spotter through manual validation of candidate calls. Gave N=100 calls (wav files) to two independent labellers A and B and let them classify the calls.

## Libraries

In [1]:
import os
import pandas as pd
import librosa.display
import numpy as np
from pandas.core.common import flatten
import datetime
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import scipy
import pickle
from pathlib import Path

In [2]:
wd = os.getcwd()

DATA = os.path.join(os.path.sep, str(Path(wd).parents[0]), "data", "processed")
UMAP_COORDS = os.path.join(os.path.sep, str(Path(wd).parents[0]), "data", "interim", "parameter_search")
FIGURES = os.path.join(os.path.sep, str(Path(wd).parents[0]), "reports", "figures")
EXTERNAL_DATA = os.path.join(os.path.sep, str(Path(wd).parents[0]), "data", "external")

## Clean label files

### Clean B's labels

In [13]:
manual_labelfile = os.path.join(os.path.sep, EXTERNAL_DATA, "CallIDs_B.csv")
label_df = pd.read_csv(manual_labelfile, sep=";")
#manual_labels

In [14]:
#label_df.type.value_counts()

In [15]:
cleaned_label = label_df.type.copy()

label_df['has_questionmark'] = [1 if "?" in x else 0 for x in cleaned_label]
label_df['has_x'] = [1 if "x" in x else 0 for x in cleaned_label]
label_df['has_sq'] = [1 if "sq" in x else 0 for x in cleaned_label]
label_df['has_plus'] = [1 if "+" in x else 0 for x in cleaned_label]

cleaned_label = [x.replace("?", "") for x in cleaned_label] # remove ?
cleaned_label = [x.replace(" ", "") for x in cleaned_label] # remove whitespace
cleaned_label = [x.replace("x", "") for x in cleaned_label] # remove xs
cleaned_label = [x.replace("sq", "") for x in cleaned_label] # remove sq -> don't know what that means
cleaned_label = ["hyb" if "fu" in x else x for x in cleaned_label] # mark fu/hybrids
cleaned_label = ["hyb" if "+" in x else x for x in cleaned_label] # mark fu/hybrids
cleaned_label = ["sn" if x=="s" else x for x in cleaned_label] # Baptiste marked sn as s, so correct that
cleaned_label = ["unk" if x=="ukn" else x for x in cleaned_label] # Baptiste marked some as ukn

label_df['manual_label'] = cleaned_label
label_df.to_pickle(os.path.join(os.path.sep, EXTERNAL_DATA, 'b_labels.pkl'))

### Clean A's labels

In [16]:
manual_labelfile = os.path.join(os.path.sep, EXTERNAL_DATA, "CallIDs_A.csv")

label_df = pd.read_csv(manual_labelfile, sep=";", header=None)
label_df.columns = ['Call', 'type', 'comments']
label_df = label_df.loc[0:99,:]
#manual_labels

In [17]:
#label_df.type.value_counts()

In [18]:
cleaned_label = label_df.type.copy()

label_df['has_questionmark'] = [1 if "?" in x else 0 for x in cleaned_label]

cleaned_label = [x.replace("?", "") for x in cleaned_label] # remove ?
cleaned_label = [x.replace("#", "notacall") for x in cleaned_label] 
cleaned_label = [x.replace("mov + ld", "hyb") for x in cleaned_label]
cleaned_label = [x.replace("mov", "mo") for x in cleaned_label] 

label_df['manual_label'] = cleaned_label
label_df.to_pickle(os.path.join(os.path.sep, EXTERNAL_DATA, 'a_labels.pkl'))

## Merge into one dataframe

In [19]:
# Read in both cleaned labels
human1 = pd.read_pickle(os.path.join(os.path.sep, EXTERNAL_DATA, 'b_labels.pkl'))
human2 = pd.read_pickle(os.path.join(os.path.sep, EXTERNAL_DATA, 'a_labels.pkl'))

# Read in info file
outname = os.path.join(os.path.sep, DATA, 'info.csv')
random_subset = pd.read_csv(outname, sep=";")
#random_subset

In [20]:
np.sum(human1.Call==human2.Call)

71

In [21]:
# Put in same order

human1 = human1.sort_values(by='Call',ignore_index=True)
human2 = human2.sort_values(by='Call',ignore_index=True)
np.sum(human1.Call==human2.Call)

100

In [22]:
human1.columns = ['human1_'+x for x in human1.columns]
human2.columns = ['human2_'+x for x in human2.columns]

In [23]:
# Merge into one dataframe
if np.sum(human1.human1_Call==human2.human2_Call) != human1.shape[0]:
    print("misaligned")
else:
    print("aligned")
    all_labellers = pd.concat([human1, human2], axis=1)

    all_labellers['Call'] = all_labellers['human1_Call']
    all_labellers = all_labellers.drop(columns=['human1_Call', 'human2_Call'])

aligned


In [24]:
label_df = pd.merge(all_labellers, random_subset, left_on="Call", right_on="callID")

## Evaluate

In [25]:
label_df['labeller_agreement'] = [1 if x==y else 0 for x,y in zip(label_df.human1_manual_label, label_df.human2_manual_label)]
label_df['labeller_agreement'].value_counts()

0    50
1    50
Name: labeller_agreement, dtype: int64

In [43]:
for human in ["human1", "human2"]:
    
    print("******************************")
    print(human)
    print("******************************")
    manual_col = human+'_manual_label'
    original_col = "call_lable"
    algo_choice = "neighbor_1"
    
    mislabelled_df = label_df.loc[label_df[manual_col]!=label_df[original_col],:]
    print(mislabelled_df.shape[0], " truly mislabelled")
    
    correctly_placed = mislabelled_df.loc[mislabelled_df[manual_col]==mislabelled_df[algo_choice],:]
    percentage_correct_placed = (correctly_placed.shape[0]/mislabelled_df.shape[0])*100
    print("  ", correctly_placed.shape[0], " of which would have been correctly placed by algorithm (", round(percentage_correct_placed,2), "%)")
    
    percent_agreement = (np.sum(correctly_placed.labeller_agreement) / correctly_placed.shape[0])*100
    print("    ", round(percent_agreement,2),"% labeller agreement (", np.sum(correctly_placed.labeller_agreement), ")")
    
    incorrectly_placed = mislabelled_df.loc[mislabelled_df[manual_col]!=mislabelled_df[algo_choice],:]
    #percentage_incorrect_placed = (incorrectly_placed.shape[0]/mislabelled_df.shape[0])*100
    #print("  ", incorrectly_placed.shape[0], " of which would have been incorrectly placed by algorithm (", round(percentage_incorrect_placed,2), "%)")
    
    #percent_agreement = (np.sum(incorrectly_placed.labeller_agreement) / incorrectly_placed.shape[0])*100
    #print("    ", round(percent_agreement,2),"% labeller agreement (", np.sum(incorrectly_placed.labeller_agreement), ")")
    
    # percentage of these that COULD not have been incorrectly placed, because unknown, not a call whatsoever
    n_no_call = np.sum(incorrectly_placed[manual_col].isin(['hyb', 'notacall', 'unk']))
    per_no_call = (n_no_call/mislabelled_df.shape[0])*100
    print("  ", n_no_call, " of which were not a call (", round(per_no_call,2), "%)")
    
    # truly incorrectly placed:
    n_incorr_placed = incorrectly_placed.shape[0] - n_no_call
    per_incorr_placed = (n_incorr_placed/mislabelled_df.shape[0])*100
    print("  ", n_incorr_placed, " truly incorrectly placed (", round(per_incorr_placed,2), "%)")
    
    truly_incorrect = incorrectly_placed.loc[~ incorrectly_placed[manual_col].isin(['hyb', 'notacall', 'unk']),:]
    percent_agreement = (np.sum(truly_incorrect.labeller_agreement) / truly_incorrect.shape[0])*100
    print("    ", round(percent_agreement,2),"% labeller agreement (", np.sum(truly_incorrect.labeller_agreement), ")")
    
    
    print(" ")
    print("Overview of incorrectly placed assignment:")
    print(incorrectly_placed[manual_col].value_counts())
    print(" ")

******************************
human1
******************************
69  truly mislabelled
   27  of which would have been correctly placed by algorithm ( 39.13 %)
     70.37 % labeller agreement ( 19 )
   24  of which were not a call ( 34.78 %)
   18  truly incorrectly placed ( 26.09 %)
     11.11 % labeller agreement ( 2 )
 
Overview of incorrectly placed assignment:
hyb         14
al           9
unk          6
notacall     4
soc          3
ld           2
mo           2
cc           1
agg          1
Name: human1_manual_label, dtype: int64
 
******************************
human2
******************************
67  truly mislabelled
   32  of which would have been correctly placed by algorithm ( 47.76 %)
     59.38 % labeller agreement ( 19 )
   29  of which were not a call ( 43.28 %)
   6  truly incorrectly placed ( 8.96 %)
     33.33 % labeller agreement ( 2 )
 
Overview of incorrectly placed assignment:
hyb         11
notacall    10
unk          8
cc           4
mo           1
soc   

In [67]:
# Other way of looking at it

#labellers_agree = label_df.loc[label_df['labeller_agreement']==1,:]
#clearly_mislabelled = labellers_agree.loc[labellers_agree['human1_manual_label']!=labellers_agree['call_lable'],:]
#print(clearly_mislabelled.shape[0], " clearly mislabelled (labellers agree)")

#all_agree = clearly_mislabelled.loc[clearly_mislabelled['human1_manual_label']==clearly_mislabelled[algo_choice],:]
#percent_all_agree = (all_agree.shape[0]/clearly_mislabelled.shape[0])*100
#print(all_agree.shape[0], " of these correctly labelled by algo (", round(percent_all_agree,2), "%)")

# How many of these were labelled as not-a-call or unknown?
#difficult_cases = label_df.loc[(label_df['human1_manual_label'].isin(['notacall', 'hyb', 'unk']) | label_df['human2_manual_label'].isin(['notacall', 'hyb', 'unk'])),:]
#print(difficult_cases.shape[0], " where at least one labeller labelled as not-a-call/unknown/hybrid")


# How many where either labeller 1 or labeller 2 disagreed with original label?
label_df['human1_disagree'] = [1 if x!=y else 0 for x,y in zip(label_df.human1_manual_label, label_df.call_lable)]
label_df['human2_disagree'] = [1 if x!=y else 0 for x,y in zip(label_df.human2_manual_label, label_df.call_lable)]
label_df['any_disagree'] = [1 if (x+y)>=1 else 0 for x,y in zip(label_df.human1_disagree, label_df.human2_disagree)]

print(np.sum(label_df['any_disagree']), " where at least 1 labeller disagrees with original label")

any_disagree = label_df.loc[label_df['any_disagree']==1,:]
clearly_mislabelled = any_disagree.loc[any_disagree['labeller_agreement']==1,:]
print("  ", clearly_mislabelled.shape[0], " of these were clearly mislabelled (both labellers agree)")

clearly_mislabelled_iscall = clearly_mislabelled.loc[~clearly_mislabelled['human1_manual_label'].isin(['notacall', 'hyb', 'unk']),:]
print("    ", clearly_mislabelled_iscall.shape[0], " of these were labelled as one of the seven calltypes")

all_agree = clearly_mislabelled_iscall.loc[clearly_mislabelled_iscall['human1_manual_label']==clearly_mislabelled_iscall[algo_choice],:]
percent_all_agree = (all_agree.shape[0]/clearly_mislabelled_iscall.shape[0])*100
print("      ", all_agree.shape[0], " of these correctly labelled by algo (", round(percent_all_agree,2), "%)")
print ("    ", clearly_mislabelled.shape[0]-clearly_mislabelled_iscall.shape[0], " labelled as unk/not-a-call/hyb")

unclear_mislabelled = any_disagree.loc[any_disagree['labeller_agreement']==0,:]
print("  ", unclear_mislabelled.shape[0], " of these were not clearly mislabelled (labellers disagree)")


#other_cases = unclear_mislabelled.loc[((~unclear_mislabelled['human1_manual_label'].isin(['notacall', 'hyb', 'unk'])) | (~unclear_mislabelled['human2_manual_label'].isin(['notacall', 'hyb', 'unk']))),:]
other_cases = unclear_mislabelled.loc[~(unclear_mislabelled['human1_manual_label'].isin(['notacall', 'hyb', 'unk']) | unclear_mislabelled['human2_manual_label'].isin(['notacall', 'hyb', 'unk'])),:]
print("    ", other_cases.shape[0], " of these were labelled as one of the seven calltypes by both labellers")

# For how many of those does algo agree with at least one of the labellers?
other_cases['human1_algo_agree'] = [1 if x==y else 0 for x,y in zip(other_cases.human1_manual_label, other_cases.neighbor_1)]
other_cases['human2_algo_agree'] = [1 if x==y else 0 for x,y in zip(other_cases.human2_manual_label, other_cases.neighbor_1)]
other_cases['any_human_algo_agree'] = [1 if (x+y)>=1 else 0 for x,y in zip(other_cases.human1_algo_agree, other_cases.human2_algo_agree)]
per_some_agreement = (np.sum(other_cases['any_human_algo_agree'])/other_cases.shape[0])*100
print("      ", np.sum(other_cases['any_human_algo_agree']), " where algo agrees with any of the human labels (", round(per_some_agreement,2), "%)")

# How many of these were labelled as not-a-call or unknown?
difficult_cases = unclear_mislabelled.loc[(unclear_mislabelled['human1_manual_label'].isin(['notacall', 'hyb', 'unk']) | unclear_mislabelled['human2_manual_label'].isin(['notacall', 'hyb', 'unk'])),:]
print("    ", difficult_cases.shape[0], " of these where at least one labeller labelled as not-a-call/unknown/hybrid")

# For how many of those does algo agree with at least one of the labellers?
difficult_cases['human1_algo_agree'] = [1 if x==y else 0 for x,y in zip(difficult_cases.human1_manual_label, difficult_cases.neighbor_1)]
difficult_cases['human2_algo_agree'] = [1 if x==y else 0 for x,y in zip(difficult_cases.human2_manual_label, difficult_cases.neighbor_1)]
difficult_cases['any_human_algo_agree'] = [1 if (x+y)>=1 else 0 for x,y in zip(difficult_cases.human1_algo_agree, difficult_cases.human2_algo_agree)]
per_some_agreement = (np.sum(difficult_cases['any_human_algo_agree'])/difficult_cases.shape[0])*100
print("      ", np.sum(difficult_cases['any_human_algo_agree']), " where algo agrees with any of the human labels (", round(per_some_agreement,2), "%)")


80  where at least 1 labeller disagrees with original label
   30  of these were clearly mislabelled (both labellers agree)
     21  of these were labelled as one of the seven calltypes
       19  of these correctly labelled by algo ( 90.48 %)
     9  labelled as unk/not-a-call/hyb
   50  of these were not clearly mislabelled (labellers disagree)
     17  of these were labelled as one of the seven calltypes by both labellers
       12  where algo agrees with any of the human labels ( 70.59 %)
     33  of these where at least one labeller labelled as not-a-call/unknown/hybrid
       9  where algo agrees with any of the human labels ( 27.27 %)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [151]:
false_alarms = label_df.loc[label_df['any_disagree']==0,:]
false_alarms[['human1_manual_label', 'human2_manual_label', 'call_lable', 'neighbor_1','human1_comments', 'human2_comments']]

Unnamed: 0,human1_manual_label,human2_manual_label,call_lable,neighbor_1,human1_comments,human2_comments
2,mo,mo,mo,al,,cut off
4,soc,soc,soc,sn,,
7,al,al,al,mo,,alert call
16,cc,cc,cc,sn,nf,
18,mo,mo,mo,sn,,slightly weird one
19,sn,sn,sn,cc,,
20,mo,mo,mo,sn,probably nf,
29,soc,soc,soc,cc,,
32,soc,soc,soc,cc,,
40,mo,mo,mo,soc,,a bit hybridy


In [115]:
pd.set_option("display.max.rows", None)
label_df[['Call', 'call_lable', 'neighbor_1', 'human1_manual_label', 'human2_manual_label', 'human1_comments', 'human2_comments']]

Unnamed: 0,Call,call_lable,neighbor_1,human1_manual_label,human2_manual_label,human1_comments,human2_comments
0,HM_HMB_R11_AUDIO_file_5_(2017_08_24-06_44_59)_...,al,sn,al,sn,"most likely nf, hard to tell the type",
1,HM_HMB_R11_AUDIO_file_5_(2017_08_24-06_44_59)_...,al,sn,ukn,al,"most likely nf, hard to tell the type","aerial, cut off, nonfocal"
2,HM_HMB_R11_AUDIO_file_5_(2017_08_24-06_44_59)_...,mo,al,mo,mo,,cut off
3,HM_HMB_R11_AUDIO_file_5_(2017_08_24-06_44_59)_...,cc,sn,ukn,cc,might not even be a call,"really hard to tell, not even sure it's a call"
4,HM_HRT_R07_20170903-20170908_file_4_(2017_09_0...,soc,sn,soc,soc,,
5,HM_HRT_R09_AUDIO_file_4_(2017_08_23-06_44_59)_...,sn,cc,cc,cc,,
6,HM_HRT_R09_AUDIO_file_4_(2017_08_23-06_44_59)_...,sn,cc,cc,cc,,
7,HM_HRT_R09_AUDIO_file_4_(2017_08_23-06_44_59)_...,al,mo,al,al,,alert call
8,HM_HRT_R09_AUDIO_file_5_(2017_08_24-06_44_59)_...,ld,mo,mo,hyb,,a bit hybridy at the end (some trill that woul...
9,HM_HRT_R09_AUDIO_file_6_(2017_08_25-06_44_59)_...,soc,cc,hyb,soc,,
