# Resolve focal conflicts

Script to resolve conflicts in labeling (e.g. two calls that overlap in time, are highly
similar, and neither of them has been labelled nonfocal).


Requirements:
- "candidates_matches.json", containing all potential matches (generated with 01_identify_focal_conflicts)
- "candidates_labelfile.csv", containing audio and spectrogram of all calls involved in a match (generated with 01_identify_focal_conflicts)
- "f_nf.csv", a csv file containing all pairs of calls and their respective distance score (generated with 02_assign_distance_score)

Output:
- updates f_nf.csv with intensity scores
- generates "pred_labelfile.csv", containing the predictions for the candidate calls

In [1]:
import pandas as pd
import os
import numpy as np
import json
from scipy import stats
import math
from scipy.signal import butter, lfilter

In [2]:
# Read the server root from a local text file. The context manager guarantees
# the handle is closed even if reading fails.
with open('server_path.txt', "r") as f:
    SERVER = f.read().strip()

# Working directory that holds all inputs and outputs of the conflict resolution
HOME = SERVER + os.path.join(os.path.sep, 'EAS_shared',
                             'meerkat','working','processed',
                             'acoustic', 'resolve_conflicts')

# location of candidate files generated with 01_identify_focal_conflicts
CANDIDATES_MATCHES = os.path.join(os.path.sep, HOME,'candidates_matches.json')
CANDIDATES_LABELFILE = os.path.join(os.path.sep, HOME,'candidates_labelfile.csv')

# location of file with distance scores generated with 02_assign_distance_score
F_NF_FILE = os.path.join(os.path.sep, HOME,'f_nf.csv')

In [3]:
# Distance score cutoff for deciding which call pairs are same-call vs. different-call.
# NOTE(review): the assignment loops in this notebook select matches with a strict
# `dist_score < CUTOFF`, so a pair scoring exactly CUTOFF is NOT treated as
# same-call -- confirm whether `<` or `<=` is intended.
CUTOFF = 0.25

# Band edges of the bandpass filter applied before calculating audio intensity
# (passed as lowcut/highcut next to the sample rate in Hz, so presumably Hz -- confirm)
LOWCUT = 300.0
HIGHCUT = 3000.0

In [4]:
# Function that calculates intensity score from
# amplitude audio data
# Input: 1D numeric array-like (audio data)
# Output: Float (Intensity)
def calc_audio_intense_score(audio):
    """Return the mean signal power of `audio` on a decibel scale.

    The score is 10 * log10(mean(audio ** 2)).

    Parameters
    ----------
    audio : array-like of numbers
        Amplitude samples. Lists and integer arrays are accepted.

    Returns
    -------
    float
        Intensity score. An all-zero signal yields ``-inf`` (zero power),
        an empty signal yields ``nan``.
    """
    # Square in float64: squaring integer PCM (e.g. int16) in its own dtype
    # silently wraps around and produces a wrong mean power.
    samples = np.asarray(audio, dtype=np.float64)
    power = np.mean(samples ** 2)

    # log10(0) is undefined; report silence as -inf instead of raising a
    # "math domain error" in the middle of a batch run.
    if power == 0:
        return float("-inf")

    # log10 is more accurate than log(x, 10), which divides two natural logs.
    return 10 * math.log10(power)

# small helper function
def which_call_am_I(call, sub_df):
    """Tell on which side of a call pair `call` sits.

    Only the FIRST row of `sub_df` is inspected.

    Parameters
    ----------
    call : str
        Call ID to locate.
    sub_df : pandas.DataFrame
        Pair table with the columns ``call_a`` and ``call_b``.

    Returns
    -------
    tuple of (str, str)
        ``("a", "b")`` if `call` is the row's ``call_a``, ``("b", "a")`` if it
        is the row's ``call_b``. The first element is the side of `call`, the
        second the side of its partner.

    Raises
    ------
    ValueError
        If `call` is on neither side of the first row (previously this
        surfaced as an UnboundLocalError on the return statement).
    """
    if call == sub_df.call_a.values[0]:
        return "a", "b"
    if call == sub_df.call_b.values[0]:
        return "b", "a"
    raise ValueError(
        "call %r is neither call_a nor call_b in the first row of sub_df" % (call,)
    )

# Butterworth bandpass design, adapted from the SciPy cookbook:
# https://scipy-cookbook.readthedocs.io/items/ButterworthBandpass.html
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth bandpass filter.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper band edge, in the same unit as `fs`.
    fs : float
        Sampling rate of the signal.
    order : int, optional
        Filter order handed to `scipy.signal.butter` (default 5).

    Returns
    -------
    (b, a)
        Numerator and denominator coefficients of the filter.
    """
    # scipy expects the band edges as fractions of the Nyquist frequency
    nyquist = 0.5 * fs
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Bandpass-filter `data` with a Butterworth filter.

    The coefficients come from `butter_bandpass(lowcut, highcut, fs, order)`
    and are applied in a single forward pass with `scipy.signal.lfilter`.

    Returns the filtered signal.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)

# Read in data

In [None]:
# Progress message marking the start of the data-loading stage
print("Reading in data...")

In [5]:
# labelfile of all candidate matching calls (tab-separated); later cells look
# rows up by its callID_new column and read 'raw_audio' and 'samplerate_hz'
# NOTE(review): 'raw_audio' is later filtered as a numeric signal, but array-valued
# cells read from a CSV normally arrive as strings -- confirm how this column is
# stored and parsed.
labelfile = pd.read_csv(CANDIDATES_LABELFILE, sep="\t")
labelfile.shape

(4270, 7)

In [6]:
# dictionary of all candidate calls and their potential matches
# (1 call can have multiple matches)
# Loaded from the JSON written by 01_identify_focal_conflicts; only its size is
# inspected in this cell.
with open(CANDIDATES_MATCHES, "r") as file:  
    matches = json.load(file)
len(matches)

4270

In [7]:
# table with all call pairs and their respective distance scores
# (columns used later on: call_a, call_b, dist_score)
# NOTE(review): the displayed table carries 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# which look like index columns written by earlier saves -- confirm whether they
# can be dropped.
f_nf = pd.read_csv(F_NF_FILE, sep="\t")

In [8]:
# Display the pair table for a visual check
f_nf

Unnamed: 0.1,Unnamed: 0,call_a,call_b,dist_score
0,0,20170806_VCVM001_01_11_05_400_0_00_065_sn,20170806_VHMM003_01_10_57_645_0_00_063_sn,0.608889
1,1,20170806_VCVM001_01_11_05_645_0_00_068_sn,20170806_VHMM003_01_10_57_887_0_00_071_sn,0.890803
2,2,20170806_VCVM001_01_11_05_880_0_00_062_sn,20170806_VHMM003_01_10_58_067_0_00_058_sn,0.480133
3,3,20170806_VCVM001_01_11_05_880_0_00_062_sn,20170806_VHMM003_01_10_58_153_0_00_069_sn,0.666613
4,4,20170806_VCVM001_01_12_26_129_0_00_064_sn_x,20170806_VHMM002_01_12_08_899_0_00_066_sn,0.965364
...,...,...,...,...
2671,2671,20190809_VLM234_01_20_38_163_0_00_043_sn,20190809_VLM245_01_21_33_006_0_00_054_sn,1.088044
2672,2672,20190809_VLM234_01_20_38_228_0_00_043_sn,20190809_VLM245_01_21_33_006_0_00_054_sn,1.034063
2673,2673,20190809_VLM234_01_20_38_290_0_00_035_sn,20190809_VLM245_01_21_33_098_0_00_036_sn,0.666634
2674,2674,20190809_VLM234_01_20_38_402_0_00_038_sn,20190809_VLM245_01_21_33_098_0_00_036_sn,0.198532


## Calculate intensity scores

In [None]:
# Progress message marking the start of the intensity calculation
print("Calculating audio intensity scores...")

In [7]:
# Using the band-pass filtered signal!
#
# Each row is processed once, in labelfile order. The previous version looked
# every call up again with two boolean scans of the whole table (quadratic in
# the number of calls) and, for a repeated callID_new, would have reused the
# audio of the first matching row for all of its duplicates.
# NOTE(review): 'raw_audio' must hold numeric sample arrays at this point --
# confirm, since the labelfile is read from a CSV.

clean_intensity = []

for call, audio, sr in zip(labelfile['callID_new'],
                           labelfile['raw_audio'],
                           labelfile['samplerate_hz']):
    # restrict the signal to the LOWCUT-HIGHCUT band before measuring power
    y = butter_bandpass_filter(audio, LOWCUT, HIGHCUT, sr, order=6)
    clean_intensity.append(calc_audio_intense_score(y))

labelfile['intense_score'] = clean_intensity

# lookup table: call ID -> intensity score
intense_dict = dict(zip(labelfile.callID_new.values, labelfile.intense_score.values))

## Add intensity scores to labelfile and f_nf dataframe

In [8]:
# Attach the intensity of both calls of every pair, then write the table back
# to the file it was read from (tab-separated, without the index).
for side in ("a", "b"):
    f_nf['intense_' + side] = [intense_dict[call_id] for call_id in f_nf['call_' + side]]

f_nf.to_csv(F_NF_FILE, sep="\t", index=False)

# Assign focal or nonfocal

Make dataframe of all call pairs, their distance scores and intensity scores.

Then assign calls focal or nonfocal following these hierarchical steps:

- 1) Assignment of calls with no high fidelity match (e.g. that were not recognized as "same-call")
- 2) Assignment of calls that have only one high fidelity match
- 3) Assignment of calls that have multiple high fidelity matches

In [None]:
# Progress message marking the start of the focal/nonfocal assignment
print("Assigning focal/nonfocal...")

In [9]:
# call ID -> prediction (0: assigned focal, 1: assigned nonfocal)
pred_nonFocal={}
# call ID -> short reason code stating WHY the prediction was made
why_pred = {}
# call ID -> additional detail on the reason (e.g. partner ID or intensity values)
pred_comment= {}

### 1) Case: Calls with no high fidelity match

--> can be assigned focal (why: "no_match")

In [10]:
# Step 1: a call without any pair below the distance cutoff has no
# high-fidelity match and is predicted focal.
for call in labelfile.callID_new.values:
    # leave calls alone that already carry a prediction
    if call in pred_nonFocal:
        continue

    # every pair this call takes part in, on either side
    involved = f_nf.loc[(f_nf['call_a'] == call) | (f_nf['call_b'] == call), :]
    # of those, the pairs close enough to count as a match
    matched = involved.loc[involved.dist_score < CUTOFF, :]

    if len(matched) == 0:
        pred_nonFocal[call] = 0
        why_pred[call] = "no_match"
        pred_comment[call] = "no further info"

print(len(pred_nonFocal), " calls assigned after step 1) no high fidelity match")

1863  calls assigned after step 1) no high fidelity match


### 2) Case: Only 1 high fidelity match 

    2a) the match partner is already assigned f/nf
        --> assign call to the opposite (why: "partner_assigned")

    2b) the match partner is not yet assigned
         --> assign to nf if weaker (or equally strong), assign to f if stronger (why: "weaker_1_match" or "stronger_1_match")

In [11]:
# Step 2: calls with exactly one high-fidelity match.
# Predictions made earlier in this same loop count as "already assigned" for
# calls visited later, so the iteration order matters.
for call in labelfile.callID_new.values:
    # leave calls alone that already carry a prediction
    if call in pred_nonFocal:
        continue

    # every pair this call takes part in, on either side
    involved = f_nf.loc[(f_nf['call_a'] == call) | (f_nf['call_b'] == call), :]
    # of those, the pairs close enough to count as a match
    matched = involved.loc[involved.dist_score < CUTOFF, :]

    # this step only handles calls with exactly one match
    if matched.shape[0] != 1:
        continue

    me, other = which_call_am_I(call, matched)
    other_call = matched['call_' + other].values[0]

    # partner already decided -> take the opposite label
    if other_call in pred_nonFocal:
        pred_nonFocal[call] = np.abs(1 - pred_nonFocal[other_call])
        why_pred[call] = "partner_assigned"
        pred_comment[call] = other_call + ": " + why_pred[other_call]
        continue

    # partner undecided -> the weaker (or equally strong) call is nonfocal
    own_score = intense_dict[call]
    partner_score = intense_dict[other_call]
    is_weaker = own_score <= partner_score

    pred_nonFocal[call] = 1 if is_weaker else 0
    why_pred[call] = "weaker_1_match" if is_weaker else "stronger_1_match"
    pred_comment[call] = (str(round(own_score, 2)) + " vs. "
                          + str(round(partner_score, 2)) + " (" + other_call + ")")

print(len(pred_nonFocal), " calls assigned after step 2) ONE high fidelity match")

3895  calls assigned after step 2) ONE high fidelity match


### 3)  Case: Multiple high fidelity matches

    3a) of which at least ONE (should be only one) is already known to be the focal one
        -->  assign call to nonfocal (why: "match_with_a_focal")

    3b) of which NONE is known to be focal, but

        3b1) I am the strongest
            --> assign call to focal (why: "strongest_in_multiple")

        3b2) I am not the strongest
            --> assign call to nonfocal (why: "not_strongest_in_multiple")

In [12]:
# Step 3: calls with more than one high-fidelity match.
# Predictions made earlier in this same loop are visible to calls visited
# later, so the iteration order matters.
for call in labelfile.callID_new.values:
    # leave calls alone that already carry a prediction
    if call in pred_nonFocal:
        continue

    # every pair this call takes part in, on either side
    involved = f_nf.loc[(f_nf['call_a'] == call) | (f_nf['call_b'] == call), :]
    # of those, the pairs close enough to count as a match
    matched = involved.loc[involved.dist_score < CUTOFF, :]

    # this step only handles calls with several matches
    if matched.shape[0] <= 1:
        continue

    # all calls appearing in the matched pairs (this call included) ...
    all_ids = list(set(list(matched.call_a.values) + list(matched.call_b.values)))
    # ... and the same without this call
    all_partners = [x for x in all_ids if x != call]

    # a) at least one partner is already predicted focal -> this call is nonfocal
    focal_partner = [p for p in all_partners
                     if p in pred_nonFocal and pred_nonFocal[p] == 0]
    if focal_partner:
        pred_nonFocal[call] = 1
        why_pred[call] = "match_with_a_focal"
        pred_comment[call] = focal_partner[0]
        continue

    # b) no partner is predicted focal -> the strongest call of the group wins
    strongest = np.max([intense_dict[x] for x in all_ids])
    if intense_dict[call] == strongest:
        # b1) I am the strongest
        pred_nonFocal[call] = 0
        why_pred[call] = "strongest_in_multiple"
        pred_comment[call] = str(round(intense_dict[call], 2))
    else:
        # b2) I am not the strongest
        pred_nonFocal[call] = 1
        why_pred[call] = "not_strongest_in_multiple"
        pred_comment[call] = str(round(intense_dict[call], 2)) + " vs." + str(round(strongest, 2))

print(len(pred_nonFocal), " calls assigned after step 3) multiple high fidelity matches")

4270  calls assigned after step 3) multiple high fidelity matches


# Save results

In [13]:
# Copy the three per-call result dictionaries into labelfile columns, in row order.
for column, lookup in (('pred_nonFocal', pred_nonFocal),
                       ('pred_why', why_pred),
                       ('pred_comment', pred_comment)):
    labelfile[column] = [lookup[call_id] for call_id in labelfile.callID_new]

In [19]:
# The array-valued columns are dropped so the table can be saved as a
# tab-separated text file.
pred_labelfile_out = os.path.join(os.path.sep, HOME,'pred_labelfile.csv')
array_columns = ["raw_audio", 'denoised_spectrograms', 'spectrograms']
labelfile_flat = labelfile.drop(columns=array_columns)
labelfile_flat.to_csv(pred_labelfile_out, sep="\t", index=False)

In [None]:
# Progress message marking the end of the processing cells
print("Done.")

# [Check performance]

In [14]:
# Number of calls per predicted label
print("0: assigned focal, 1: assigned nonfocal")
print(labelfile.pred_nonFocal.value_counts())

0    3006
1    1264
Name: pred_nonFocal, dtype: int64

In [15]:
#pd.crosstab(index=labelfile['pred_why'], columns=labelfile['pred_nonFocal'])

pred_nonFocal,0,1
pred_why,Unnamed: 1_level_1,Unnamed: 2_level_1
match_with_a_focal,0,164
no_match,1863,0
not_strongest_in_multiple,0,75
partner_assigned,477,480
stronger_1_match,530,0
strongest_in_multiple,136,0
weaker_1_match,0,545
