# Introduction

Reading in long wav files containing meerkat vocalisations and csv files containing the respective labeling. Wav files contain many vocalizations and periods of silence, and label files (CSV) indicate at what time vocalisations occur (and what type of vocalisation they are). The result is a dataframe with n_rows = number of vocalisations, metadata and the call audio data.

## Prerequisites

- Project folder must contain subfolder called "in_labels", containing all label tables in csv format 
- Project folder must contain subfolder called "in_wavs", containing all audio files

### Installing and loading libraries

(Software installations within Google Colaboratory are not persistent: the libraries need to be reinstalled every time you (re-)connect to an instance.)

In [1]:
import os
import pandas as pd
import sys
import re
import json
import librosa
import librosa.display
from IPython.display import Audio
import numpy as np
import statistics
import matplotlib.pyplot as plt
from datetime import time
from datetime import datetime
import glob
from pandas.core.common import flatten
import shutil
import pickle
from pathlib import Path
import datetime

### Setting constants

Setting project, input and output folders.

In [2]:
# All project folders are resolved relative to the parent directory of the
# current working directory.
wd = os.getcwd()
PROJECT_ROOT = str(Path(wd).parents[0])
RAW_DATA = os.path.join(PROJECT_ROOT, "data", "raw")

DATA = os.path.join(PROJECT_ROOT, "data", "processed")
LABELS_IN = os.path.join(RAW_DATA, "in_labels")
AUDIO_IN = os.path.join(RAW_DATA, "in_wavs")
LABELS_OUT = os.path.join(RAW_DATA, "labels")

# Create missing folders including their parents. DATA is included because
# the resulting dataframes are pickled into it further down.
for d in [DATA, LABELS_IN, AUDIO_IN, LABELS_OUT]:
    os.makedirs(d, exist_ok=True)

# Lookup wav filename -> channel index, used by get_audio for SOUNDFOC files.
# The file is opened in a with-block so the handle is closed after reading.
with open(os.path.join(RAW_DATA, "meerkat_channel.json")) as channel_file:
    MEERKAT_CHANNEL = json.load(channel_file)
MEERKAT_INFO_PATH = os.path.join(RAW_DATA, "Meerkat_info.csv")

In [4]:
# Constants for parsing label files:
# - START_COL / DUR_COL: column names in the labels CSV that hold the start
#   time and the duration of each labelled segment.
#   NOTE(review): prep_labels indexes the literals 'Start' and 'Duration'
#   directly rather than reading these two constants.
START_COL = 'Start'
DUR_COL = 'Duration'
# - GROUP: filename prefix; get_meerkatID strips GROUP + '_' from a filename
#   before reading the meerkat ID.
GROUP = 'HM'

## Functions

In [3]:
def rem_nonnumeric(str):
    """Drop every character that precedes the first digit of ``str``.

    Input: any string
    Output: the substring starting at the first digit, or the unchanged
            input when it contains no digit at all.
    """
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # that existing callers are unaffected.
    for index, symbol in enumerate(str):
        if symbol.isdigit():
            return str[index:]
    return str

def fileID_from_csv_filename(csv_in):
    """Return the csv filename cut off after its last digit.

    Input: csv_filename (not path!) (String)
    Output: csv_filename up to and including the last numeric character,
            e.g. 'HM_X_20190712_2.csv' -> 'HM_X_20190712_2'.

    The name is reversed so that rem_nonnumeric can strip the trailing
    non-digit characters (extension etc.), then reversed back.
    """
    trimmed_backwards = rem_nonnumeric(csv_in[::-1])
    return trimmed_backwards[::-1]
  
def get_time(timestring):
    """Parse a time string from a label file into a datetime object.

    Input: string containing a time, either 'H:M:S.fraction' or
           'M:S.fraction' (String)
    Output: datetime.datetime object (only the time fields are meaningful;
            the date part is whatever strptime fills in by default)
    Exits: raises SystemExit with a non-zero status and a message naming the
           offending value when no pattern matches.
    Example usage: dt = get_time("01:02:30.555")
    """
    time_patterns = ['%H:%M:%S.%f', '%M:%S.%f']
    for pattern in time_patterns:
        try:
            return datetime.datetime.strptime(timestring, pattern)
        except (ValueError, TypeError):
            # ValueError: string does not fit this pattern, try the next one.
            # TypeError: value is not a string at all (e.g. an empty CSV cell
            # read as NaN); handled like an unparsable string below.
            continue

    # Previously this exited with status 0 (success) and did not say which
    # value was at fault.
    sys.exit("Time string {!r} is not in an expected format ({})".format(
        timestring, " or ".join(time_patterns)))

def get_s(dt):
    """Convert the time-of-day fields of a datetime object to seconds.

    Input: datetime (datetime.datetime)
    Output: time in s (float), i.e. hours, minutes, seconds and
            microseconds combined
    Example usage: s = get_s(datetime_obj)
    """
    seconds = dt.microsecond / 1000000
    seconds = seconds + dt.second
    seconds = seconds + dt.minute * 60
    seconds = seconds + dt.hour * 60 * 60
    return seconds

def get_meerkatID(filename, group=None):
    """Get the meerkat ID (alphanumeric string) from a filename.

    The filename is expected to look like <group>_<meerkatID>_*.extension.

    Input: filename (String)
           group (String, optional): filename prefix without the trailing
           underscore; defaults to the module constant GROUP.
    Output: meerkat ID (String), i.e. the first '_'-separated token after
            the group prefix
    Example use: get_meerkatID('HM_VHMM003_HLT_AUDIO_R12_file_5_(2017_08_06-06_44_59)_ASWMUX221102.wav')
    """
    prefix = (GROUP if group is None else group) + '_'
    # Strip the prefix only at the start of the name. A plain str.replace
    # would also delete the same characters inside the ID itself
    # (e.g. 'HM_ABHM_...' would yield a mangled ID).
    if filename.startswith(prefix):
        filename = filename[len(prefix):]
    return filename.split('_')[0]

def get_datetime(filename):
    """Get the recording date from a filename.

    The date must appear in one of three layouts, tried in this order:
    YYYY-MM-DD, YYYY_MM_DD, YYYYMMDD. The first layout that occurs anywhere
    in the filename wins.

    Input: filename (String)
    Output: datetime.date object
    Raises: ValueError if the filename contains none of the layouts, or if
            the matched digits do not form a valid date.
    Example use: get_datetime('HM_VHMF001_HTB_R20_20190707-20190719_file_9_(2019_07_15-11_44_59)_155944')
    """
    # (regex, strptime format) pairs; raw strings avoid the invalid-escape
    # warnings that '\d' triggers in normal string literals.
    date_layouts = (
        (r'\d{4}-\d{2}-\d{2}', '%Y-%m-%d'),
        (r'\d{4}_\d{2}_\d{2}', '%Y_%m_%d'),
        (r'\d{4}\d{2}\d{2}', '%Y%m%d'),
    )
    for regex, date_format in date_layouts:
        match = re.search(regex, filename)
        if match:
            return datetime.datetime.strptime(match.group(), date_format).date()

    # Without this, the function used to fail with an UnboundLocalError.
    raise ValueError("No date found in filename {!r}".format(filename))


def prep_labels(csv_loc, wav_loc, fileID):
    """Produce a label table from a csv + wav pairing.

    Input: csv_file location (String), wav_file location (String),
           fileID (String)
    Output: labeltable (pd dataframe). The label column is renamed to
            'Name'. For non-empty tables the columns start_s, duration_s,
            stop_s, date, samplerate_hz, indv, original_wav and bout_number
            are added; an empty table is returned without them.
    Raises: ValueError if the label column cannot be identified.
    """
    # read in labels (the file is parsed as tab-separated)
    labels = pd.read_csv(csv_loc, sep="\t")

    # find name of column that contains the labels. Should contain 'Name'
    name_candidates = [col for col in labels.columns if 'Name' in col]
    if len(name_candidates) == 1:
        name_col = name_candidates[0]
    elif 'Name' in name_candidates:
        # several headers contain 'Name' but one is exactly the target name
        name_col = 'Name'
    else:
        # Previously this only printed a message and then crashed in
        # rename() because name_col was still a list.
        raise ValueError(
            "Cannot find label name column in {} (candidates: {})".format(
                csv_loc, name_candidates))

    n_rows = labels.shape[0]
    if n_rows != 0:
        # Add start stop s
        labels['start_s'] = labels['Start'].map(lambda t: get_s(get_time(t)))
        labels['duration_s'] = labels['Duration'].map(lambda t: get_s(get_time(t)))
        labels['stop_s'] = labels['start_s'] + labels['duration_s']

        # Add additional data (scalars are broadcast to every row)
        labels['date'] = get_datetime(fileID).strftime("%Y-%m-%d")
        labels['samplerate_hz'] = librosa.get_samplerate(wav_loc)
        labels['indv'] = get_meerkatID(fileID)
        labels["original_wav"] = wav_loc
        # running index of the row within this label file
        labels["bout_number"] = list(range(n_rows))

    return labels.rename(columns={name_col: 'Name'})

def get_audio(wav_loc, start_s, duration_s):
    """Get audio data (numpy array of amplitude) for one segment of a wav.

    If "SOUNDFOC" is in the path, the file is loaded with all channels and
    the channel holding the meerkat vocalizations is looked up in the
    MEERKAT_CHANNEL dictionary by wav filename (channel 0 if the file is not
    listed). Otherwise the file is loaded as mono.

    Input: Path to wav file (String),
           offset in s (Float),
           duration in s (Float)
    Output: Amplitude (1-D numpy array) at the file's native sampling rate
    """
    if "SOUNDFOC" in wav_loc:
        data, _ = librosa.load(wav_loc, offset=start_s, duration=duration_s, sr=None, mono=False)
        # A single-channel file comes back as a 1-D array even with
        # mono=False; indexing it with [channel, :] would raise IndexError.
        if data.ndim > 1:
            channel = MEERKAT_CHANNEL.get(os.path.basename(wav_loc), 0)
            data = np.asfortranarray(data[channel, :])
    else:
        data, _ = librosa.load(wav_loc, offset=start_s, duration=duration_s, sr=None)

    # Check the array dtype rather than the first sample, so that an empty
    # segment does not raise IndexError.
    # NOTE: this is a plain cast without rescaling; the original
    # int16_to_float32 helper is no longer part of this file.
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype('float32')
    return data

## Processing files

In [4]:
# Append a trailing slash so the folder names can be concatenated directly
# with filenames and glob patterns below.
LABELS_IN, AUDIO_IN, LABELS_OUT = (
    folder + "/" for folder in (LABELS_IN, AUDIO_IN, LABELS_OUT)
)

### Setting variables

Getting list of csvs and matching wavs

In [5]:
# Getting list of fileIDs, wavs and csvs (fileID, in_wav_loc and in_csv_loc).
# The three lists are index-aligned: entry i of each belongs to one recording.

# Where glob matching is case-insensitive (e.g. on Windows) both patterns
# return the same files, so de-duplicate; sorting makes the order reproducible.
in_csv_loc = sorted(set(glob.glob(LABELS_IN+'*.csv') + glob.glob(LABELS_IN+'*.CSV')))
csv_filenames = [os.path.basename(csv) for csv in in_csv_loc]

fileIDs = [fileID_from_csv_filename(csv_filename) for csv_filename in csv_filenames]

# Exactly one wav entry per fileID ("NA" if none was found). Flattening all
# glob hits would make the list longer than fileIDs whenever a fileID matches
# several wavs, shifting every later csv/wav pairing.
wav_matches = [sorted(glob.glob(AUDIO_IN+fileID+'*')) for fileID in fileIDs]
for fileID, matches in zip(fileIDs, wav_matches):
    if len(matches) > 1:
        print("Warning: "+str(len(matches))+" wav files match "+fileID+", using "+os.path.basename(matches[0]))
in_wav_loc = [matches[0] if matches else "NA" for matches in wav_matches]

### Removing bad quality files

Removing files with bad quality

In [39]:
# fileIDs of the recordings to exclude; the next cell removes their csv, wav
# and fileID entries from the processing lists.
badIDs = ['HM_VHMM007_LT_AUDIO_R11_file_5_(2017_08_06-06_44_59)_ASWMUX221163', 
           'HM_VHMM006_RT_AUDIO_R14_file_5_(2017_08_06-06_44_59)_ASWMUX221052']

In [40]:
# in_csv_loc, in_wav_loc and fileIDs are index-aligned, so the bad entries
# are dropped from all three in lockstep. Removing by value
# (list.remove) could delete the "NA" placeholder of a different file and
# raised ValueError when the cell was run twice or an ID was not present.
if not (len(in_csv_loc) == len(in_wav_loc) == len(fileIDs)):
    raise ValueError("in_csv_loc, in_wav_loc and fileIDs must have the same length")

bad_id_set = set(badIDs)
bad_csvs = [csv_path for csv_path, fileID in zip(in_csv_loc, fileIDs) if fileID in bad_id_set]
bad_wavs = [wav_path for wav_path, fileID in zip(in_wav_loc, fileIDs) if fileID in bad_id_set]

kept = [(csv_path, wav_path, fileID)
        for csv_path, wav_path, fileID in zip(in_csv_loc, in_wav_loc, fileIDs)
        if fileID not in bad_id_set]
in_csv_loc = [entry[0] for entry in kept]
in_wav_loc = [entry[1] for entry in kept]
fileIDs = [entry[2] for entry in kept]

In [41]:
# Number of recordings left after excluding the bad-quality files
print("{} files remaining".format(len(fileIDs)))

114 files remaining


### Generating label files

In [42]:
# Write one "<fileID>_labels.csv" into LABELS_OUT for every recording that
# has both a wav file and at least one labelled row.
for csv, wav, fileID in zip(in_csv_loc, in_wav_loc, fileIDs):
    if wav == 'NA':
        print("No wav for "+fileID)
        continue

    print("Processing "+fileID)
    # Parse labels
    labels = prep_labels(csv, wav, fileID)

    if labels.shape[0] == 0:
        print("No labelled calls for "+fileID)
        continue

    labels.to_csv(LABELS_OUT+fileID+"_labels.csv", index=False)

Processing HM_VHMM023_MBLS_R02_20190707-20190719_file_8_(2019_07_14-11_44_59)_145944
Processing HM_VCVM001_SOUNDFOC_20190712_2
Processing HM_VHMM021_MBLT_R01_20190707-20190719_file_6_(2019_07_12-11_44_59)_125944
Processing HM_VHMM014_LSTB_R19_20190707-20190719_file_6_(2019_07_12-11_44_59)_125944
Processing HM_VHMF015_RTTB_R05_20190707-20190719_file_8_(2019_07_14-11_44_59)_145944
Processing HM_VHMM017_RSTB_R23_20190708-20190720_file_9_(2019_07_15-11_44_59)_155944
Processing HM_VHMM023_MBLS_R02_20190707-20190719_file_6_(2019_07_12-11_44_59)_125944
Processing HM_VHMF001_HTB_R20_20190707-20190719_file_10_(2019_07_16-11_44_59)_165944
Processing HM_VHMM023_MBLS_R02_20190707-20190719_file_9_(2019_07_15-11_44_59)_155944
Processing HM_VHMM008_SHTB_R14_20190707-20190719_file_9_(2019_07_15-11_44_59)_155944
Processing HM_VHMF022_MBRS_R22_20190707-20190719_file_7_(2019_07_13-11_44_59)_135944
Processing HM_RT_R12_file_5_(2017_08_24-06_44_59)_ASWMUX221102
Processing HM_VHMM017_RSTB_R23_20190708-20190

In [43]:
# Counts every entry in LABELS_OUT, including files left over from earlier runs
n_label_files = len(glob.glob(LABELS_OUT+'*'))
print("Generated {} label files".format(n_label_files))

Generated 108 label files


### Get updated list of wav and labels file locations

In [44]:
# List of all wav filepaths (to full wav files, not just the calls), where I have
# a labels file in LABELS_OUT. Therefore, do it kind of backwards.

labels_loc = glob.glob(LABELS_OUT+'*.csv')
fileIDs = [os.path.basename(item).replace('_labels.csv', '') for item in labels_loc]

# Keep exactly one wav entry per fileID ("NA" if none was found) so that
# wavs_loc stays index-aligned with fileIDs even when a fileID matches
# several wav files; flattening all glob hits would shift later entries.
wav_matches = [sorted(glob.glob(AUDIO_IN+fileID+'*')) for fileID in fileIDs]
wavs_loc = [matches[0] if matches else "NA" for matches in wav_matches]

# check for NAs, should be none!
for ID, wav in zip(fileIDs, wavs_loc):
    if wav=="NA":
        print("Error, wav file missing for "+ID)

### Make a dataframe for the whole dataset

In [186]:
# Read every generated label file and stack them into one dataframe.
# The row index of each file is kept as-is here (not renumbered).
df_list = []

for label_loc in labels_loc:
    label_filename = os.path.basename(label_loc)
    print("Reading label df for "+label_filename)
    label_table = pd.read_csv(label_loc)
    df_list.append(label_table)

df = pd.concat(df_list)

Reading label df for HM_VHMM007_LSLT_R17_20190707-20190719_file_10_(2019_07_16-11_44_59)_165944_labels.csv
Reading label df for HM_VHMF015_RTTB_R05_20190707-20190719_file_7_(2019_07_13-11_44_59)_135944_labels.csv
Reading label df for HM_VHMM003_SOUNDFOC_20170824_2_labels.csv
Reading label df for HM_VHMM017_RSTB_R23_20190708-20190720_file_6_(2019_07_12-11_44_59)_125944_labels.csv
Reading label df for HM_VCVM001_SOUNDFOC_20190716_2_labels.csv
Reading label df for HM_VHMF010_SOUNDFOC_20190715_labels.csv
Reading label df for HM_VHMM003_SOUNDFOC_20170825_3_labels.csv
Reading label df for HM_LT_R09_20170903-20170908_file_2_(2017_09_03-05_44_59)_ASWMUX221110_labels.csv
Reading label df for HM_VHMM014_LSTB_R19_20190707-20190719_file_8_(2019_07_14-11_44_59)_145944_labels.csv
Reading label df for HM_VHMM021_MBLT_R01_20190707-20190719_file_7_(2019_07_13-11_44_59)_135944_labels.csv
Reading label df for HM_VHMM008_SHTB_R14_20190707-20190719_file_10_(2019_07_16-11_44_59)_165944_labels.csv
Reading la

In [187]:
df.shape 

(46009, 15)

### Cleaning dataset

#### Adding additional columns for noisy, uncertain calls

In [188]:
# Work on a copy of the raw label text; the 'Name' column itself is left
# untouched while 'call_lable' is cleaned and categorized below.
df['call_lable'] = df['Name'].copy()

In [189]:
# Each marker character is first turned into a flag column and then removed
# from the label text, so later checks see the already-cleaned text.
raw_labels = list(df['call_lable'])

# '#': unsure whether it is a call at all
df['unsure_call'] = [int('#' in label) for label in raw_labels]
without_hash = [label.replace("#", "") for label in raw_labels]

# 'x' / 'X': noisy call. Note that every x/X is removed from the label text.
df['noisy'] = [int('x' in label or 'X' in label) for label in without_hash]
without_noise = [label.replace("X", "").replace("x", "") for label in without_hash]

# '?': uncertain about the call type
df['certain_yn'] = ["n" if '?' in label else "y" for label in without_noise]
cleaned_labels = [label.replace("?", "") for label in without_noise]
df['call_lable'] = cleaned_labels

# Microphone (Soundfoc) recordings are recognised by the wav path
df['soundfoc_yn'] = ['y' if "SOUNDFOC" in wav_path else 'n' for wav_path in df['original_wav']]

# Nonfocal calls are recognised by a marker substring in the cleaned label
nonfocal_markers = ['nf', 'NF', 'NONFOC', 'nonfoc']
df['nonfocal_yn'] = [
    'y' if any(marker in label for marker in nonfocal_markers) else 'n'
    for label in cleaned_labels
]

#### Categorize labels


Ignore "?", "!","*" and "%" (? expresses uncertainty about label, * expresses uncertainty about nonfocal or focal, don't know what ! means, % means overlapping calls)

In [190]:
lablenames = df['call_lable'].copy()

# Raw string: '\?', '\*' and '\!' are invalid escape sequences in a normal
# string literal and trigger a warning on newer Python versions. Inside a
# character class these characters need no escaping.
labelsep = re.compile(r'[?*!%]') # Match ? * ! %
lablenames = [labelsep.sub('', i) for i in lablenames]

String split and use first word for categorization based on dictionary

In [191]:
# Call category -> label spellings that are mapped onto it.
# NOTE(review): the lookup in the next cell is an exact match on the first
# space-separated word, so entries containing a space ("s ", "so ") or regex
# syntax ("^s$") presumably never match.
call_types = {
    'cc': ["cc", "Marker", "Marque", "CC", "MARKER", "MARQUE"],
    'sn': ["sn", "subm", "short", "^s$", "s ", "SN", "SUBM", "SHORT", "S"],
    'mo': ["mo", "mov", "move", "MO", "MOV", "MOVE"],
    'agg': ["ag", "agg", "aggress", "chat", "growl",
            "AG", "AGG", "AGGRESS", "CHAT", "GROWL"],
    'ld': ["ld", "LD", "lead", "LEAD"],
    'soc': ["soc", "SOCIAL", "so ", "SOC", "social"],
    'al': ["al", "AL", "ALARM", "alarm"],
    'fu/hyb': ["fu", "FU", "Fu", 'HYB', 'hyb', 'Hyb'],
}

# Inverted lookup: label spelling -> call category
labels_to_calls = {
    spelling: category
    for category, spellings in call_types.items()
    for spelling in spellings
}

In [192]:
# Categorize each label by its first space-separated word; anything that is
# not a known spelling becomes "unknown".
firstword = [name.split(' ')[0] for name in lablenames]
lables = [labels_to_calls.get(word, "unknown") for word in firstword]

In [193]:
pd.Series(lables).value_counts()

cc         22648
unknown    11036
soc         4184
sn          3960
fu/hyb      1251
mo          1039
al           845
agg          818
ld           228
dtype: int64

In [194]:
# Replace the cleaned free-text labels with their call category
df['call_lable'] = lables

#### Removing all unknown calls (beep, synch etc..)

In [195]:
# Keep only rows whose label was mapped to a known call category
is_known_call = df['call_lable'] != "unknown"
df = df.loc[is_known_call, :]
df.shape

(34973, 21)

#### Removing zero-duration-calls

In [196]:
# Report how many rows have a duration of exactly zero, then keep only rows
# with a strictly positive duration.
has_zero_duration = df['duration_s'] == 0
print(int(has_zero_duration.sum()))
has_positive_duration = df['duration_s'] > 0
df = df.loc[has_positive_duration]
df.shape

18


(34955, 21)

In [197]:
df['call_lable'].value_counts()

cc        22634
soc        4181
sn         3960
fu/hyb     1251
mo         1039
al          845
agg         817
ld          228
Name: call_lable, dtype: int64

In [198]:
# Save the cleaned label table to DATA/meerkat_labelfile.pkl
df.to_pickle(os.path.join(os.path.sep, DATA, "meerkat_labelfile.pkl"))

[#### Removing unsure, uncertain, noisy and non-focal calls]

In [165]:
#df = pd.read_pickle(os.path.join(os.path.sep, DATA, "meerkat_labelfile.pkl"))

In [166]:
#print("Full: ", df.shape)
#df = df.loc[df['unsure_call']==0]
#print("Removed unsure: ", df.shape)
#df = df.loc[df['certain_yn']=="y"]
#print("Removed uncertain: ",df.shape)
#df = df.loc[df['nonfocal_yn']=="n"]
#print("Removed nonfocal: ",df.shape)
#df = df.loc[df['noisy']==0]
#print("Removed noisy: ",df.shape)

In [167]:
#df['call_lable'].value_counts()

cc        22648
soc        4184
sn         3960
fu/hyb     1251
mo         1039
al          845
agg         818
ld          228
Name: call_lable, dtype: int64

## Sorting and saving
Add index, sort values, give callID and save dataframe:

In [199]:
# Order rows by recording and by position within the recording, then
# renumber the index 0..n-1 to follow the new order.
sort_keys = ['original_wav', 'bout_number']
df = df.sort_values(by=sort_keys)
df.index = np.arange(df.shape[0])

In [200]:
# callID = wav filename without its extension + '___' + start time string.
# os.path.splitext removes the real extension whatever its length; slicing
# off a fixed 4 characters is only correct for 3-letter extensions.
df['callID'] = [os.path.splitext(os.path.basename(x))[0]+'___'+str(y) for x,y in zip(df['original_wav'], df['Start'])]

In [201]:
df

Unnamed: 0,Name,Start,Duration,Time Format,Type,Description,start_s,duration_s,stop_s,date,...,original_wav,bout_number,Time.Format,call_lable,unsure_call,noisy,certain_yn,soundfoc_yn,nonfocal_yn,callID
0,CCX,1:00:06.330,0:00.138,decimal,Cue,,3606.330,0.138,3606.468,2017-08-23,...,/home/mthomas/Documents/MPI_work/projects/meer...,1,,cc,0,1,y,n,n,HM_HMB_R11_AUDIO_file_4_(2017_08_23-06_44_59)_...
1,CC,1:00:15.043,0:00.083,decimal,Cue,,3615.043,0.083,3615.126,2017-08-23,...,/home/mthomas/Documents/MPI_work/projects/meer...,4,,cc,0,0,y,n,n,HM_HMB_R11_AUDIO_file_4_(2017_08_23-06_44_59)_...
2,CC,1:00:35.627,0:00.191,decimal,Cue,,3635.627,0.191,3635.818,2017-08-23,...,/home/mthomas/Documents/MPI_work/projects/meer...,5,,cc,0,0,y,n,n,HM_HMB_R11_AUDIO_file_4_(2017_08_23-06_44_59)_...
3,CCX,1:00:39.664,0:00.136,decimal,Cue,,3639.664,0.136,3639.800,2017-08-23,...,/home/mthomas/Documents/MPI_work/projects/meer...,6,,cc,0,1,y,n,n,HM_HMB_R11_AUDIO_file_4_(2017_08_23-06_44_59)_...
4,CCX,1:00:47.877,0:00.245,decimal,Cue,,3647.877,0.245,3648.122,2017-08-23,...,/home/mthomas/Documents/MPI_work/projects/meer...,7,,cc,0,1,y,n,n,HM_HMB_R11_AUDIO_file_4_(2017_08_23-06_44_59)_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34950,SN,12:51.564,0:00.068,decimal,Cue,,771.564,0.068,771.632,2017-08-25,...,/home/mthomas/Documents/MPI_work/projects/meer...,124,,sn,0,0,y,y,n,HM_VLF206_SOUNDFOC_20170825_2.WAV___12:51.564
34951,CC,12:52.519,0:00.177,decimal,Cue,,772.519,0.177,772.696,2017-08-25,...,/home/mthomas/Documents/MPI_work/projects/meer...,125,,cc,0,0,y,y,n,HM_VLF206_SOUNDFOC_20170825_2.WAV___12:52.519
34952,CC NONFOC (PET)?,12:55.351,0:00.242,decimal,Cue,,775.351,0.242,775.593,2017-08-25,...,/home/mthomas/Documents/MPI_work/projects/meer...,126,,cc,0,0,n,y,y,HM_VLF206_SOUNDFOC_20170825_2.WAV___12:55.351
34953,CC NONFOC (PET),13:00.884,0:00.315,decimal,Cue,,780.884,0.315,781.199,2017-08-25,...,/home/mthomas/Documents/MPI_work/projects/meer...,127,,cc,0,0,y,y,y,HM_VLF206_SOUNDFOC_20170825_2.WAV___13:00.884


## Adding metadata
Adding info on meerkats

In [204]:
df.columns

Index(['Name', 'Start', 'Duration', 'Time Format', 'Type', 'Description',
       'start_s', 'duration_s', 'stop_s', 'date', 'samplerate_hz', 'indv',
       'original_wav', 'bout_number', 'Time.Format', 'call_lable',
       'unsure_call', 'noisy', 'certain_yn', 'soundfoc_yn', 'nonfocal_yn',
       'callID', 'raw_audio'],
      dtype='object')

In [208]:
sorted(list(set(df['indv'])))


meerkat_info = pd.read_csv(MEERKAT_INFO_PATH, sep=";")
meerkat_names = meerkat_info['Code']

# Remove '+' signs and whitespace from the dye marks so they can be compared
# with the individual IDs in df['indv']. Raw string: '\+' is an invalid
# escape sequence in a normal string literal.
labelsep = re.compile(r'[+\s]')
meerkat_dyemarks = [labelsep.sub('', i) for i in list(meerkat_info['Dye mark'])]
meerkat_info['Dye mark'] = meerkat_dyemarks

# Dictionary: meerkat code -> {remaining info columns}
meerkat_dict = meerkat_info.drop(columns=['Code', 'Name']).T
meerkat_dict.columns = meerkat_names
meerkat_dict = meerkat_dict.to_dict()

# Some recordings are named after the dye mark instead of the code: register
# those dye marks as additional keys pointing at the same info entry.
for ind in sorted(set(df['indv'])):
    if ind in meerkat_dict:
        continue
    if ind not in meerkat_dyemarks:
        print(ind, "not in dyemarks")
        continue
    ind_code = meerkat_info.loc[meerkat_info['Dye mark'] == ind, 'Code'].values[0]
    if ind_code in meerkat_dict:
        meerkat_dict[ind] = meerkat_dict[ind_code]
        print(ind, "is actually ", ind_code)

HMB is actually  VCVM001
HRT not in dyemarks
HTB is actually  VHMF001
LT is actually  VHMF031
PET not in dyemarks
RT is actually  VHMF030
VHMM002 not in dyemarks
VHMM003 not in dyemarks
VLF206 not in dyemarks


In [210]:
# Manually entered individuals that the lookup above could not resolve.
# 'PET' shares the entry of 'VCVM001'.
meerkat_dict['PET'] = meerkat_dict['VCVM001']
meerkat_dict.update({
    'VLF206': {
        'DOB': '21.08.2013',
        'Dye mark': 'RCRSTBL',
        'Sex': 'F',
        'Status': 'Adult',
    },
    'VHMM002': {
        'DOB': '27.10.16',
        'Dye mark': 'HRT',
        'Sex': 'M',
        'Status': 'Dominant',
    },
    'VHMM003': {
        'DOB': '27.10.16',
        'Dye mark': 'HLT',
        'Sex': 'M',
        'Status': 'Adult',
    },
})

# Every individual that is still unknown gets an all-'NA' placeholder entry
for ind in sorted(set(df['indv'])):
    meerkat_dict.setdefault(ind, {'DOB': 'NA',
                                  'Dye mark': 'NA',
                                  'Sex': 'NA',
                                  'Status': 'NA'})

In [211]:
# Look up sex and status of each row's individual in meerkat_dict.
# Note the column names: 'Sex' is capitalised, 'status' is lower-case.
sex = [meerkat_dict[ind]['Sex'] for ind in df['indv']]
df['Sex'] = sex
status = [meerkat_dict[ind]['Status'] for ind in df['indv']]
df['status'] = status

## Extract audio

In [202]:
# Cut each call out of its original wav: get_audio is called once per row
# with the row's wav path, start offset and duration (both in seconds).
raw_audio_data = df.apply(lambda row: get_audio(row['original_wav'], row['start_s'], row['duration_s']), axis=1)
df['raw_audio'] = raw_audio_data

## Save

In [9]:
# Save the final dataframe to DATA/meerkat_full_audio.pkl
df.to_pickle(os.path.join(os.path.sep, DATA, "meerkat_full_audio.pkl"))