# Extract audio and generate call txts

Notebook that extracts the audio of each meerkat call, based on labelfiles that give the start time and duration of every call in our standard format, and saves each call as a txt file.

In [2]:
import pandas as pd
import os
import numpy as np 
import datetime
import sys
import audiofile as af

In [3]:
# Ask for the location where the EAS server is mounted. Typical values:
#   /Volumes                               (on Mac)
#   //10.126.19.90                         (on Windows)
#   /home/username/Documents/MPI-Server    (when mounted in Documents)
SERVER = input(
    "Enter path to the EAS server (e.g. /Volumes or //10.126.19.90):"
)

Enter path to the EAS server (e.g. /Volumes or //10.126.19.90):/Volumes


In [4]:
# Every path used below is built from SERVER, so stop right here when it is
# not reachable. Raising (instead of print + exit()) halts "Run All" with a
# clear message and does not shut the kernel down.
if not os.path.exists(SERVER):
    raise FileNotFoundError("Invalid server path: " + SERVER)

# If someone put a slash or backslash in last position, drop it, because the
# sub-paths appended later start with a separator of their own.
if SERVER.endswith(("/", "\\")):
    SERVER = SERVER[:-1]

In [5]:
# Root of the meerkat project on the server. The leading os.path.sep makes the
# joined part start with a separator so it can be appended directly to SERVER
# (whose trailing separator was removed above).
_MEERKAT_ROOT = SERVER + os.path.join(os.path.sep, 'EAS_shared', 'meerkat')

# Working folder of this notebook (labelfile.csv and the txts folder live here)
HOME = os.path.join(_MEERKAT_ROOT, 'working', 'processed',
                    'acoustic', 'extract_calls')

# Folder with the labelfiles that are concatenated into one table
LABELFILES_FOLDER = os.path.join(_MEERKAT_ROOT, 'working', 'processed',
                                 'acoustic', 'total_synched_call_tables')

# Folder that is searched recursively for the raw wav files
AUDIOS_PATH = os.path.join(_MEERKAT_ROOT, 'archive', 'rawdata',
                           'MEERKAT_RAW_DATA')

# Table telling which channel of a stereo file holds the meerkat recording
CHANNEL_INFO_PATH = os.path.join(_MEERKAT_ROOT, 'working', 'METADATA',
                                 'soundfoc_channels.csv')

# Output folder for the call txts. Joined onto HOME only: an extra leading
# os.path.sep would re-root a relative HOME at the filesystem root.
TXT_PATH = os.path.join(HOME, 'txts')

In [6]:
# Report every required server location that cannot be found
# (this only prints; it does not stop the notebook).
required_paths = (HOME, CHANNEL_INFO_PATH, LABELFILES_FOLDER, AUDIOS_PATH)
for path in required_paths:
    if os.path.exists(path):
        continue
    print("Error: Path does not exist ", path)

In [8]:
# Make sure the output folder for the call txts is there.
# os.mkdir creates only this one level, so HOME itself must already exist.
txt_dir_missing = not os.path.exists(TXT_PATH)
if txt_dir_missing:
    os.mkdir(TXT_PATH)

In [9]:
def get_time(timestring):
    """
    Function that gets a datetime object from a timestring.
    The timestring must match one of the patterns in time_patterns
    ('%H:%M:%S.%f' or '%M:%S.%f'); the patterns are tried in that order.

    Parameters
    ----------
    timestring : String (or the number 0)
                 some string containing a time. A value that compares
                 equal to 0 is treated as time zero.

    Returns
    -------
    result : datetime object
             the time as datetime object (strptime fills in its default
             date, as no date is part of the patterns)

    Raises
    ------
    SystemExit
        with exit status 1 if the value matches none of the patterns
        (the offending value is printed first)

    Example
    -------
    >>> dt = get_time("01:02:30.555")

    """
    time_patterns = ['%H:%M:%S.%f', '%M:%S.%f']

    # A plain 0 cannot be parsed by strptime, so it is handled separately
    if timestring == 0:
        return datetime.datetime.strptime('0:00:00.00', '%H:%M:%S.%f')

    for pattern in time_patterns:
        try:
            return datetime.datetime.strptime(timestring, pattern)
        except (ValueError, TypeError):
            # ValueError: the string does not match this pattern
            # TypeError: the value is not a string at all (e.g. NaN)
            # Anything else (e.g. KeyboardInterrupt) is not swallowed.
            continue

    print("Date is not in expected format")
    print(timestring)
    # Non-zero status: stopping here is an error, not a regular exit
    sys.exit(1)


def get_s(dt):
    """
    Function that converts the time of day of a datetime object to seconds.
    Only hour, minute, second and microsecond are used; the date part
    is ignored.

    Parameters
    ----------
    dt : datetime object

    Returns
    -------
    result : float
             time in s
    """
    # Summed in this fixed order: sub-second part, seconds, minutes, hours
    seconds = dt.microsecond / 1000000 + dt.second
    seconds += dt.minute * 60
    seconds += dt.hour * 60 * 60
    return seconds


def replace_multiple(string, list_of_chars, replacement):
    """
    Function that substitutes several substrings of a string, one after
    the other, in the order given by list_of_chars.

    Parameters
    ----------
    string : String
             your input string
    list_of_chars: list of strings
                   The substrings that should be replaced
    replacement: string or list of strings
                 What to put in their place. A single string is used for
                 every entry of list_of_chars; a list is matched to
                 list_of_chars by position and should have the same length.

    Returns
    -------
    result : String
             The modified string

    Example
    -------
    >>> replace_multiple("This is an example", ['s', 'a'], '!')
    'Thi! i! !n ex!mple'

    >>> replace_multiple("This is an example", ['s', 'a'], ['S', 'A'])
    'ThiS iS An exAmple'
    """
    # One replacement for everything: repeat it once per target
    if type(replacement) is str:
        substitutes = [replacement] * len(list_of_chars)
    else:
        substitutes = replacement

    result = string
    for target, substitute in zip(list_of_chars, substitutes):
        if target not in result:
            continue
        result = result.replace(target, substitute)
    return result


In [10]:
def write_audio(wav_loc, start_s, duration_s, MEERKAT_CHANNEL, outdir, outname):
    """
    Function that extracts a chunk of audio data from a given wav file
    and saves the chunk as txt file (one sample per line, with the
    sampling rate in a "sr:<rate>" header line).

    Single-channel audio is written as it is. For multi-channel audio
    (e.g. the stereo SOUNDFOC recordings) only one channel is written:
    the one listed for the file's basename in MEERKAT_CHANNEL, or
    channel 0 if the file is not listed.

    Nothing is read or written if wav_loc is "NA" or duration_s is not
    greater than 0.

    Parameters
    ----------
    wav_loc : string
              path to wav file ("NA" if no file was found)
    start_s : float
              offset in s
    duration_s: float
                duration in s
    MEERKAT_CHANNEL: Dict
                    Dictionary containing channel info (0/1) for some
                    wav files (basename)
                    e.g. 'HM_VLF206_SOUNDFOC_20170825_2.WAV' : 0
                         'HM_VLF206_SOUNDFOC_20170825_3.WAV' : 1
    outdir : String
             path to directory, where txt file should be saved

    outname :  String
               filename of the resulting txt file (without ".txt")

    Returns
    -------
        String
        wav_loc input string plus " : pass" or " : failed_read".
        "failed_read" means that reading, channel selection or writing
        raised an error. "pass" is returned in all other cases, which
        includes rows that were skipped (see above).

    """
    error_ms = 'pass'

    # Skip rows without a located wav file or without a positive duration
    if (wav_loc != "NA") and (duration_s > 0):
        try:
            data, rate = af.read(wav_loc, offset=start_s, duration=duration_s)

            # Multi-channel audio is handled as a 2-D (channel, sample) array,
            # single-channel audio as a 1-D array. Checking the number of
            # dimensions (not shape[0]==2) keeps a 2-sample mono chunk from
            # being mistaken for stereo.
            if data.ndim == 2:
                wav_filename = os.path.basename(wav_loc)
                # int() so that a channel stored as 0.0/1.0 still indexes
                channel = int(MEERKAT_CHANNEL.get(wav_filename, 0))
                data = np.asfortranarray(data[channel, :])

            # dtype (not data[0]) so that an empty chunk does not fail here
            if np.issubdtype(data.dtype, np.integer):
                data = data.astype('float32')

            # Written for mono and multi-channel input alike
            txt_out_path = os.path.join(outdir, outname + '.txt')
            np.savetxt(txt_out_path, data, fmt='%.18f', header="sr:" + str(rate))

        except Exception:
            # Best effort: one unreadable file must not stop the whole run;
            # the failure is reported to the caller through the return value.
            error_ms = 'failed_read'

    return (wav_loc + " : " + error_ms)

# Preparations
## Read in data

In [56]:
# Read in all labelfiles
print("Making one giant labelfile...")

# Sorted, because os.listdir returns entries in arbitrary order and the row
# order of the combined table should be the same on every run.
# Hidden entries (e.g. ".DS_Store") and sub-folders are not labelfiles.
labelfiles_list = sorted(
    name for name in os.listdir(LABELFILES_FOLDER)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(LABELFILES_FOLDER, name))
)
if not labelfiles_list:
    raise ValueError("No labelfiles found in " + LABELFILES_FOLDER)

# Labelfiles are tab-separated and read as ISO-8859-1
df_list = [
    pd.read_csv(os.path.join(LABELFILES_FOLDER, name),
                sep="\t", encoding="ISO-8859-1")
    for name in labelfiles_list
]

# concatenate them to have one big labelfile containing all calls
# (sort=True orders the columns; the index is renumbered from 0)
labelfile = pd.concat(df_list, axis=0, sort=True).reset_index(drop=True)

print(labelfile.shape)

Making one giant labelfile...
(82264, 22)


## Find location of the wav files on server

In [None]:
# Add column containing the path to the actual wav file on the server
# that we need in order to extract that call
# (we have the wav filename, but not its location on the server)

In [12]:
# All distinct wav filenames referenced in the labelfile, in sorted order
wavs_we_need = sorted(set(labelfile.wavFileName))

# Collect the full path of every wav file below AUDIOS_PATH,
# leaving out hidden files (name starting with ".")
print("Searching wavs on server...")
listOfFiles = []
for folder, _subfolders, filenames in os.walk(AUDIOS_PATH):
    for filename in filenames:
        is_wav = filename.endswith((".wav", ".WAV"))
        if is_wav and filename[0] != ".":
            listOfFiles.append(os.path.join(folder, filename))
print("Done")

Searching wavs on server...
Done


In [57]:
# Create dictionary: wav filename -> path on the server ("NA" if not found)

# Index the server files by basename so that the exact filename can be looked
# up directly. setdefault keeps the first path if a basename occurs more than
# once (the first match in listOfFiles order is used).
path_by_basename = {}
for wav_path in listOfFiles:
    path_by_basename.setdefault(os.path.basename(wav_path), wav_path)

wav_dict = {}
no_wav_path = []

# Now assign a path to each wav_filename
for wav in wavs_we_need:
    # Prefer the file that is named exactly like wav. A plain substring test
    # on the whole path could also hit a folder name or a longer filename
    # that merely contains wav.
    match = path_by_basename.get(wav)

    if match is None:
        # Fall back to the first path that contains wav as a substring
        match = next((p for p in listOfFiles if wav in p), "NA")

    if match == "NA":
        # save all where no wav path was found in this list
        no_wav_path.append(wav)

    wav_dict[wav] = match

# One entry per wav in wavs_we_need, in the same order
wav_matches = [wav_dict[wav] for wav in wavs_we_need]

# print the missing
for wav in no_wav_path:
    print("Couldn't find ", wav, " in AUDIOS_PATH")

## Modifications in labelfile

In [58]:
# Modify labelfile to simplify writing the audio

# 1) Add path to wav file to dataframe ("NA" where no file was found)
labelfile['wav_loc'] = [wav_dict[x] for x in labelfile.wavFileName]

# 2) Make start and duration column in seconds (passed on to write_audio as
#    offset and duration). Converted column-wise: a row-wise apply over the
#    whole table does the same per value, only much slower.
labelfile['start_s'] = labelfile['t0File'].map(lambda t: get_s(get_time(t)))
labelfile['duration_s'] = labelfile['duration'].map(lambda t: get_s(get_time(t)))

# 3) Make new callID column, as callID currently contains
# some chars which make it difficult to use callID as file name
to_be_replaced = ["/", " ", ":", "."]
replace_with = "_"

new_callID = [replace_multiple(x, to_be_replaced, replace_with) for x in labelfile.callID]
labelfile['callID_new'] = new_callID

# 4) Remove duplicate rows (rows identical in every column; first one is kept)
labelfile = labelfile.drop_duplicates()

In [59]:
#labelfile.shape

(82264, 26)

## Save labelfile

In [27]:
# Save modified, complete labelfile (tab-separated, without the index column).
# Joined onto HOME only: an extra leading os.path.sep would re-root a
# relative HOME at the filesystem root.
labelfile_out_path = os.path.join(HOME, 'labelfile.csv')
labelfile.to_csv(labelfile_out_path, sep="\t", index=False)

# Generate call txts

In [29]:
# select only calls
#labelfile = labelfile.loc[labelfile.isCall==1,:]

In [73]:
# Check what txt files are already present
print("Checking which txt files already exist in ", TXT_PATH, "...")

# Only entries ending in ".txt" count; anything else in the folder
# (e.g. ".DS_Store") would otherwise show up as a mangled, bogus call ID.
txts = [name[:-4] for name in os.listdir(TXT_PATH) if name.endswith(".txt")]  # remove ".txt"
ids = list(labelfile.callID_new)

print("Found ", len(txts), " txt files")
print("Found ", len(ids), " rows")

# Which ones are in ids, but not in txts? (i.e. txt have not been generated yet)
missing = list(set(ids) - set(txts))
missing_df = labelfile.loc[labelfile.callID_new.isin(missing), :]
print(missing_df.shape[0], " rows without txt")

# Only run for calls with duration > 0
missing_df = missing_df.loc[missing_df.duration_s > 0, :]
print(missing_df.shape[0], " rows without txt after removing zero-duration rows")


# Only run for true calls
missing_df = missing_df.loc[missing_df.isCall == 1, :]
print(missing_df.shape[0], " rows without txt after removing non-call rows")

Checking which txt files already exist in  /Volumes/EAS_shared/meerkat/working/processed/acoustic/extract_calls/txts ...
Found  77790  txt files
Found  82264  rows
4474  rows without txt
2221  rows without txt after removing zero-duration rows
0  rows without txt after removing non-call rows


In [74]:
# Channel table: tells for stereo files (SOUNDFOC) which channel holds the
# meerkat recording. Turned into a lookup wav filename -> channel.
channel_tab = pd.read_csv(CHANNEL_INFO_PATH, sep="\t")
channel_dict = {
    wav_file: meerkat_channel
    for wav_file, meerkat_channel in zip(channel_tab.wavFile,
                                         channel_tab.meerkatChannel)
}

In [75]:
# Write the missing call txts

print("Generating ", missing_df.shape[0], " missing txts...")
statuses = [
    write_audio(wav_loc,       # location of long audio file
                start_s,       # start of call in that file (s)
                duration_s,    # duration of the call (s)
                channel_dict,  # dict containing channel info
                TXT_PATH,      # output directory where call txt will be saved
                call_id)       # will be filename of call txt
    for wav_loc, start_s, duration_s, call_id in zip(missing_df['wav_loc'],
                                                     missing_df['start_s'],
                                                     missing_df['duration_s'],
                                                     missing_df['callID_new'])
]

# One status string ("<wav_loc> : pass" / "<wav_loc> : failed_read") per row
x = pd.Series(statuses, index=missing_df.index, dtype=object)

# Do not let failed reads go unnoticed: report how many there were and which
# wav files were affected (each file listed once).
failed = x[x.str.endswith('failed_read')]
if len(failed) > 0:
    print(len(failed), " txts could not be written:")
    for status in sorted(set(failed)):
        print(status)
print("Done.")

Generating  0  missing txts...
Done.


## Check results

In [80]:
# Check results
# Print a summary of files that are now present as txt or not

print("Final status: ")

# Only entries ending in ".txt" count as call txts
txts = [name[:-4] for name in os.listdir(TXT_PATH) if name.endswith(".txt")]  # remove ".txt"
ids = list(labelfile.callID_new)

intersect = list(set(txts) & set(ids))
print(len(intersect), " matching callID and txt, while ", labelfile.shape[0], " expected.")

diff = list(set(ids) - set(txts))
print(len(diff), " ids with no matching txt")


# Why don't they have matches?

# Because of zero duration or not being a call?
invalid_ids = list(labelfile.loc[((labelfile.duration_s == 0) | (labelfile.isCall == 0)), :].callID_new)
# Set for the membership test below (a list would be scanned once per id)
invalid_id_set = set(invalid_ids)
invalid_diffs = [call_id for call_id in diff if call_id in invalid_id_set]
print(len(invalid_diffs), " of these non matches are due to zero duration or not being a call")

# Truly missing
truly_missing = list(set(diff) - set(invalid_diffs))
print(len(truly_missing), " truly missing txts")

diff2 = list(set(txts) - set(ids))
print(len(diff2), " txts with no matching id")

Final status: 
77790  matching callID and wav, while  82264  expected.
4474  ids with no matching txt
4474  of these non matches are due to zero duration or not being a call
0  truly missing txts
0  txts with no matching id
