# Extract audio and generate call txts

Script to extract the audio of each meerkat call, based on label files that give start and stop times in our standard format.

In [3]:
import pandas as pd
import os
import numpy as np 
import librosa 
import datetime
import sys
import sox
import audiofile as af

In [4]:
# Folder containing the label csvs indicating start, stop times etc.
LABELFILES_FOLDER = "/Volumes/EAS_shared/meerkat/working/processed/acoustic/total_synched_call_tables/"

# Path to folder that contains the audio recording files (long wavs);
# the files can be in subdirectories
AUDIOS_PATH = "/Volumes/EAS_shared/meerkat/archive/rawdata/MEERKAT_RAW_DATA"

# Info about the channel in stereo recordings (all soundfoc are stereo)
CHANNEL_INFO_PATH = "/Volumes/EAS_shared/meerkat/working/METADATA/soundfoc_channels.csv"

# Folder that contains the output call txt files
TXT_PATH = "txts/"
#TXT_PATH = '/Volumes/MaraMeerkat/additional_call_txts/'


# Create the output folder if it doesn't exist yet.
# makedirs also creates missing parent folders, and exist_ok=True avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(TXT_PATH, exist_ok=True)

In [5]:
def get_time(timestring):
    """
    Function that gets a datetime object from a timestring.
    timestring must match one of the given time_patterns.

    Parameters
    ----------
    timestring : String (or the number 0)
                 some string containing a time, formatted like
                 "01:02:30.555" (H:M:S.f) or "02:30.555" (M:S.f).
                 The number 0 is accepted and treated as time zero.

    Returns
    -------
    result : datetime object
             the time as datetime object

    Raises
    ------
    SystemExit
        With exit status 1 if timestring matches none of the patterns
        (or is not a string). The offending value is printed first.

    Example
    -------
    >>> dt = get_time("01:02:30.555")

    """
    time_patterns = ['%H:%M:%S.%f', '%M:%S.%f']

    if timestring == 0:
        return datetime.datetime.strptime('0:00:00.00', '%H:%M:%S.%f')

    for pattern in time_patterns:
        try:
            return datetime.datetime.strptime(timestring, pattern)
        except (ValueError, TypeError):
            # ValueError: the string does not match this pattern
            # TypeError: the value is not a string at all
            # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
            continue

    print("Date is not in expected format")
    print(timestring)
    # Non-zero status: this is a failure, not a successful exit
    sys.exit(1)


def get_s(dt):
    """
    Function that converts the time of day held in a datetime object to s

    Parameters
    ----------
    dt : datetime object
         only its hour, minute, second and microsecond are used

    Returns
    -------
    result : float
             time in s
    """
    fractional_s = dt.microsecond / 1000000
    minutes_in_s = dt.minute * 60
    hours_in_s = dt.hour * 60 * 60
    return fractional_s + dt.second + minutes_in_s + hours_in_s


def replace_multiple(string, list_of_chars, replacement):
    """
    Function that replaces multiple substrings in a string
    with other substrings.

    The replacements are applied one after the other, in the order of
    list_of_chars, so a later substring can also match text that was
    inserted by an earlier replacement.

    Parameters
    ----------
    string : String
             your input string
    list_of_chars: list of strings
                   List of substrings you want to have replaced
    replacement: string or list of strings
                 Substring or list of substrings you want to use as
                 replacement. If list, then it should be the same length as
                 list_of_chars to be matched by position.

    Returns
    -------
    result : String
             The modified string

    Example
    -------
    >>> replace_multiple("This is an example", ['s', 'a'], '!')
    'Thi! i! !n ex!mple'

    >>> replace_multiple("This is an example", ['s', 'a'], ['S', 'A'])
    'ThiS iS An exAmple'
    """
    # If all are to be replaced by the same string.
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be zipped character by character.
    if isinstance(replacement, str):
        replacement = [replacement] * len(list_of_chars)

    for ch, repl in zip(list_of_chars, replacement):
        # str.replace leaves the string untouched when ch does not occur
        string = string.replace(ch, repl)
    return string

def write_audio(wav_loc, start_s, duration_s, MEERKAT_CHANNEL, outdir, outname):
    """
    Function that extracts a chunk of audio data from a given wav file
    (read with librosa) and saves the chunk as txt file.
    If "SOUNDFOC" is in wav_loc, assumes that audio is stereo and
    looks up the channel with meerkat vocalizations in the MEERKAT_CHANNEL
    dictionary (channel 0 is used if the file is not listed there).
    All other files are loaded with librosa's default channel handling.
    Nothing is read or written if wav_loc is "NA" or duration_s is not
    greater than 0; the status is still "pass" in that case.
    If anything fails and audio cannot be written as txt, the status is
    "failed_read".

    Parameters
    ----------
    wav_loc : string
              path to wav file
    start_s : float
              offset in s
    duration_s: float
                duration in s
    MEERKAT_CHANNEL: Dict
                    Dictionary containing channel info (0/1) for some
                    wav files (basename)
                    e.g. 'HM_VLF206_SOUNDFOC_20170825_2.WAV' : 0
                         'HM_VLF206_SOUNDFOC_20170825_3.WAV' : 1
    outdir : String
             path to directory, where txt file should be saved
             (with or without trailing path separator)

    outname :  String
               filename of the resulting txt file (without ".txt")

    Returns
    -------
        String
        wav_loc input string plus " : pass" if nothing went wrong
        and " : failed_read" if reading or writing has failed

    """
    status = 'pass'

    if (wav_loc != "NA") and (duration_s > 0):
        try:
            if "SOUNDFOC" in wav_loc:
                # SOUNDFOCs are stereo: keep both channels, then pick the meerkat one
                data, rate = librosa.load(wav_loc, offset=start_s, duration=duration_s, sr=None, mono=False)
                channel = MEERKAT_CHANNEL.get(os.path.basename(wav_loc), 0)
                data = np.asfortranarray(data[channel, :])
            else:
                data, rate = librosa.load(wav_loc, offset=start_s, duration=duration_s, sr=None)

                # Indexing data[0] also raises for an empty chunk, which is
                # then reported as 'failed_read' by the handler below.
                if np.issubdtype(type(data[0]), np.integer):
                    data = data.astype('float32')

            # os.path.join instead of string concatenation, so that outdir
            # works with and without a trailing separator
            np.savetxt(os.path.join(outdir, outname + '.txt'), data, fmt='%.18f', header="sr:" + str(rate))
        except Exception:
            # Deliberately broad: any read/write problem is recorded in the
            # returned status instead of aborting the whole extraction run.
            status = 'failed_read'

    return (wav_loc + " : " + status)

In [7]:
def write_audio_af(wav_loc, start_s, duration_s, MEERKAT_CHANNEL, outdir, outname):
    """
    Function that extracts a chunk of audio data from a given wav file
    (read with audiofile) and saves the chunk as txt file.
    If the chunk has more than one channel, looks up the channel with
    meerkat vocalizations in the MEERKAT_CHANNEL dictionary (channel 0 is
    used if the file is not listed there). Single-channel chunks are
    written as they are.
    Nothing is read or written if wav_loc is "NA" or duration_s is not
    greater than 0; the status is still "pass" in that case.
    If anything fails and audio cannot be written as txt (including a
    chunk without any samples), the status is "failed_read".

    Parameters
    ----------
    wav_loc : string
              path to wav file
    start_s : float
              offset in s
    duration_s: float
                duration in s
    MEERKAT_CHANNEL: Dict
                    Dictionary containing channel info (0/1) for some
                    wav files (basename)
                    e.g. 'HM_VLF206_SOUNDFOC_20170825_2.WAV' : 0
                         'HM_VLF206_SOUNDFOC_20170825_3.WAV' : 1
    outdir : String
             path to directory, where txt file should be saved
             (with or without trailing path separator)

    outname :  String
               filename of the resulting txt file (without ".txt")

    Returns
    -------
        String
        wav_loc input string plus " : pass" if nothing went wrong
        and " : failed_read" if reading or writing has failed

    """
    status = 'pass'

    if (wav_loc != "NA") and (duration_s > 0):
        try:
            data, rate = af.read(wav_loc, offset=start_s, duration=duration_s)

            # Multi-channel chunks are 2-D; the code indexes them as
            # [channel, samples]. Testing ndim (not shape[0] == 2) keeps a
            # mono chunk of exactly two samples from being treated as stereo.
            if data.ndim > 1:
                channel = MEERKAT_CHANNEL.get(os.path.basename(wav_loc), 0)
                data = np.asfortranarray(data[channel, :])

            # An empty chunk is a failed extraction, not a call
            if data.size == 0:
                raise ValueError("no samples in requested chunk")

            if np.issubdtype(data.dtype, np.integer):
                data = data.astype('float32')

            # Written for mono AND multi-channel input. os.path.join makes
            # outdir work with and without a trailing separator.
            np.savetxt(os.path.join(outdir, outname + '.txt'), data, fmt='%.18f', header="sr:" + str(rate))

        except Exception:
            # Deliberately broad: any read/write problem is recorded in the
            # returned status instead of aborting the whole extraction run.
            status = 'failed_read'

    return (wav_loc + " : " + status)

# Preparations
## Read in data

In [8]:
# Read in all labelfiles
# Hidden entries (e.g. ".DS_Store") and subdirectories are skipped because
# they cannot be parsed as label tables; sorting makes the row order of the
# combined table reproducible between runs.
labelfiles_list = sorted(
    entry for entry in os.listdir(LABELFILES_FOLDER)
    if not entry.startswith(".")
    and os.path.isfile(os.path.join(LABELFILES_FOLDER, entry))
)

# Each labelfile is a tab-separated table, read as ISO-8859-1
df_list = [
    pd.read_csv(os.path.join(LABELFILES_FOLDER, entry), sep="\t", encoding="ISO-8859-1")
    for entry in labelfiles_list
]

# concatenate them to have one big labelfile containing all calls
labelfile = pd.concat(df_list, axis=0, sort=True)
labelfile.reset_index(inplace=True, drop=True)

labelfile.shape

(82264, 22)

## Find location of the wav files on server

In [None]:
# Add a column containing the path to the actual wav file on the server
# that we need in order to extract that call
# (we have the wav filename, but not its location on the server)

In [10]:
# Unique wav filenames referenced in the labelfile, in sorted order
wavs_we_need = sorted(set(labelfile.wavFileName))

# Full paths of all wav files found anywhere below AUDIOS_PATH
# (names starting with "." are left out)
listOfFiles = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(AUDIOS_PATH)
    for filename in filenames
    if filename.endswith((".wav", ".WAV")) and filename[0] != "."
]

In [11]:
# Map every needed wav filename to a path on the server ("NA" if none is found)
wav_dict = {}
no_wav_path = []

for wav in wavs_we_need:
    # A path is a candidate if the wav filename occurs anywhere in it
    candidates = [path for path in listOfFiles if wav in path]

    if len(candidates) == 0:
        # remember the filenames for which no path was found
        no_wav_path.append(wav)
        wav_dict[wav] = "NA"
    else:
        # choose the first match in case there are multiple
        wav_dict[wav] = candidates[0]

# Chosen paths in the same order as wavs_we_need
wav_matches = [wav_dict[wav] for wav in wavs_we_need]

# print the missing
for unfound_wav in no_wav_path:
    print("Couldn't find ", unfound_wav, " in AUDIOS_PATH")

## Modifications in labelfile

In [14]:
# Prepare the labelfile so that each row holds everything needed to write the audio

# 1) Path of the wav file that each row refers to
labelfile['wav_loc'] = [wav_dict[wav_name] for wav_name in labelfile.wavFileName]

# 2) Start and duration of each call in seconds
#    (can be fed directly into the audio loading function)
labelfile['start_s'] = labelfile['t0File'].map(lambda t0: get_s(get_time(t0)))
labelfile['duration_s'] = labelfile['duration'].map(lambda dur: get_s(get_time(dur)))

# 3) File-name friendly version of callID: the characters below make it
#    difficult to use callID as a file name, so each one becomes "_"
to_be_replaced = ["/", " ", ":", "."]
replace_with = "_"

new_callID = []
for call_id in labelfile.callID:
    new_callID.append(replace_multiple(call_id, to_be_replaced, replace_with))
labelfile['callID_new'] = new_callID

# 4) Drop rows that are exact duplicates of an earlier row
labelfile = labelfile.drop_duplicates()

In [15]:
# Sanity check: (rows, columns) of the labelfile at this point
labelfile.shape

(82264, 26)

## Save labelfile

In [16]:
# Save modified, complete labelfile as a tab-separated file in the current
# working directory. The commented-out line is the alternative location on
# the shared server.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed column
# by default — confirm that whoever reads this file back expects that.

#labelfile.to_csv("/Volumes/EAS_shared/meerkat/working/processed/acoustic/extract_calls/labelfile.csv", sep="\t")
labelfile.to_csv("labelfile.csv", sep="\t")

In [25]:
#labelfile = pd.read_csv("/Volumes/EAS_shared/meerkat/working/processed/acoustic/extract_calls/labelfile.csv", sep="\t")

In [26]:
# (rows, columns) of the labelfile currently in memory
labelfile.shape

(81784, 28)

# Generate call txts

In [None]:
# Keep only the rows that are flagged as calls (isCall equals 1)
labelfile = labelfile[labelfile["isCall"] == 1]

In [87]:
# Check what txt files are already present
# (only ".txt" entries are considered, so that stripping the last four
# characters really removes the extension)
txts = [x[:-4] for x in os.listdir(TXT_PATH) if x.endswith(".txt")]
ids = list(labelfile.callID_new)

# Which ones are in ids, but not in txts? (i.e. txt have not been generated yet)
# Membership is tested against a set: scanning the txts list once per id
# is needlessly quadratic with tens of thousands of calls.
existing_txts = set(txts)
missing = [x for x in ids if x not in existing_txts]

missing_df = labelfile.loc[labelfile.callID_new.isin(missing),:]
print(missing_df.shape)

(8133, 28)


In [None]:
# Read in the channel table: for stereo files (SOUNDFOC) it states which
# channel is the meerkat recording
channel_tab = pd.read_csv(CHANNEL_INFO_PATH, sep="\t")

# Lookup: wav file name -> meerkat channel
channel_dict = {
    wav_file: meerkat_channel
    for wav_file, meerkat_channel in zip(channel_tab.wavFile, channel_tab.meerkatChannel)
}

In [91]:
# Write the missing call txts
#x = missing_df.apply(lambda row: write_audio(row['wav_loc'], # location of long audio file
#                                             row['start_s'], # start of call in that file
#                                             row['duration_s'], # end of call in that file
#                                             channel_dict, # dict containtin channel info
#                                             TXT_PATH,  # output directory where call txt will be saved
#                                             row['callID_new']), # will be filename of call txt
#                     axis=1)

In [None]:
# Write the missing call txts FASTER
def _write_call_txt(row):
    """Extract the call described by one labelfile row and save it as txt."""
    return write_audio_af(
        row['wav_loc'],     # location of long audio file
        row['start_s'],     # start of call in that file
        row['duration_s'],  # duration of call
        channel_dict,       # dict containing channel info
        TXT_PATH,           # output directory where call txt will be saved
        row['callID_new'],  # will be filename of call txt
    )


# One status message per row of missing_df
x = missing_df.apply(_write_call_txt, axis=1)

## Generate and save log file

In [53]:
# One row per distinct "<wav_loc> : <status>" message returned by the writer,
# together with how often that message occurred
statstab = pd.DataFrame(pd.Series(x).value_counts())

# Split on the LAST " : " only, so that a recording path which itself
# contains " : " is not cut apart
split_messages = [message.rsplit(" : ", 1) for message in statstab.index]
f_name = [parts[0] for parts in split_messages]
f_status = [parts[1] for parts in split_messages]
statstab['status'] = f_status # either pass or fail (was returned by write_audio function)
statstab['filename'] = f_name # long audio recording file for which error occurred

# Name the counts column 'count' if it is labelled 0;
# rename leaves the table unchanged when there is no column 0
statstab = statstab.rename(columns={0: 'count'})
statstab.reset_index(inplace=True, drop=True)

#statstab.to_csv("generate_call_txt_checkfile.csv", sep="\t")

## Check results

In [None]:
# Check results
# Print a summary of files that were generated and missing files

txts = [x[:-4] for x in os.listdir(TXT_PATH)] # remove ".txt"
ids = list(labelfile.callID_new)

intersect = list(set(txts) & set(ids))
print(len(intersect), " matching callID and wav, while ",labelfile.shape[0]," expected.")


# Everything that occurs in only one of the two collections
# (ids without txt, and txts without id); sorted so that the listing
# below comes out in the same order on every run
diff = sorted(set(ids) ^ set(txts))
print(len(diff), " non matches")


# Why don't they have matches?

# Because of zero duration?
# (a set, because it is searched once per non match)
zero_duration_ids = set(labelfile.loc[labelfile.duration_s==0,:].callID_new)
zero_duration_diffs = [x for x in diff if x in zero_duration_ids]
print(len(zero_duration_diffs), " of these non matches are due to zero duration")

other_diffs = [x for x in diff if x not in zero_duration_ids]
print("the others are:")
for d in other_diffs:
    print(d)