# Pull Orcasound hydrophone data from Amazon Web Services (AWS) S3 buckets

In [2]:
from datetime import datetime, timedelta
import pytz
import os
import boto3
from botocore.config import Config
from botocore import UNSIGNED
import pandas as pd
import ffmpeg
import glob
from pathlib import Path
import shutil
import paramiko
import json
import IPython.display as ipd
import librosa

ModuleNotFoundError: No module named 'librosa'

## Choose your Amazon Web Services (AWS) bucket and hydrophone node

In [None]:
# Set up the S3 client with unsigned configuration (requests are not signed)
s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket_name = 'audio-orcasound-net'
prefix_options = [
    'rpi_bush_point/hls/',
    'rpi_mast_center/hls/',
    'rpi_north_sjc/hls/',
    'rpi_orcasound_lab/hls/',
    'rpi_point_robinson/hls/',
    'rpi_port_townsend/hls/',
    'rpi_sunset_bay/hls/',
]

# Key prefix of the node to query (here one of the entries in prefix_options)
prefix = 'rpi_sunset_bay/hls/'

# directories: full pseudo-folder prefixes found directly under `prefix`
# num_directories / buckets: the numeric name of each of those pseudo-folders
# files: keys of objects stored directly under `prefix`
directories, num_directories = [], []
files, buckets = [], []

# A single list_objects_v2 call returns a limited number of entries, so every
# page is walked with a paginator; this keeps all four lists complete and
# consistent with each other instead of truncating some of them.
paginator = s3_client.get_paginator('list_objects_v2')
operation_parameters = {
    'Bucket': bucket_name,
    'Prefix': prefix,
    'Delimiter': '/',
}

for page in paginator.paginate(**operation_parameters):
    # Collect directory (pseudo-folder) names
    for prefix_info in page.get('CommonPrefixes', []):
        dir_name = prefix_info['Prefix']
        directories.append(dir_name)
        # The directory's own name is the path component before the trailing '/'
        dir_label = dir_name.split('/')[-2]
        try:
            dir_number = int(dir_label)
        except ValueError:
            # Not a numeric directory name, so it cannot be compared with a
            # unix timestamp later on; keep it in `directories` only.
            continue
        num_directories.append(dir_number)
        buckets.append(dir_number)

    # Collect file names
    for obj in page.get('Contents', []):
        files.append(obj['Key'])


In [None]:
# Takes a date and time in PST/PDT and returns the bucket (timestamp-named directory) that contains that time

def choose_bucket(date, dur):
    """Pick the bucket and live-segment range covering a requested time window.

    Args:
        date: Start of the window as America/Los_Angeles wall-clock time,
            formatted as '%Y-%m-%d_%H.%M.%S' (e.g. '2024-12-14_17.20.30').
        dur: Length of the window in seconds.

    Returns:
        A tuple ``(use_bucket, start_time, end_time, start_live, end_live)``:
        the largest value in the module-level ``buckets`` list that is below
        the start's unix time, the timezone-aware start and end datetimes, and
        the first and last live-segment numbers computed from the offsets
        into that bucket.

    Raises:
        ValueError: If ``date`` does not match the expected format, or if no
            entry of ``buckets`` lies before the requested start time.
    """
    pst = pytz.timezone('America/Los_Angeles')

    # Localize the parsed start, then derive the end from the start *instant*.
    # Rebuilding the end from the start's year/month/day would put it on the
    # wrong day whenever the window crosses midnight.
    start_time = pst.localize(datetime.strptime(date, '%Y-%m-%d_%H.%M.%S'))
    # normalize() corrects the UTC offset if the window crosses a DST change
    end_time = pst.normalize(start_time + timedelta(seconds=dur))
    print('Starting time:', start_time, '\nEnding time:', end_time)

    # Identify the bucket from the unix start time: the max of all buckets
    # that lie below the start time.
    start_unix_time = int(start_time.timestamp())
    end_unix_time = int(end_time.timestamp())
    earlier_buckets = [val for val in buckets if val < start_unix_time]
    if not earlier_buckets:
        raise ValueError(
            f'No bucket starts before the requested start time {start_time}'
        )
    use_bucket = int(max(earlier_buckets))

    # Convert second offsets into segment numbers; the division by 10
    # presumably reflects ~10 s per live segment (TODO confirm).
    # NOTE(review): an earlier comment mentioned a ~20 s buffer before and
    # after the window, but no padding is applied here.
    start_live = round((start_unix_time - use_bucket)/10)
    end_live = round((end_unix_time - use_bucket)/10)

    return use_bucket, start_time, end_time, start_live, end_live


def pull_ts_files(use_bucket, start_time, start_live, end_live, node_prefix=None):
    """Download a range of live .ts segments from S3 into a local folder.

    Args:
        use_bucket: Name of the directory under the node prefix to pull from.
        start_time: Start of the chunk; only used (via ``str()``) to name the
            local folder.
        start_live: Number of the first segment to download (inclusive).
        end_live: Number of the last segment to download (inclusive).
        node_prefix: Key prefix of the node to pull from. Defaults to the
            module-level ``prefix`` so downloads come from the same node that
            was listed, rather than from a hard-coded one.

    Returns:
        The path of the local folder holding the downloaded segments and a
        ``metadata.json`` mapping each downloaded file name to its
        ``LastModified`` value.
    """
    if node_prefix is None:
        node_prefix = prefix
    # Tolerate a prefix given without its trailing slash
    if node_prefix and not node_prefix.endswith('/'):
        node_prefix += '/'

    # NOTE(review): when start_time is a datetime, str() of it contains ':'
    # and spaces, which may not be valid in folder names on every platform.
    loc_fol = f'{use_bucket}_{start_time}'
    os.makedirs(loc_fol, exist_ok=True)

    metadata = {}

    # pulls the live files identified and saves them in a folder named with
    # the bucket and the start time of the chunk
    for s in range(start_live, end_live + 1):
        aws_filename = f'live{s}.ts'
        s3_key = f'{node_prefix}{use_bucket}/{aws_filename}'
        download_path = os.path.join(loc_fol, aws_filename)
        try:
            s3_client.download_file(bucket_name, s3_key, download_path)
            response = s3_client.head_object(Bucket=bucket_name, Key=s3_key)
            metadata[aws_filename] = str(response['LastModified'])
        except Exception as e:
            # Best effort: report the failing segment and carry on with the rest
            print(f"An error occurred in downloading the audio files from AWS: {e}")

    # Save metadata to a JSON file
    metadata_file = os.path.join(loc_fol, 'metadata.json')
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=4)

    return loc_fol


def _segment_sort_key(path):
    """Return a sort key that orders segment files by their trailing number."""
    stem = os.path.splitext(os.path.basename(path))[0]
    digits = stem[len(stem.rstrip('0123456789')):]
    # Files without a trailing number sort first; the stem breaks ties
    return (int(digits) if digits else -1, stem)


def concate_ts_files(local_folder, chunk_name):
    """Concatenate every .ts file in a folder into ``ts/<chunk_name>.ts``.

    Segments are joined in numeric order of the number at the end of their
    file name. A plain string sort would place ``live10.ts`` before
    ``live9.ts`` and scramble the output.

    Args:
        local_folder: Folder containing the .ts segments. It is deleted,
            with all of its contents, once the output has been written.
        chunk_name: Base name (without extension) of the output file.
    """
    # Do not depend on the caller having created the output folder
    os.makedirs("ts", exist_ok=True)
    output_file = os.path.join("ts", f"{chunk_name}.ts")
    ts_files = sorted(glob.glob(os.path.join(local_folder, "*.ts")),
                      key=_segment_sort_key)
    with open(output_file, 'wb') as outfile:
        for ts_file in ts_files:
            with open(ts_file, 'rb') as infile:
                # Stream the copy instead of loading a whole segment in memory
                shutil.copyfileobj(infile, outfile)
    shutil.rmtree(local_folder)


def convert_ts_to_wav(chunk):
    """Convert ``ts/<chunk>.ts`` to ``wavs/<chunk>.wav`` and load the audio.

    The wav is written as 16-bit PCM, 2 channels, 44100 Hz.

    Args:
        chunk: Base name (without extension) shared by the input and output.

    Returns:
        The samples returned by ``librosa.load`` for the new wav (loaded with
        ``sr=None``), or ``None`` when the input is missing, the output
        already exists, or the conversion fails.
    """
    in_ts = f"ts/{chunk}.ts"
    out_wav = os.path.join("wavs", f"{chunk}.wav")

    if not os.path.isfile(in_ts):
        print(f"File not found: {in_ts}. Skipping...")
        return

    if os.path.isfile(out_wav):
        print(f"Output file already exists: {out_wav}. Skipping...")
        return

    try:
        ffmpeg.input(in_ts).output(out_wav, acodec='pcm_s16le', ac=2, ar='44100').run(quiet=True, overwrite_output=True)
    except ffmpeg.Error as e:
        print(f"Error converting {in_ts}: {e}")
        # quiet=True captures ffmpeg's stderr; show it, as the exception text
        # alone does not carry the details
        stderr = getattr(e, 'stderr', None)
        if stderr:
            print(stderr.decode(errors='replace'))
        # out_wav did not exist before this call (checked above), so anything
        # there now is a partial result that would block a later retry
        if os.path.isfile(out_wav):
            os.remove(out_wav)
        # Do not try to load a file that was never (fully) written
        return None

    y, sr = librosa.load(out_wav, sr=None)
    print(f"y is {y} and sr is {sr}")

    return y


def cleanup_directories(*dir_names):
    """Delete each named directory together with everything inside it.

    Names that do not exist on disk are silently skipped; a line is printed
    for every directory that was removed.

    Args:
        *dir_names: Paths of the directories to remove.
    """
    for name in dir_names:
        target = Path(name)
        if not target.exists():
            continue
        shutil.rmtree(target)
        print(f"Deleted {name} and contents")

In [None]:
# Enter the clip's start time in PST/PDT, as listed on Orcasound's online
# interface, and its length.
year, month, day = "2024", "12", "14"
start_hour, start_min, start_sec = "17", "20", "30"
duration = 30
date = "_".join([
    "-".join([year, month, day]),
    ".".join([start_hour, start_min, start_sec]),
])

# Working folders: "ts" for the concatenated stream, "wavs" for the converted audio
os.makedirs("ts", exist_ok=True)
os.makedirs("wavs", exist_ok=True)

# Locate the bucket and segment range, then download the segments
use_bucket, start_time, end_time, start_live, end_live = choose_bucket(date, duration)
local_folder = pull_ts_files(use_bucket, start_time, start_live, end_live)

# Name the chunk after its start date/time and end time, then build the audio
chunk_name = f"{start_time.strftime('%Y-%m-%d_%H.%M.%S')}_to_{end_time.strftime('%H.%M.%S')}"
concate_ts_files(local_folder, chunk_name)
wavform = convert_ts_to_wav(chunk_name)

# Only the intermediate "ts" folder is removed
cleanup_directories("ts")