In [3]:
import importlib
import sys
import tqdm
from src.orca_hls_utils.s3_utils import get_all_folders, verify_playlist
import pandas as pd


In [None]:

# Reload the module to ensure the latest version is used
# importlib.reload(sys.modules['src.orca_hls_utils.s3_utils'])

# Define the S3 bucket and the prefix for the hydrophone (e.g. rpi_orcasound_lab)
BUCKET = "audio-orcasound-net"
PREFIX = "rpi_bush_point/hls/"

# Get all timestamp folders using s3_utils.get_all_folders
folders = get_all_folders(BUCKET, PREFIX)
print(f"Found {len(folders)} folders under prefix: {PREFIX}")
df_time = pd.DataFrame(folders, columns=['folder_name'])
df_time['timestamp'] = pd.to_datetime(df_time['folder_name'], unit='s')
# Calculate the difference in seconds between consecutive timestamps
df_time['time_diff_seconds'] = df_time['timestamp'].diff().dt.total_seconds()
display(df_time[df_time['folder_name'] > 1725996619])

Found 7034 folders under prefix: rpi_bush_point/hls/


  df_time['timestamp'] = pd.to_datetime(df_time['folder_name'], unit='s')


Unnamed: 0,folder_name,timestamp,time_diff_seconds
6934,1737446417,2025-01-21 08:00:17,86399.0
6935,1737532818,2025-01-22 08:00:18,86401.0
6936,1737619217,2025-01-23 08:00:17,86399.0
6937,1737705617,2025-01-24 08:00:17,86400.0
6938,1737792017,2025-01-25 08:00:17,86400.0
...,...,...,...
7029,1745564417,2025-04-25 07:00:17,86399.0
7030,1745650818,2025-04-26 07:00:18,86401.0
7031,1745737218,2025-04-27 07:00:18,86400.0
7032,1745823617,2025-04-28 07:00:17,86399.0


In [3]:
filtered_df_time = df_time[(df_time['timestamp'] >= '2022-12-01') & (df_time['timestamp'] <= '2023-05-31')]
display(filtered_df_time)


Unnamed: 0,folder_name,timestamp,time_diff_seconds
13560,1669861819,2022-12-01 02:30:19,21600.0
13561,1669883432,2022-12-01 08:30:32,21613.0
13562,1669905020,2022-12-01 14:30:20,21588.0
13563,1669926621,2022-12-01 20:30:21,21601.0
13564,1669948231,2022-12-02 02:30:31,21610.0
...,...,...,...
14130,1685367018,2023-05-29 13:30:18,21599.0
14131,1685388618,2023-05-29 19:30:18,21600.0
14132,1685431818,2023-05-30 07:30:18,43200.0
14133,1685453418,2023-05-30 13:30:18,21600.0


In [4]:
report = []
if not folders:
    print(f"No folders found under prefix: {PREFIX}")
else:
    for folder in tqdm.tqdm(folders[13560:14134]):  
        folder_prefix = PREFIX + folder + "/"  # e.g. "rpi_orcasound_lab/hls/1541061134/"
        res = verify_playlist(BUCKET, folder_prefix)
        if res:
            report.append({'folder': folder} | res)

100%|██████████| 574/574 [27:53<00:00,  2.92s/it]  


In [6]:
import pandas as pd
df = pd.DataFrame(report)
df['timeStamp'] = pd.to_datetime(df['folder'], unit='s')
df['extra_files'] = df['extra_files'].apply(lambda x: sorted(x))
df['len_extra'] = df['extra_files'].apply(lambda x: len(x))
df['len_missing'] = df['missing_files'].apply(lambda x: len(x))
# Calculate the difference in seconds between consecutive timestamps
df['time_diff_seconds'] = df['timeStamp'].diff().dt.total_seconds()

df.to_csv("playlist_report.csv", index=False)

  df['timeStamp'] = pd.to_datetime(df['folder'], unit='s')


In [7]:
display(df)

Unnamed: 0,folder,missing_files,extra_files,length,timeStamp,len_extra,len_missing,time_diff_seconds
0,1669905020,[2158],[],2159,2022-12-01 14:30:20,0,1,
1,1670185820,[2158],[],2159,2022-12-04 20:30:20,0,1,280800.0
2,1670272220,[2158],[],2159,2022-12-05 20:30:20,0,1,86400.0
3,1670293820,[2158],[],2159,2022-12-06 02:30:20,0,1,21600.0
4,1670769020,[2158],[],2159,2022-12-11 14:30:20,0,1,475200.0
...,...,...,...,...,...,...,...,...
84,1681543039,[000],[],2,2023-04-15 07:17:19,0,1,399600.0
85,1683257412,"[000, 005, 002, 003, 006, 004, 001]",[016],16,2023-05-05 03:30:12,1,7,1714373.0
86,1683257413,"[030, 186, 059, 133, 207, 006, 041, 202, 038, ...",[],219,2023-05-05 03:30:13,0,165,1.0
87,1684595161,"[001, 000]",[],158,2023-05-20 15:06:01,0,2,1337748.0


In [52]:
df['extra_files'].iloc[0]

['351', '353', '355', '356', '358', '352', '354', '357']

In [57]:
df['len_extra'].value_counts()

len_extra
0    569
7      3
8      1
Name: count, dtype: int64

In [None]:
import unittest
from orca_hls_utils.DateRangeHLSStream_bkp import DateRangeHLSStream


class TestDateRangeHLSStreamEdgeCases(unittest.TestCase):
    def setUp(self):
        self.stream_base = 'https://s3-us-west-2.amazonaws.com/streaming-orcasound-net/rpi_orcasound_lab'
        self.polling_interval = 60
        self.unix_time_in_data = 1745910018  # Epoch time for '2025-04-29 07:00'
        self.unix_time_out_data = 1741444800  # Epoch time for '2025-05-05T12:00'
        self.unix_time_outer_data = 1741526400  # Epoch time for '2025-05-06T12:00'
        self.wav_dir = '/tmp/wav_dir'

    def test_no_data_out_to_out(self):
        stream = DateRangeHLSStream(self.stream_base, self.polling_interval, self.unix_time_out_data, self.unix_time_outer_data, self.wav_dir)
        # stream.current_folder_index = 0
        # stream.current_clip_start_time = 1100  # will make segment_start_index = 1
        result = stream.get_next_clip()
        self.assertEqual(result, (None, None, None))
        self.assertEqual(stream.current_folder_index, 1)
        self.assertFalse(stream.is_end_of_stream)

    def test_no_data_in_to_out(self):
        stream = DateRangeHLSStream(self.stream_base, self.polling_interval, self.unix_time_in_data, self.unix_time_out_data, self.wav_dir)
        # stream.current_folder_index = 1  # last folder
        result = stream.get_next_clip()
        self.assertEqual(result, (None, None, None))
        self.assertTrue(stream.is_end_of_stream)

In [2]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_no_data_in_to_out (__main__.TestDateRangeHLSStreamEdgeCases) ... 

Found 15825 folders in all for hydrophone
0
1725996619
1745910018
Found 1 folders in date range


ERROR
test_no_data_out_to_out (__main__.TestDateRangeHLSStreamEdgeCases) ... 

Found 15825 folders in all for hydrophone
0
1725996619
1741444800
Found 1 folders in date range


ERROR

ERROR: test_no_data_in_to_out (__main__.TestDateRangeHLSStreamEdgeCases)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\stlp\AppData\Local\Temp\ipykernel_620\4118854793.py", line 26, in test_no_data_in_to_out
    result = stream.get_next_clip()
  File "c:\Users\stlp\OneDrive - UW\Desktop\Work\OrcaSound Project\orca-hls-utils\.conda\lib\site-packages\orca_hls_utils\DateRangeHLSStream.py", line 157, in get_next_clip
    self.current_clip_start_time = self.valid_folders[
IndexError: list index out of range

ERROR: test_no_data_out_to_out (__main__.TestDateRangeHLSStreamEdgeCases)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\stlp\AppData\Local\Temp\ipykernel_620\4118854793.py", line 18, in test_no_data_out_to_out
    result = stream.get_next_clip()
  File "c:\Users\stlp\OneDrive - UW\Desktop\Work\OrcaSound Project\orca-hls-utils\.c

<unittest.main.TestProgram at 0x21d0bcfde20>