In [None]:
# Tip: append ?fo=json to an item's URL to retrieve that item's metadata as JSON

In [None]:
import io
import pandas as pd                     # for reading, manipulating, and displaying data
import requests
from helpers_loc import get_file_stats

# Base URL of the data package to explore.
# The original cell first assigned 'https://data.labs.loc.gov/jukebox/' and then
# immediately overwrote it, so only the value below was ever used; the dead
# assignment has been dropped.
# NOTE(review): later cells filter on the subject "Opera" and decode MP3 audio,
# which looks like it targets the jukebox package — confirm which package is intended.
DATA_URL = 'https://data.labs.loc.gov/veterans-history-project-collection/'

# Download the file manifest
file_manifest_url = f'{DATA_URL}manifest.json'
response = requests.get(file_manifest_url, timeout=60)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
response_json = response.json()

# File information: pair the column names in "cols" with each entry of "rows"
# so every row becomes a dict keyed by column name
files = [dict(zip(response_json["cols"], row)) for row in response_json["rows"]]

# Compute the file statistics with the project helper; the result is turned
# into a Pandas DataFrame and displayed in a later cell
stats = get_file_stats(files)

In [None]:
# Tabulate the file statistics; leaving the frame as the cell's last
# expression renders it with the notebook's rich display
df = pd.DataFrame(data=stats)
df

In [None]:
# Download the data package's metadata file
metadata_url = f'{DATA_URL}metadata.json'
response = requests.get(metadata_url, timeout=60)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
data = response.json()
print(f'Loaded metadata file with {len(data):,} entries.')

In [None]:
# Load the metadata entries into a DataFrame
df_metadata = pd.DataFrame(data)
# List the metadata fields. The original printed the columns of `df` (the file
# stats table) here, which is unrelated to the frame created above.
# str() keeps the join safe if any column label is not a string.
print(', '.join(map(str, df_metadata.columns)))

In [None]:
# Give every entry of an item's 'Subjects' value its own row so that
# subjects can be compared one at a time
df_metadata_by_subject = df_metadata.explode(column='Subjects')
# Boolean mask selecting the rows whose subject is exactly "Opera"
is_opera = df_metadata_by_subject['Subjects'] == 'Opera'
df_opera = df_metadata_by_subject[is_opera]
opera_count = len(df_opera)
print(f'Found {opera_count:,} items with subject "Opera"')

In [None]:
# Tabulate the manifest's file information
df_files = pd.DataFrame(data=files)
# Inner join: keep only the opera rows whose 'Id' matches the 'item_id'
# of an entry in the file information
opera_set_with_audio = df_opera.merge(
    df_files,
    how='inner',
    left_on='Id',
    right_on='item_id',
)
opera_audio_count = len(opera_set_with_audio)
print(f'Found {opera_audio_count:,} opera items with audio files')

In [None]:
# Preview the first five joined rows
opera_set_with_audio.head(n=5)

In [None]:
# object_key contains the path to the audio file
import io

import matplotlib.pyplot as plt         # for displaying data
import numpy as np
from pydub import AudioSegment          # for reading and manipulating audio files
from scipy import signal                # for visualizing audio

# Work with the first row of the joined table and turn its object_key
# into a full https URL
item = opera_set_with_audio.iloc[0]
file_url = 'https://{}'.format(item['object_key'])

In [None]:
# Download the audio to memory
response = requests.get(file_url, timeout=60)
# Fail fast on an HTTP error so an error page is never handed to the audio
# decoder as if it were audio data
response.raise_for_status()
# Wrap the raw bytes in a file-like object for later reading
audio_filestream = io.BytesIO(response.content)

In [None]:
# Read the downloaded stream as mp3 and normalize it to a fixed format so the
# spectrogram below is computed with a known sample rate
sample_rate = 48000   # target frame rate
sample_width = 1      # bytes per sample
channels = 1          # mono
audio_filestream.seek(0)  # Ensure stream is at the beginning
sound = AudioSegment.from_mp3(audio_filestream)
sound = sound.set_channels(channels)
sound = sound.set_sample_width(sample_width)
sound = sound.set_frame_rate(sample_rate)

# Get the first 10 seconds (pydub slices are indexed in milliseconds)
ten_seconds = 10 * 1000
first_10_seconds = sound[:ten_seconds]

# Get the audio samples as a numpy array
samples = first_10_seconds.get_array_of_samples()
samples = np.array(samples)

# Compute the spectrogram
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)

# Bins with exactly zero power would make np.log return -inf and emit a
# divide-by-zero RuntimeWarning, so add a tiny positive floor first
log_floor = np.finfo(spectrogram.dtype).eps
log_spectrogram = np.log(spectrogram + log_floor)

# Visualize the results
fig, ax = plt.subplots()
ax.pcolormesh(times, frequencies, log_spectrogram)
ax.set_title('Spectrogram of the first 10 seconds')
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Time [sec]')
plt.show()