In [1]:
import base64
import io
import subprocess
from pathlib import Path

import ffmpeg
import numpy as np
import requests
import soundfile as sf

In [None]:
sample_rate = 16000
video_path = Path(
    "SnapInsta.to_AQP6pQCnuGr0gNz8bAkvNgHiq7qFRMTy59yCjbavpPz65WHft-sZdKnE11g2mOrIa-Lpg3_hXJISrqtelf8IhPc_3xQNVHkwY9NBewU.mp4"
)
# video_path = Path("AISummit.mp4")
am_url = "http://localhost:8080/predictions/acoustic_model"
decoder_url = "http://localhost:8080/predictions/asr_decoder"

In [None]:
def convert_wav_content_to_nparray(wav_content):
    """Read wav file and extract data

    Take wav file content and extract wav data.
    """
    wav_content = io.BytesIO(wav_content)
    data, sr = sf.read(wav_content, dtype=np.int16)
    return data


def convert_np_array_to_wav_file_bytes(np_array, fs):
    in_memory_file = io.BytesIO()
    sf.write(in_memory_file, np_array, fs, format="WAV")
    in_memory_file.seek(0)
    return in_memory_file.read()


def vad(av_content):
    args = (
        ffmpeg.input("pipe:")
        .output(
            "pipe:",
            format="wav",
            acodec="pcm_s16le",
            ac=1,
            ar=sample_rate,
        )
        .get_args()
    )
    ffmpeg_process = subprocess.Popen(
        ["ffmpeg"] + args,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    wav_content = ffmpeg_process.communicate(input=av_content)[0]
    ffmpeg_process.kill()
    wav_array = convert_wav_content_to_nparray(wav_content)
    duration = len(wav_array) / sample_rate
    return wav_array, duration

In [4]:
av_content = video_path.read_bytes()
wav_array, duration = vad(av_content)
print(duration)

0.0


In [5]:
from streamsad import SAD

sad = SAD()
segments = sad(wav_array)

# Print the detected segments
print(segments)

InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Invalid rank for input: input Got: 2 Expected: 3 Please fix either the inputs/outputs or the model.

In [None]:
for segment in segments:
    # crop audio segment
    start_sample = int(segment["start"] * sample_rate)
    end_sample = int(segment["end"] * sample_rate)
    segment_wav_array = wav_array[start_sample:end_sample]
    data = convert_np_array_to_wav_file_bytes(segment_wav_array, sample_rate)
    # ASR
    am_result = requests.get(am_url, data=data)
    asr_result = requests.get(decoder_url, data=am_result.content)
    segment["text"] = asr_result.text

In [None]:
for segment in segments:
    print(segment)

In [None]:
def _format_timestamp(seconds: float) -> str:
    """Format seconds to SRT time: HH:MM:SS,mmm"""
    total_ms = int(round(seconds * 1000))
    ms = total_ms % 1000
    s = total_ms // 1000
    hours = s // 3600
    minutes = (s % 3600) // 60
    seconds = s % 60
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"


# ensure segments are in order
sorted_segments = sorted(segments, key=lambda seg: seg.get("start", 0.0))

srt_lines = []
idx = 1
for seg in sorted_segments:
    text = seg.get("text", "").strip()
    if not text:
        # skip empty segments (no subtitle text)
        continue
    start_ts = _format_timestamp(seg["start"])
    end_ts = _format_timestamp(seg["end"])
    srt_lines.append(f"{idx}")
    srt_lines.append(f"{start_ts} --> {end_ts}")
    srt_lines.append(text)
    srt_lines.append("")  # blank line between entries
    idx += 1

srt_content = "\n".join(srt_lines)

srt_path = video_path.with_suffix(".srt")
srt_path.write_text(srt_content, encoding="utf-8")

print(f"Wrote {idx-1} subtitles to {srt_path}")