In [8]:
import os
import subprocess
from pyannote.audio import Pipeline
from whisper import load_model
import json


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import configparser

# Location of the INI-style settings file. It must contain an [HF] section;
# perform_diarization reads hf_keys['hf_key'] from it.
# Previously used location: N:\Open_LLM\spch_dirz\hf_conf.config
config_path = r'N:\DataFlo\df_config.config'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read configuration file: {config_path}")

hf_keys = config['HF']

In [11]:
# Step 1: Download Videos
def download_videos(base_url):
    """
    Download the video(s) at ``base_url`` into the local ``videos`` directory with yt-dlp.

    Args:
        base_url (str): URL passed to yt-dlp as the download target.

    Returns:
        str: The relative output directory name, ``"videos"``.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
        FileNotFoundError: If the ``yt-dlp`` executable cannot be found.
    """
    print("Downloading videos...")
    output_dir = "videos"
    os.makedirs(output_dir, exist_ok=True)

    # check=True makes a failed download raise instead of falling through to
    # the success message below.
    subprocess.run(
        ["yt-dlp", "-o", f"{output_dir}/%(title)s.%(ext)s", base_url],
        check=True,
    )
    print("Videos downloaded successfully.")
    return output_dir

# Step 2: Extract Audio from Videos
def extract_audio(video_dir, audio_dir=r"N:\Open_LLM\spch_dirz\audio"):
    """
    Extract the audio track of every video in ``video_dir`` to a ``.wav`` file via ffmpeg.

    Files whose name ends in ``.mp4``, ``.mkv`` or ``.webm`` (case-insensitive)
    are processed in sorted order; everything else is ignored.

    Args:
        video_dir (str): Directory containing the downloaded videos.
        audio_dir (str): Directory that receives the ``.wav`` files; created if
            missing. Defaults to the location this notebook has always used.

    Returns:
        str: ``audio_dir``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    print("Extracting audio...")
    os.makedirs(audio_dir, exist_ok=True)

    for video_file in sorted(os.listdir(video_dir)):
        if not video_file.lower().endswith((".mp4", ".mkv", ".webm")):
            continue
        video_path = os.path.join(video_dir, video_file)
        audio_path = os.path.join(audio_dir, os.path.splitext(video_file)[0] + ".wav")
        print(video_path)
        print(audio_path)
        # -y overwrites an existing .wav; without it ffmpeg stops at an
        # interactive "Overwrite?" prompt when the cell is re-run.
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-q:a", "0", "-map", "a", audio_path],
            check=True,
        )
    print("Audio extraction completed.")
    return audio_dir


# Step 3: Generate Transcript for Each Audio File
def generate_transcripts(audio_dir):
    """
    Transcribe every ``.wav`` file in ``audio_dir`` with Whisper's ``base`` model.

    Each result is also dumped as JSON into the relative ``transcripts``
    directory (created if missing), named after the audio file.

    Args:
        audio_dir (str): Directory containing the ``.wav`` files.

    Returns:
        dict: Maps each audio file name to the object returned by
        ``model.transcribe`` for that file.
    """
    print("Generating transcripts...")
    out_dir = "transcripts"
    os.makedirs(out_dir, exist_ok=True)

    whisper_model = load_model("base")  # loaded once, reused for every file
    results_by_file = {}

    wav_names = (name for name in os.listdir(audio_dir) if name.endswith(".wav"))
    for wav_name in wav_names:
        transcription = whisper_model.transcribe(os.path.join(audio_dir, wav_name))

        stem, _ext = os.path.splitext(wav_name)
        with open(os.path.join(out_dir, stem + ".json"), "w") as handle:
            json.dump(transcription, handle)

        results_by_file[wav_name] = transcription

    print("Transcripts generated successfully.")
    return results_by_file


# Step 4: Perform Speech Diarization
def perform_diarization(audio_dir):
    """
    Run the pyannote speaker-diarization pipeline on every ``.wav`` file in ``audio_dir``.

    The pipeline is loaded with the token stored under ``hf_keys['hf_key']`` and
    is called with ``min_speakers=2`` and ``max_speakers=15``. The speaker turns
    of each file are also written as indented JSON into the relative
    ``diarization`` directory (created if missing).

    Args:
        audio_dir (str): Directory containing the ``.wav`` files.

    Returns:
        dict: Maps each audio file name to a list of
        ``{"start", "end", "speaker"}`` dicts, one per speaker turn.
    """
    print("Performing speech diarization...")
    out_dir = "diarization"
    os.makedirs(out_dir, exist_ok=True)

    diarizer = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_keys['hf_key'])
    turns_by_file = {}

    for wav_name in os.listdir(audio_dir):
        if not wav_name.endswith(".wav"):
            continue

        annotation = diarizer(os.path.join(audio_dir, wav_name), min_speakers=2, max_speakers=15)
        turns = [
            {"start": segment.start, "end": segment.end, "speaker": label}
            for segment, _, label in annotation.itertracks(yield_label=True)
        ]
        turns_by_file[wav_name] = turns

        # Persist this file's turns next to the in-memory result.
        json_name = os.path.splitext(wav_name)[0] + ".json"
        with open(os.path.join(out_dir, json_name), "w") as handle:
            json.dump(turns, handle, indent=4)

    print("Speech diarization completed.")
    return turns_by_file


In [None]:


# Step 5: Combine Timestamps, Speaker Names, and Transcripts
def combine_results(transcripts, diarizations):
    """
    Attach transcript text to each speaker turn and save the result as JSON.

    For every audio file in ``transcripts``, each of its diarization turns is
    paired with the text of all transcript segments whose time range overlaps
    that turn. A turn that overlaps no segment gets an empty string.

    Args:
        transcripts (dict): Maps audio file name to a transcription result
            holding a ``"segments"`` list of dicts with ``"start"``, ``"end"``
            and ``"text"`` keys (the shape returned by generate_transcripts).
        diarizations (dict): Maps audio file name to a list of dicts with
            ``"start"``, ``"end"`` and ``"speaker"`` keys (the shape returned by
            perform_diarization). Files missing here produce an empty list.

    Returns:
        dict: Maps audio file name to a list of dicts with ``"start_time"``,
        ``"end_time"``, ``"speaker"`` and ``"transcript"`` keys. The same
        structure is written to ``combined_results.json``.
    """
    print("Combining results...")
    combined_results = {}

    for audio_file, transcript in transcripts.items():
        # Tolerate a result without segments rather than failing on the lookup.
        segments = transcript.get("segments", []) if isinstance(transcript, dict) else []
        combined_data = []

        for turn in diarizations.get(audio_file, []):
            start_time = turn["start"]
            end_time = turn["end"]

            # Two intervals overlap when each one starts before the other ends.
            overlapping_text = [
                segment["text"].strip()
                for segment in segments
                if segment["start"] < end_time and segment["end"] > start_time
            ]

            combined_data.append({
                "start_time": start_time,
                "end_time": end_time,
                "speaker": turn["speaker"],
                "transcript": " ".join(overlapping_text),
            })

        combined_results[audio_file] = combined_data

    # Save combined results to a JSON file
    with open("combined_results.json", "w", encoding="utf-8") as f:
        json.dump(combined_results, f, indent=4)

    print("Results combined and saved successfully.")
    return combined_results



def combine_transcript_and_diarization(transcript, diarization, output_path="combined_results.json"):
    """
    Combine transcript and diarization results into a unified structure.

    Each transcript segment is assigned to the diarization segment it overlaps
    the most (the first one wins on a tie). Transcript segments that overlap no
    diarization segment are dropped. Consecutive segments of the same speaker
    are then merged into one entry whose text is joined with single spaces.

    Args:
        transcript (list): List of transcript segments in the format [(start, end, text), ...].
        diarization (list): List of diarization segments in the format [(speaker, start, end), ...].
        output_path (str | None): JSON file the merged result is written to.
            Defaults to ``"combined_results.json"``; pass ``None`` to skip
            writing a file.

    Returns:
        list: Combined results in the format [(speaker, start, end, text), ...].
    """
    # Step 1: label every transcript segment with its best-overlapping speaker.
    labelled = []
    for t_start, t_end, t_text in transcript:
        matched = False
        best_speaker = None
        max_overlap = 0

        for speaker, d_start, d_end in diarization:
            # Length of the shared interval; it is negative when the two do not
            # touch, which can never beat the initial max_overlap of 0.
            overlap = min(t_end, d_end) - max(t_start, d_start)
            if overlap > max_overlap:
                max_overlap = overlap
                best_speaker = speaker
                matched = True

        if matched:
            labelled.append((best_speaker, t_start, t_end, t_text))

    # Step 2: merge consecutive lines of the same speaker.
    runs = []  # each run is [speaker, start, end, [texts]]
    for speaker, start, end, text in labelled:
        if runs and runs[-1][0] == speaker:
            runs[-1][2] = end
            runs[-1][3].append(text)
        else:
            runs.append([speaker, start, end, [text]])

    merged_results = [
        (speaker, start, end, " ".join(texts))
        for speaker, start, end, texts in runs
    ]

    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(merged_results, f, indent=4)

    return merged_results
  


In [4]:
import requests
from bs4 import BeautifulSoup

def list_vdos(url="https://sg001-harmony.sliq.net/00293/Harmony/en/View/RecentEnded/20250224/-1", timeout=30):
    """
    Collect one playlist (``.m3u8``) URL per video linked from a listing page.

    The listing page is scanned for anchors whose ``href`` contains
    ``/PowerBrowser/PowerBrowserV2/``. Each linked page is then fetched and its
    ``<script>`` tags searched for ``.m3u8``. The first match on each page is
    cut down to the text running from its last ``https`` to ``.m3u8``.

    Args:
        url (str): Listing page to scan. Defaults to the page this notebook was
            written against.
        timeout (float): Per-request timeout in seconds, so a stalled server
            cannot hang the cell.

    Returns:
        list[str]: One playlist URL for every video page that exposes one.
        Pages without any ``.m3u8`` reference are reported and skipped.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
    """
    from urllib.parse import urljoin  # stdlib; imported here so the cell is self-contained

    # Fetch the listing page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract video page links; urljoin resolves them against the listing URL
    # instead of assuming a fixed host.
    video_links = [
        urljoin(url, a_tag["href"])
        for a_tag in soup.find_all("a", href=True)
        if "/PowerBrowser/PowerBrowserV2/" in a_tag["href"]
    ]

    full_downloadable_urls = []
    for video_page_url in video_links:
        page = requests.get(video_page_url, timeout=timeout)
        page_soup = BeautifulSoup(page.text, "html.parser")

        # Keep the script text up to and including the first ".m3u8"
        m3u8_links = [
            tag.text.split(".m3u8")[0] + ".m3u8"
            for tag in page_soup.find_all("script")
            if ".m3u8" in tag.text
        ]

        if not m3u8_links:
            # Indexing [0] on an empty list used to abort the whole scan.
            print(f"No .m3u8 link found on {video_page_url}; skipping.")
            continue

        # Drop everything before the last "https" so only the URL remains
        full_downloadable_urls.append("https" + m3u8_links[0].split("https")[-1])

    return full_downloadable_urls

In [8]:
# Scrape the listing page and collect the playlist (.m3u8) URLs of its videos.
vdo_list = list_vdos()

In [9]:
# Display the collected playlist URLs.
vdo_list

['https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/House%20-%20Consumer%20and%20Public%20Affairs_2025-03-04-16.30.13_76811_38.mp4/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/House%20-%20Appropriations%20and%20Finance_2025-03-04-17.01.11_76814_14.mp4/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-live/house/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/House%20-%20Labor%2C%20Veterans%20and%20Military%20Affairs_2025-03-04-17.00.50_76813_34.mp4/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/Test%20Meeting_2025-03-04-16.08.54_76810_46.mp4/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/Senate%20Chamber_2025-03-04-11.51.13_76808_10.mp4/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/Test%20Meeting_2025-03-04-16.07.20_76809_46.mp4/playlist.m3u8',
 'https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/House%2

In [10]:
# Download a single recording (the "Test Meeting" playlist URL) with yt-dlp;
# res receives the output directory name returned by download_videos.
res = download_videos("https://sg002-live.sliq.net/00293-vod-2/_definst_/2025/03/04/Test%20Meeting_2025-03-04-16.08.54_76810_46.mp4/playlist.m3u8")

Downloading videos...
Videos downloaded successfully.


In [5]:
# Convert every .mp4/.mkv/.webm file in the videos folder to .wav with ffmpeg;
# the cell displays the audio directory that extract_audio returns.
extract_audio(r"N:\Open_LLM\spch_dirz\videos")

Extracting audio...
N:\Open_LLM\spch_dirz\videos\playlist.mp4
N:\Open_LLM\spch_dirz\audio\playlist.wav
Audio extraction completed.


'N:\\Open_LLM\\spch_dirz\\audio'

In [None]:
# Transcribe every .wav file in the audio folder with Whisper;
# trs maps each audio file name to its transcription result.
trs = generate_transcripts(r"N:\Open_LLM\spch_dirz\audio")

Generating transcripts...




Transcripts generated successfully.


{'playlist.wav': {'text': ' I will try and put these, to keep going I think this will stand. System Sinne Testing iPhone testing mic you Good evening great and in part of your course! week The you this period is My Test. The testing-my.',
  'segments': [{'id': 0,
    'seek': 0,
    'start': 0.0,
    'end': 2.8000000000000003,
    'text': ' I will try and put these,',
    'tokens': [50364, 286, 486, 853, 293, 829, 613, 11, 50504],
    'temperature': 1.0,
    'avg_logprob': -4.440010472347862,
    'compression_ratio': 1.1555555555555554,
    'no_speech_prob': 0.18331240117549896},
   {'id': 1,
    'seek': 0,
    'start': 2.8000000000000003,
    'end': 7.96,
    'text': ' to keep going I think this will stand.',
    'tokens': [50504, 281, 1066, 516, 286, 519, 341, 486, 1463, 13, 50762],
    'temperature': 1.0,
    'avg_logprob': -4.440010472347862,
    'compression_ratio': 1.1555555555555554,
    'no_speech_prob': 0.18331240117549896},
   {'id': 2,
    'seek': 0,
    'start': 9.8,
    'en

In [6]:
# Run speaker diarization on every .wav file in the audio folder; the cell
# displays the returned dict (audio file name -> list of speaker turns).
perform_diarization(r"N:\Open_LLM\spch_dirz\audio")

Performing speech diarization...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\novil\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b\pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cpu. Bad things might happen unless you revert torch to 1.x.


INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder


Speech diarization completed.


{'playlist.wav': [{'start': 19.28534375,
   'end': 20.06159375,
   'speaker': 'SPEAKER_00'},
  {'start': 66.38346875, 'end': 67.17659375000001, 'speaker': 'SPEAKER_00'},
  {'start': 113.48159375, 'end': 114.25784375, 'speaker': 'SPEAKER_00'},
  {'start': 160.57971875, 'end': 161.37284375000002, 'speaker': 'SPEAKER_00'},
  {'start': 177.69096875000002, 'end': 178.85534375, 'speaker': 'SPEAKER_01'},
  {'start': 207.69471875000002,
   'end': 208.47096875000003,
   'speaker': 'SPEAKER_00'},
  {'start': 224.80596875, 'end': 225.95346875, 'speaker': 'SPEAKER_01'}],
 'test_meeting.wav': [{'start': 19.28534375,
   'end': 20.06159375,
   'speaker': 'SPEAKER_00'},
  {'start': 66.38346875, 'end': 67.17659375000001, 'speaker': 'SPEAKER_00'},
  {'start': 113.48159375, 'end': 114.25784375, 'speaker': 'SPEAKER_00'},
  {'start': 160.57971875, 'end': 161.37284375000002, 'speaker': 'SPEAKER_00'},
  {'start': 177.69096875000002, 'end': 178.85534375, 'speaker': 'SPEAKER_01'},
  {'start': 207.6947187500000

In [1]:
import json

# Load the saved speaker-labelled transcript from its JSON file.
# The trailing bare expression displays the loaded object as the cell output.

with open(r"N:\Open_LLM\spch_dirz\senet_fin_transcript.json", 'r') as fp:
    transcript = json.load(fp)
transcript

[['SPEAKER_02',
  0.0,
  26.0,
  " We don't have the capability of a scanner.  So you can't scan and put the men in it.  If you send it to us a day before,  I don't even know if there's a scanner in the building.  I think there may be one downstairs in the clerk's office.  And so putting stuff electronically up,  the amendments is something  the Mexico has fallen behind with technology driven access."],
 ['SPEAKER_01',
  26.0,
  65.64,
  " Mr. Chair, I think it's actually a rule that we've passed  a law may be even represented in the Queen saying  that all amendments would actually have to be posted online.  This may be the only,  maybe I'll do respect that we don't post these amendments,  these documents, and I just think we should.  I think it's public information,  and I just think nothing can be more important.  A $10 billion budget from a policy standpoint  and everything contained in it.  So I think, Shana, I mean, with all the other committees  and legislature amendments, becaus

In [2]:

# Convert transcript to SRT format.
# Each transcript entry is [speaker, start_seconds, end_seconds, text]. SRT cue
# timings must be written as HH:MM:SS,mmm (comma before the milliseconds);
# plain seconds such as "26.000 --> 65.640" are not valid.
def _seconds_to_srt_timestamp(seconds):
    """Format a time given in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so rounding can never produce "60" seconds.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


srt_cues = []
for i, entry in enumerate(transcript):
    start_time = entry[1]
    end_time = entry[2]
    text = entry[3]
    srt_cues.append(
        f"{i+1}\n"
        f"{_seconds_to_srt_timestamp(start_time)} --> {_seconds_to_srt_timestamp(end_time)}\n"
        f"{text.strip()}\n\n"
    )
# Join once instead of growing a string inside the loop.
srt_content = "".join(srt_cues)

# Save SRT content to file (explicit encoding so the result does not depend on
# the platform default).
with open(r"N:\Open_LLM\spch_dirz\senet_fin_transcript.srt", "w", encoding="utf-8") as f:
    f.write(srt_content)

In [3]:
import subprocess

def _format_srt_timestamp(seconds):
    """Return ``seconds`` formatted as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Round to whole milliseconds first and split afterwards. Formatting the
    # seconds field on its own turns a value such as 59.9996 into "60.000",
    # which is not a valid timestamp.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def convert_to_srt(transcript, output_srt_path):
    """
    Write a speaker-labelled transcript to ``output_srt_path`` as an SRT subtitle file.

    Args:
        transcript (list): Entries of the form ``(speaker, start, end, text)``
            with ``start`` and ``end`` given in seconds.
        output_srt_path (str): Destination file; written as UTF-8 and
            overwritten if it already exists.

    Each entry becomes one numbered cue (starting at 1) whose text is
    ``"<speaker>: <text>"`` with surrounding whitespace stripped from ``text``.
    """
    with open(output_srt_path, "w", encoding="utf-8") as f:
        for idx, (speaker, start_time, end_time, text) in enumerate(transcript, start=1):
            start_hms = _format_srt_timestamp(start_time)
            end_hms = _format_srt_timestamp(end_time)
            # Write subtitle entry
            f.write(f"{idx}\n{start_hms} --> {end_hms}\n{speaker}: {text.strip()}\n\n")

# Reload the speaker-labelled transcript from its JSON file so this cell does
# not rely on the `transcript` variable left behind by an earlier cell.
with open(r"N:\Open_LLM\spch_dirz\senet_fin_transcript.json", 'r') as fp:
    transcript = json.load(fp)



# Generate the SRT file in the videos folder.
convert_to_srt(transcript, r"N:\Open_LLM\spch_dirz\videos\senet_fin_transcript2.srt")


In [None]:

# Play video with subtitles using VLC.
# NOTE(review): "your_video.mp4" and "subtitles.srt" look like placeholder
# names; replace them with the real video and SRT paths before running.
# The call returns only after the launched vlc process exits.
video_path = "your_video.mp4"
subprocess.run([
    "vlc",
    "--sub-file", "subtitles.srt",
    video_path
])

In [5]:
def validate_srt(srt_path):
    """
    Check that the file at ``srt_path`` follows the basic SRT cue layout.

    Every cue must consist of a sequence number (counting up from 1), a timing
    line of the form ``HH:MM:SS,mmm --> HH:MM:SS,mmm``, any number of text
    lines, and a terminating blank line. The first problem found is printed
    and ends the check.

    Args:
        srt_path (str): Path of the SRT file, read as UTF-8.

    Returns:
        bool: True if the whole file passes, False at the first violation.
    """
    import re  # stdlib; imported here so the cell is self-contained

    # Digits only, with minutes and seconds limited to 00-59.
    timestamp = r"\d{2}:[0-5]\d:[0-5]\d,\d{3}"
    timing_line = re.compile(f"{timestamp} --> {timestamp}")

    with open(srt_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    index = 1
    i = 0
    while i < len(lines):
        # Check sequence number
        if lines[i].strip() != str(index):
            print(f"Error: Invalid sequence number at line {i+1}. Expected {index}, got {lines[i].strip()}")
            return False
        i += 1

        # Check timestamps; the file may end right after a sequence number.
        if i >= len(lines) or "-->" not in lines[i]:
            print(f"Error: Missing timestamp at line {i+1}.")
            return False
        if not timing_line.fullmatch(lines[i].strip()):
            print(f"Error: Invalid timestamp format at line {i+1}: {lines[i].strip()}")
            return False
        i += 1

        # Skip subtitle text lines
        while i < len(lines) and lines[i].strip() != "":
            i += 1
        i += 1  # Skip empty line
        index += 1

    print("SRT file is valid!")
    return True


In [6]:

# Example usage: check the SRT file that the convert_to_srt cell wrote.
validate_srt(r"N:\Open_LLM\spch_dirz\videos\senet_fin_transcript2.srt")

SRT file is valid!


True