## Interactive Video Playback from WhisperX Transcriptions

1. Create the conda environment: `conda create -n videonlp-env python=3.11 ipykernel pandas ipywidgets ipython`

In [None]:
%pip install moviepy
%pip install pygame

In [13]:
# Locations of the WhisperX transcript CSVs and of the videos
# (presumably folders, given the trailing slash — confirm).
# NOTE(review): the batch-processing cells below pass their own placeholder
# directories rather than reading these two names — confirm which is intended.
path_to_whisperx_csv = "data/data/videonlp/csv/"
path_to_video = "data/data/videonlp/video/"

In [None]:
from moviepy.editor import VideoFileClip




In [None]:
import os
import pandas as pd
from moviepy.editor import VideoFileClip
from IPython.display import display
import ipywidgets as widgets

# Function to load the sentence-level transcript CSV
def load_sentence_level_transcript(csv_path):
    """
    Read the transcript CSV at ``csv_path`` and return it as a DataFrame.
    """
    return pd.read_csv(csv_path)

# Function to play video from a specific timestamp
def play_video_at_timestamp(video_file, start_time, end_time=None):
    """
    Play the video segment corresponding to the transcript segment.

    Args:
        video_file: Path of the video, opened with ``VideoFileClip``.
        start_time: Segment start, in seconds from the beginning of the video.
        end_time: Segment end, in seconds. A falsy value (``None`` or ``0``)
            plays from ``start_time`` to the end of the video.

    Timestamps are checked against the clip duration: a start beyond the end
    of the video skips playback with a message, and an end beyond the video
    is clamped to the video duration. The clip is always closed afterwards.
    """
    clip = VideoFileClip(video_file)
    try:
        # Check if the start time is within the video duration
        if start_time > clip.duration:
            print(f"Start time {start_time:.2f}s is greater than video duration {clip.duration:.2f}s. Skipping segment.")
            return

        # If the end time is provided, check if it is within the video duration
        if end_time and end_time > clip.duration:
            print(f"End time {end_time:.2f}s exceeds video duration {clip.duration:.2f}s. Adjusting end time to video duration.")
            end_time = clip.duration

        # Extract and play the subclip
        subclip = clip.subclip(start_time, end_time) if end_time else clip.subclip(start_time)
        subclip.preview()
    finally:
        # Release the underlying file readers on every path (early return,
        # normal completion, or an error during preview); otherwise each
        # button click leaves an open reader behind.
        clip.close()

# Search transcript for multiple keywords and display video links
def search_transcript_and_show_video_links(keywords, transcript_df, video_path):
    """
    Show a label and a play button for every transcript row matching a keyword.

    Args:
        keywords: Iterable of literal search terms; a row matches when its
            'text' value contains any of them (case-insensitive).
        transcript_df: DataFrame with 'text', 'start', 'end' and 'speaker'
            columns. Rows whose 'text' is missing never match.
        video_path: Video file handed to play_video_at_timestamp on click.
    """
    import re  # local import so the function does not rely on another cell

    # Escape every keyword: unescaped, terms such as "c++" or "(" are parsed
    # as regex syntax and either raise re.error or match the wrong text.
    keywords_pattern = '|'.join(re.escape(keyword) for keyword in keywords)
    search_results = transcript_df[transcript_df['text'].str.contains(keywords_pattern, case=False, na=False)]
    for i, row in search_results.iterrows():
        text = row['text']
        start_time = row['start']
        end_time = row['end']
        speaker = row['speaker']
        result_label = widgets.Label(f"Speaker {speaker}: {text} (Start: {start_time})")
        # i is the row's index label in transcript_df, not a running counter
        play_button = widgets.Button(description=f"Play Segment {i+1}")
        # Default arguments freeze this row's times for the click handler
        play_button.on_click(lambda b, s=start_time, e=end_time: play_video_at_timestamp(video_path, s, e))
        display(widgets.HBox([result_label, play_button]))

# Function to find all files with a specific extension in a folder and its subfolders
def find_files_with_extension(base_dir, extensions):
    """
    Recursively collect files under ``base_dir`` whose names end with one of
    ``extensions``.

    Returns a dict mapping each file's base name (filename without its last
    extension) to its full path. When two files share a base name, the one
    visited later by os.walk replaces the earlier entry.
    """
    suffixes = tuple(extensions)
    located = {}
    for folder, _subdirs, names in os.walk(base_dir):
        for name in names:
            if not name.endswith(suffixes):
                continue
            stem, _ext = os.path.splitext(name)
            located[stem] = os.path.join(folder, name)
    return located

# Function to match CSV files with corresponding video files
def match_csv_and_videos(csv_dir, video_dir):
    """
    Pair every transcript CSV with the video that shares its base filename.

    Both directories are scanned recursively with find_files_with_extension
    ('.csv' for transcripts; '.mp4', '.mkv', '.avi' for videos). The base
    names found and each match or miss are printed as a progress log.

    Returns a list of (csv_path, video_path) tuples, one per matched name.
    """
    csv_files = find_files_with_extension(csv_dir, ['.csv'])
    video_files = find_files_with_extension(video_dir, ['.mp4', '.mkv', '.avi'])

    # Report what each scan discovered, one base name per line
    for heading, found in (
        (f"\nCSV Files Found in {csv_dir}:", csv_files),
        (f"\nVideo Files Found in {video_dir}:", video_files),
    ):
        print(heading)
        for f in found:
            print(f)

    matched_files = []

    # A pair exists only when the same base name appears on both sides
    for base_name, csv_path in csv_files.items():
        if base_name not in video_files:
            print(f"No match found for {base_name}")
            continue
        print(f"Match Found: {base_name}")
        matched_files.append((csv_path, video_files[base_name]))

    return matched_files

# Main function to load all CSV and video pairs and run the search and video interaction
def process_all_transcripts_and_videos(csv_dir, video_dir, keywords):
    """
    Run the keyword search over every matched transcript/video pair.

    Pairs come from match_csv_and_videos(csv_dir, video_dir). Each paired CSV
    is loaded; when ``keywords`` is non-empty the search widgets are shown for
    that pair, otherwise a notice is printed. Returns None.
    """
    matched_files = match_csv_and_videos(csv_dir, video_dir)

    if matched_files:
        print(f"Found {len(matched_files)} matching CSV and video files.")
    else:
        print(f"No matching CSV and video files found in {csv_dir} and {video_dir}.")
        return

    for csv_file, video_file in matched_files:
        print(f"\nProcessing: {csv_file} and {video_file}")
        transcript_df = load_sentence_level_transcript(csv_file)
        if not keywords:
            print(f"No keywords provided for search in {csv_file}.")
            continue
        search_transcript_and_show_video_links(keywords, transcript_df, video_file)

# Example usage: directories for CSV and video files and search keywords.
# Both directory values are placeholders; point them at real folders before
# running, since a folder that does not exist yields no CSV/video matches.
csv_directory = 'path_to_csv_folder'  # Replace with your CSV folder path
video_directory = 'path_to_video_folder'  # Replace with your video folder path

# Keywords to look for in each transcript's 'text' column
keywords_to_search = ['equity', 'students', 'teachers']

# Scan both folders, pair CSVs with videos by base filename, and display a
# play button for every transcript row that contains any of the keywords
process_all_transcripts_and_videos(csv_directory, video_directory, keywords_to_search)


In [None]:
import os
import pandas as pd
from moviepy.editor import VideoFileClip
from IPython.display import display
import ipywidgets as widgets

# Function to load the sentence-level transcript CSV
def load_sentence_level_transcript(csv_path):
    """
    Read the transcript CSV at ``csv_path`` and return it as a DataFrame.
    """
    return pd.read_csv(csv_path)

# Function to play video from a specific timestamp
def play_video_at_timestamp(video_file, start_time, end_time=None):
    """
    Preview the part of ``video_file`` between ``start_time`` and ``end_time``.

    Args:
        video_file: Path of the video, opened with ``VideoFileClip``.
        start_time: Where playback starts, passed to ``clip.subclip``.
        end_time: Where playback stops. A falsy value (``None`` or ``0``)
            plays from ``start_time`` to the end of the video.

    The clip is always closed afterwards, even when cutting or previewing
    the segment raises.
    """
    clip = VideoFileClip(video_file)
    try:
        if end_time:
            subclip = clip.subclip(start_time, end_time)
        else:
            subclip = clip.subclip(start_time)
        subclip.preview()
    finally:
        # Release the underlying file readers; otherwise every button click
        # leaves an open reader behind.
        clip.close()

# Search transcript for multiple keywords and display video links
def search_transcript_and_show_video_links(keywords, transcript_df, video_path):
    """
    Show a label and a play button for every transcript row matching a keyword.

    Args:
        keywords: Iterable of literal search terms; a row matches when its
            'text' value contains any of them (case-insensitive).
        transcript_df: DataFrame with 'text', 'start', 'end' and 'speaker'
            columns. Rows whose 'text' is missing never match.
        video_path: Video file handed to play_video_at_timestamp on click.
    """
    import re  # local import so the function does not rely on another cell

    # Escape every keyword: unescaped, terms such as "c++" or "(" are parsed
    # as regex syntax and either raise re.error or match the wrong text.
    keywords_pattern = '|'.join(re.escape(keyword) for keyword in keywords)
    search_results = transcript_df[transcript_df['text'].str.contains(keywords_pattern, case=False, na=False)]
    for i, row in search_results.iterrows():
        text = row['text']
        start_time = row['start']
        end_time = row['end']
        speaker = row['speaker']
        result_label = widgets.Label(f"Speaker {speaker}: {text} (Start: {start_time})")
        # i is the row's index label in transcript_df, not a running counter
        play_button = widgets.Button(description=f"Play Segment {i+1}")
        # Default arguments freeze this row's times for the click handler
        play_button.on_click(lambda b, s=start_time, e=end_time: play_video_at_timestamp(video_path, s, e))
        display(widgets.HBox([result_label, play_button]))

# Function to find all files with a specific extension in a folder and its subfolders
def find_files_with_extension(base_dir, extensions):
    """
    Recursively collect files under ``base_dir`` whose names end with one of
    ``extensions``.

    Returns a dict mapping each file's base name (filename without its last
    extension) to its full path. When two files share a base name, the one
    visited later by os.walk replaces the earlier entry.
    """
    suffixes = tuple(extensions)
    located = {}
    for folder, _subdirs, names in os.walk(base_dir):
        for name in names:
            if not name.endswith(suffixes):
                continue
            stem, _ext = os.path.splitext(name)
            located[stem] = os.path.join(folder, name)
    return located

# Function to match CSV files with corresponding video files
def match_csv_and_videos(csv_dir, video_dir):
    """
    Pair every transcript CSV with the video that shares its base filename.

    Both directories are scanned recursively with find_files_with_extension
    ('.csv' for transcripts; '.mp4', '.mkv', '.avi' for videos). The base
    names found and each match or miss are printed as a progress log.

    Returns a list of (csv_path, video_path) tuples, one per matched name.
    """
    csv_files = find_files_with_extension(csv_dir, ['.csv'])
    video_files = find_files_with_extension(video_dir, ['.mp4', '.mkv', '.avi'])

    # Report what each scan discovered, one base name per line
    for heading, found in (
        (f"\nCSV Files Found in {csv_dir}:", csv_files),
        (f"\nVideo Files Found in {video_dir}:", video_files),
    ):
        print(heading)
        for f in found:
            print(f)

    matched_files = []

    # A pair exists only when the same base name appears on both sides
    for base_name, csv_path in csv_files.items():
        if base_name not in video_files:
            print(f"No match found for {base_name}")
            continue
        print(f"Match Found: {base_name}")
        matched_files.append((csv_path, video_files[base_name]))

    return matched_files

# Main function to load all CSV and video pairs and run the search and video interaction
def process_all_transcripts_and_videos(csv_dir, video_dir, keywords):
    """
    Run the keyword search over every matched transcript/video pair.

    Pairs come from match_csv_and_videos(csv_dir, video_dir). Each paired CSV
    is loaded; when ``keywords`` is non-empty the search widgets are shown for
    that pair, otherwise a notice is printed. Returns None.
    """
    matched_files = match_csv_and_videos(csv_dir, video_dir)

    if matched_files:
        print(f"Found {len(matched_files)} matching CSV and video files.")
    else:
        print(f"No matching CSV and video files found in {csv_dir} and {video_dir}.")
        return

    for csv_file, video_file in matched_files:
        print(f"\nProcessing: {csv_file} and {video_file}")
        transcript_df = load_sentence_level_transcript(csv_file)
        if not keywords:
            print(f"No keywords provided for search in {csv_file}.")
            continue
        search_transcript_and_show_video_links(keywords, transcript_df, video_file)

# Example usage: directories for CSV and video files and search keywords.
# Both directory values are placeholders; point them at real folders before
# running, since a folder that does not exist yields no CSV/video matches.
csv_directory = 'path_to_csv_folder'  # Replace with your CSV folder path
video_directory = 'path_to_video_folder'  # Replace with your video folder path

# Keywords to look for in each transcript's 'text' column
keywords_to_search = ['equity', 'students', 'teachers']

# Scan both folders, pair CSVs with videos by base filename, and display a
# play button for every transcript row that contains any of the keywords
process_all_transcripts_and_videos(csv_directory, video_directory, keywords_to_search)


In [10]:
# One transcript CSV and its video, read by the single-keyword example cells
# below through csv_path / video_path.
# NOTE: this rebinds the names that an earlier cell set to folder paths.
path_to_whisperx_csv = "data/data/videonlp/20240718_INT_HS_Neil.csv"
path_to_video = "data/data/videonlp/20240718_INT_HS_Neil.mp4"

In [None]:
import pandas as pd
from moviepy.editor import VideoFileClip
import ipywidgets as widgets
from IPython.display import display

# Load WhisperX-generated CSV file
def load_sentence_level_transcript(csv_path):
    """
    Read the transcript CSV at ``csv_path`` and return it as a DataFrame.
    """
    return pd.read_csv(csv_path)

# Function to play video from a specific timestamp
def play_video_at_timestamp(video_file, start_time, end_time=None):
    """
    Preview the part of ``video_file`` between ``start_time`` and ``end_time``.

    Args:
        video_file: Path of the video, opened with ``VideoFileClip``.
        start_time: Where playback starts, passed to ``clip.subclip``.
        end_time: Where playback stops. A falsy value (``None`` or ``0``)
            plays from ``start_time`` to the end of the video.

    The clip is always closed afterwards, even when cutting or previewing
    the segment raises.
    """
    clip = VideoFileClip(video_file)
    try:
        if end_time:
            subclip = clip.subclip(start_time, end_time)
        else:
            subclip = clip.subclip(start_time)
        subclip.preview()
    finally:
        # Release the underlying file readers; otherwise every button click
        # leaves an open reader behind.
        clip.close()

# Search transcript for keyword and display video links
def search_transcript_and_show_video_links(keyword, transcript_df, video_path):
    """
    Show a label and a play button for every transcript row matching ``keyword``.

    Args:
        keyword: Pattern passed to ``Series.str.contains`` on the 'text'
            column (case-insensitive; rows with missing text never match).
        transcript_df: DataFrame with 'text', 'start', 'end' and 'speaker'
            columns.
        video_path: Video file handed to play_video_at_timestamp on click.
    """
    search_results = transcript_df[transcript_df['text'].str.contains(keyword, case=False, na=False)]
    for i, row in search_results.iterrows():
        text = row['text']
        start_time = row['start']
        end_time = row['end']
        speaker = row['speaker']
        result_label = widgets.Label(f"Speaker {speaker}: {text} (Start: {start_time})")
        # i is the row's index label in transcript_df, not a running counter
        play_button = widgets.Button(description=f"Play Segment {i+1}")

        # Bind this row's times as default arguments. A plain closure would
        # read start_time/end_time only when clicked, after the loop has
        # finished, so every button would play the last matching segment.
        def on_button_click(b, s=start_time, e=end_time):
            play_video_at_timestamp(video_path, s, e)

        play_button.on_click(on_button_click)
        display(widgets.HBox([result_label, play_button]))

# Example usage. path_to_whisperx_csv and path_to_video are not defined in
# this cell, so the cell that assigns them must be run first.
csv_path = path_to_whisperx_csv
video_path = path_to_video

# Load the transcript into a DataFrame
transcript_df = load_sentence_level_transcript(csv_path)

# Search the 'text' column (case-insensitive) and show one play button per hit
keyword = 'education'  # You can modify this to search for any keyword
search_transcript_and_show_video_links(keyword, transcript_df, video_path)


In [None]:
import pandas as pd
from moviepy.editor import VideoFileClip
import ipywidgets as widgets
from IPython.display import display

# Load WhisperX-generated CSV file
def load_sentence_level_transcript(csv_path):
    """
    Read the transcript CSV at ``csv_path`` and return it as a DataFrame.
    """
    return pd.read_csv(csv_path)

# Function to play video from a specific timestamp
def play_video_at_timestamp(video_file, start_time, end_time=None):
    """
    Preview the part of ``video_file`` between ``start_time`` and ``end_time``.

    Args:
        video_file: Path of the video, opened with ``VideoFileClip``.
        start_time: Where playback starts, passed to ``clip.subclip``.
        end_time: Where playback stops. A falsy value (``None`` or ``0``)
            plays from ``start_time`` to the end of the video.

    The clip is always closed afterwards, even when cutting or previewing
    the segment raises.
    """
    clip = VideoFileClip(video_file)
    try:
        if end_time:
            subclip = clip.subclip(start_time, end_time)
        else:
            subclip = clip.subclip(start_time)
        subclip.preview()
    finally:
        # Release the underlying file readers; otherwise every button click
        # leaves an open reader behind.
        clip.close()

# Search transcript for keyword and display video links
def search_transcript_and_show_video_links(keyword, transcript_df, video_path):
    """
    Show a label and a play button for every transcript row matching ``keyword``.

    ``keyword`` is passed to ``Series.str.contains`` on the 'text' column
    (case-insensitive; rows with missing text never match). Each button plays
    its own row's 'start'..'end' span of ``video_path``.
    """
    hit_mask = transcript_df['text'].str.contains(keyword, case=False, na=False)
    search_results = transcript_df[hit_mask]

    for i, row in search_results.iterrows():
        text, start_time, end_time, speaker = (
            row[column] for column in ('text', 'start', 'end', 'speaker')
        )

        # Default arguments pin this row's times to its own click handler
        def play_segment(b, s=start_time, e=end_time):
            play_video_at_timestamp(video_path, s, e)

        # One row in the output: the transcript text next to its play button
        result_label = widgets.Label(f"Speaker {speaker}: {text} (Start: {start_time})")
        play_button = widgets.Button(description=f"Play Segment {i+1}")
        play_button.on_click(play_segment)
        display(widgets.HBox([result_label, play_button]))

# Example usage. path_to_whisperx_csv and path_to_video are not defined in
# this cell, so the cell that assigns them must be run first.
csv_path = path_to_whisperx_csv  # Replace with the actual CSV path
video_path = path_to_video  # Replace with the actual video path

# Load the transcript into a DataFrame
transcript_df = load_sentence_level_transcript(csv_path)

# Search the 'text' column (case-insensitive) and show one play button per hit
keyword = 'education'  # You can modify this to search for any keyword
search_transcript_and_show_video_links(keyword, transcript_df, video_path)


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Display a single button with no click handler attached.
# NOTE(review): presumably a check that ipywidgets render in this notebook
# environment — confirm.
button = widgets.Button(description="Test Button")
display(button)
