In [8]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import librosa
import csv
import io
from tqdm import tqdm

# Configure GPU memory growth
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print("GPU is available and configured")
else:
    print("No GPU detected. Running on CPU")

def process_audio_chunk(model, audio_chunk, class_names, threshold=0.5):
    """
    Process a single chunk of audio data
    """
    with tf.device('/GPU:0'):
        scores, embeddings, spectrogram = model(audio_chunk)
    
    # Find laugh-related classes
    laugh_indices = [i for i, name in enumerate(class_names) if 'laugh' in name.lower()]
    
    # Get timestamps where laughs were detected
    timestamps = []
    frame_duration = 0.025  # YAMNet uses 25ms frames
    
    # Process scores
    with tf.device('/GPU:0'):
        scores_np = scores.numpy()
        
    for frame_idx in range(len(scores_np)):
        if any(scores_np[frame_idx, idx] > threshold for idx in laugh_indices):
            time = frame_idx * frame_duration
            timestamps.append({
                'time': time,
                'confidence': float(max(scores_np[frame_idx, laugh_indices]))
            })
    
    return timestamps

# Print GPU information
print("\nGPU Devices:", tf.config.list_physical_devices('GPU'))

def detect_laughs_yamnet(audio_path, chunk_duration=10, threshold=0.5):
    """
    Detect laughs in audio using YAMNet, processing the audio in chunks
    """
    # Load the model
    with tf.device('/GPU:0'):
        model = hub.load('https://tfhub.dev/google/yamnet/1')
    
    # Get class names once
    class_map_path = model.class_map_path().numpy()
    class_map_csv = io.StringIO(tf.io.read_file(class_map_path).numpy().decode('utf-8'))
    class_names = [display_name for (class_index, mid, display_name) in csv.reader(class_map_csv)]
    
    # Load audio file
    audio, sr = librosa.load(audio_path, sr=16000)  # YAMNet expects 16kHz
    
    # Calculate chunk size in samples
    chunk_size = int(chunk_duration * sr)
    
    # Process audio in chunks
    all_timestamps = []
    for chunk_start in tqdm(range(0, len(audio), chunk_size)):
        # Get chunk
        chunk = audio[chunk_start:chunk_start + chunk_size]
        
        # If the chunk is too short (last chunk), pad it
        if len(chunk) < chunk_size:
            chunk = np.pad(chunk, (0, chunk_size - len(chunk)))
        
        # Convert to tensor
        chunk_tensor = tf.convert_to_tensor(chunk, dtype=tf.float32)
        
        # Process chunk
        chunk_timestamps = process_audio_chunk(model, chunk_tensor, class_names, threshold)
        
        # Adjust timestamps to account for chunk position
        chunk_start_time = chunk_start / sr
        for ts in chunk_timestamps:
            ts['time'] += chunk_start_time
        
        all_timestamps.extend(chunk_timestamps)
    
    # Sort timestamps by time
    all_timestamps.sort(key=lambda x: x['time'])
    
    return all_timestamps

GPU is available and configured

GPU Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [9]:
# Test with your audio file
audio_file = "/mnt/d/Projects/kick_videos/audio.wav"  # Your audio file path
laugh_segments = detect_laughs_yamnet(audio_file, chunk_duration=2, threshold=0.3)  # Process in 10-second chunks

# Print results
for segment in laugh_segments:
    hours = int(segment['time'] // 3600)
    minutes = int((segment['time'] % 3600) // 60)
    seconds = segment['time'] % 60
    print(f"Laugh detected at {hours:02d}:{minutes:02d}:{seconds:05.2f} (confidence: {segment['confidence']:.2%})")

100%|██████████| 8484/8484 [00:31<00:00, 266.21it/s]

Laugh detected at 00:57:28.00 (confidence: 45.69%)
Laugh detected at 01:17:58.00 (confidence: 32.13%)
Laugh detected at 01:53:42.00 (confidence: 40.21%)
Laugh detected at 01:53:42.02 (confidence: 39.74%)
Laugh detected at 02:21:52.02 (confidence: 30.46%)
Laugh detected at 02:21:52.05 (confidence: 48.99%)
Laugh detected at 02:49:30.02 (confidence: 30.48%)
Laugh detected at 03:27:46.05 (confidence: 39.57%)
Laugh detected at 03:28:42.08 (confidence: 50.64%)
Laugh detected at 03:28:52.05 (confidence: 50.91%)
Laugh detected at 03:40:36.02 (confidence: 30.42%)
Laugh detected at 03:40:36.05 (confidence: 31.60%)
Laugh detected at 03:40:40.05 (confidence: 68.04%)
Laugh detected at 03:42:36.00 (confidence: 43.88%)
Laugh detected at 03:49:46.05 (confidence: 33.60%)
Laugh detected at 04:08:42.05 (confidence: 39.56%)
Laugh detected at 04:09:58.00 (confidence: 39.76%)
Laugh detected at 04:10:00.00 (confidence: 39.65%)
Laugh detected at 04:20:22.05 (confidence: 30.15%)
Laugh detected at 04:21:18.02 (




In [2]:
# Test with your audio file
audio_file = "/mnt/d/Projects/kick_videos/audio.wav"  # Replace with your audio file path


In [3]:
laugh_segments = detect_laughs_yamnet(audio_file)

I0000 00:00:1748517953.864046   11754 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6
2025-05-29 13:27:00.494403: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:501] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.53GiB (rounded to 2714880000)requested by op StatefulPartitionedCall/yamnet_frames/tf_op_layer_GatherV2/GatherV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2025-05-29 13:27:00.494441: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1058] BFCAllocator dump for GPU_0_bfc
2025-05-29 13:27:00.494450: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1065] Bin (256): 	Total Chunks: 18, Chunks in use: 18. 4.5KiB allocated for chunks. 4.5KiB

ResourceExhaustedError: Graph execution error:

Detected at node yamnet_frames/tf_op_layer_GatherV2/GatherV2 defined at (most recent call last):
<stack traces unavailable>
OOM when allocating tensor with shape[1696800,5,80] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node yamnet_frames/tf_op_layer_GatherV2/GatherV2}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_restored_function_body_15206]

In [4]:
audio, sr = librosa.load(audio_file, sr=16000)  # YAMNet expects 16kHz
audio = tf.convert_to_tensor(audio, dtype=tf.float32)  # Convert to TensorFlow tensor
    

In [5]:
audio

<tf.Tensor: shape=(271482197,), dtype=float32, numpy=array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)>

In [3]:
with tf.device('/GPU:0'):
    model = hub.load('https://tfhub.dev/google/yamnet/1')

class_map_path = model.class_map_path().numpy()

In [4]:
class_map_path

b'/tmp/tfhub_modules/9616fd04ec2360621642ef9455b84f4b668e219e/assets/yamnet_class_map.csv'

In [6]:
import io
class_map_csv = io.StringIO(tf.io.read_file(class_map_path).numpy().decode('utf-8'))

In [7]:
class_map_csv

<_io.StringIO at 0x777612608310>

In [9]:
import csv
class_names = [display_name for (class_index, mid, display_name) in csv.reader(class_map_csv)]

In [10]:
class_names

NameError: name 'class_names' is not defined

In [12]:
laugh_indices = [i for i, name in enumerate(class_names) if 'laugh' in name.lower()]
laugh_indices

[14, 15, 18]

In [None]:
import sys
import os
# Add the parent directory to Python path so we can import from utils
sys.path.append(os.path.dirname(os.getcwd()))

from utils.video import analyze_video_emotions
# Analyze only first 10 seconds
result = analyze_video_emotions("/mnt/d/Projects/twitch_videos/twitch_2465574148.mp4", interval_sec=5, time_period="10:00-20:00")

2025-07-22 21:48:01.511403: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753213682.096327   24710 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753213682.258470   24710 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753213683.955011   24710 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753213683.955036   24710 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753213683.955038   24710 computation_placer.cc:177] computation placer alr

In [2]:
result

{'video_id': 'twitch_2465574148',
 'emotion_segments': {'10:00': 'surprised',
  '10:05': 'neutral',
  '10:10': 'neutral',
  '10:15': 'neutral',
  '10:20': 'neutral',
  '10:25': 'neutral',
  '10:30': 'neutral',
  '10:35': 'surprised',
  '10:40': 'surprised',
  '10:45': 'surprised',
  '10:50': 'surprised',
  '10:55': 'happy',
  '11:00': 'neutral',
  '11:05': 'neutral',
  '11:10': 'neutral',
  '11:15': 'neutral',
  '11:20': 'neutral',
  '11:25': 'neutral',
  '11:30': 'neutral',
  '11:35': 'neutral',
  '11:40': 'neutral',
  '11:45': 'neutral',
  '11:50': 'neutral',
  '11:55': 'surprised',
  '12:00': 'neutral',
  '12:05': 'neutral',
  '12:10': 'neutral',
  '12:15': 'neutral',
  '12:20': 'neutral',
  '12:25': 'surprised',
  '12:30': 'surprised',
  '12:35': 'neutral',
  '12:40': 'neutral',
  '12:45': 'angry',
  '12:50': 'neutral',
  '12:55': 'neutral',
  '13:00': 'surprised',
  '13:05': 'neutral',
  '13:10': 'surprised',
  '13:15': 'neutral',
  '13:20': 'no_face',
  '13:25': 'no_face',
  '13:

In [None]:
import sys
import os
# Add the parent directory to Python path so we can import from utils
sys.path.append(os.path.dirname(os.getcwd()))

from utils.chat import process_audio_from_chat

# Basic usage
result = process_audio_from_chat(
    vod_url="https://www.twitch.tv/videos/2465574148",
    audio_file_path="/mnt/d/Projects/twitch/2465574148/twitch_2465574148.wav",
    top_n=10,  # Transcribe top 10 chat activity segments
    interval_seconds=5,
    audio_padding_seconds=2
)


Checking for existing files in: /mnt/d/Projects/twitch/2465574148
✓ chat.txt already exists, skipping download
✓ chat_activity.json already exists, skipping processing
Selected top 10 segments for transcription:
  161:00-161:05: 29 messages
  40:00-40:05: 24 messages
  161:10-161:15: 24 messages
  6:25-6:30: 23 messages
  40:55-41:00: 21 messages
  161:05-161:10: 21 messages
  161:15-161:20: 21 messages
  51:10-51:15: 20 messages
  253:20-253:25: 20 messages
  6:20-6:25: 19 messages
Initializing speech-to-text model...


Device set to use cuda:0


Loading audio file: /mnt/d/Projects/twitch/2465574148/twitch_2465574148.wav
Transcribing segments...
Adding 2 seconds padding before/after each segment for transcription


Processing segments: 100%|██████████| 10/10 [00:03<00:00,  3.12it/s]

Transcription completed! Results saved to: /mnt/d/Projects/twitch/2465574148/chat_transcriptions.json





In [3]:
from chat_downloader import ChatDownloader

chat_downloader = ChatDownloader()
chat = chat_downloader.get_chat("https://www.twitch.tv/videos/2518537772")

for message in chat:
    print(message)
    break

{'message_id': '61cdd654-4603-4f31-b568-9778909c38c0', 'author': {'id': '705575613', 'name': 'youngstanky1', 'display_name': 'youngstanky1', 'badges': [{'name': 'subscriber', 'version': 0, 'title': 'Subscriber', 'clickAction': 'SUBSCRIBE', 'clickURL': None, 'icons': [{'url': 'https://static-cdn.jtvnw.net/badges/v1/a23e8a7a-eddd-4299-9855-bd51c77c0845/1', 'width': 18, 'height': 18, 'id': '18x18'}, {'url': 'https://static-cdn.jtvnw.net/badges/v1/a23e8a7a-eddd-4299-9855-bd51c77c0845/2', 'width': 36, 'height': 36, 'id': '36x36'}, {'url': 'https://static-cdn.jtvnw.net/badges/v1/a23e8a7a-eddd-4299-9855-bd51c77c0845/3', 'width': 72, 'height': 72, 'id': '72x72'}]}, {'name': 'premium', 'version': 1, 'title': 'Prime Gaming', 'clickAction': 'VISIT_URL', 'clickURL': 'https://gaming.amazon.com', 'icons': [{'url': 'https://static-cdn.jtvnw.net/badges/v1/bbbe0db0-a598-423e-86d0-f9fb98ca1933/1', 'width': 18, 'height': 18, 'id': '18x18'}, {'url': 'https://static-cdn.jtvnw.net/badges/v1/bbbe0db0-a598-42