# Download negative audios

In [1]:
import pandas as pd
import yt_dlp
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading


In [None]:
# Label ids treated as negative classes. They are compared against the "mid"
# column of the class-label table, and list order decides which label a row is
# attributed to when it carries more than one of them.
negative_labels = [
    '/m/0gy1t2s', '/m/03p19w', '/m/0l156k', '/m/0l15bq', '/m/01b82r',
    '/m/05x_td', '/t/dd00013', '/m/03w41f', '/m/0jb2l', '/m/0ngt1',
    '/m/07rknqz', '/m/0h9mv', '/m/07pbtc8', '/m/0btp2', '/m/01d380',
    '/m/01h8n0', '/m/05tny_', '/m/0g6b5', '/m/06mb1', '/m/09ct_',
    '/m/0912c9', '/t/dd00134', '/m/053hz1', '/m/07pggtn', '/m/01bjv',
    '/m/0ltv', '/m/03m9d0z', '/m/04_sv', '/m/028v0c', '/m/02zsn',
    '/m/012f08', '/m/03qtwd', '/m/07r04', '/m/0ytgt', '/m/07jdr',
    '/m/0bt9lr', '/m/05zppz', '/m/0k4j', '/m/07yv9', '/m/04rlf',
]

# Input and output locations, relative to the notebook's working directory.
csv_path = "../../data/csv_files"
OUTPUT_PATH = "../../data/audios/negative"

# Table that maps a label id ("mid") to its human-readable "display_name".
class_labels = pd.read_csv(f'{csv_path}/class_labels_indices.csv')

# Segment tables. Their columns are separated by a comma followed by a space,
# which is why a multi-character separator and the python engine are used.
b_train = pd.read_csv(f"{csv_path}/balanced_train_segments.csv", sep=", ", engine="python")
b_val = pd.read_csv(f"{csv_path}/eval_segments.csv", sep=", ", engine="python")
u_train = pd.read_csv(f"{csv_path}/unbalanced_train_segments.csv", sep=", ", engine="python")

# The datasets are scanned in this order.
datasets = [b_train, b_val, u_train]


In [6]:
# Number of successful downloads recorded so far, keyed by label id.
download_counts = dict()
# Guards every read and write of download_counts made by the worker threads.
count_lock = threading.Lock()

def _download_clip(dataset, i, negative_label):
    """Download one video's audio, crop it to the labelled segment and save it.

    Args:
        dataset: Table providing the ``YTID``, ``start_seconds`` and
            ``end_seconds`` columns, indexable by ``i``.
        i: Index label of the row inside ``dataset``.
        negative_label: Label id the clip is filed under; its display name
            becomes part of the output file name.

    Returns:
        True when the cropped WAV was written, False when the row was skipped
        because the segment or the video has an unsuitable length.

    Raises:
        Exception: Whatever the metadata lookup, the download or the audio
            processing raises is propagated to the caller.
    """
    # Segment bounds converted from seconds to milliseconds.
    start = dataset["start_seconds"][i] * 1000
    end = dataset["end_seconds"][i] * 1000
    segment_ms = end - start

    # Only segments between 10 and 11 seconds long are kept. This check needs
    # no network access, so it runs before the metadata lookup.
    if segment_ms < 10000 or segment_ms > 11000:
        return False

    video_code = dataset["YTID"][i]

    # Find the display name of the class that corresponds to the label.
    class_match = class_labels[class_labels["mid"] == negative_label]["display_name"]
    class_name = class_match.values[0].replace(" ", "_")
    output_filename = f"0-{class_name}-{video_code}"
    wav_path = f"{OUTPUT_PATH}/{output_filename}.wav"
    url = f"https://www.youtube.com/watch?v={video_code}"

    # Metadata-only request to learn the video duration before downloading.
    with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
        info_dict = ydl.extract_info(url, download=False)

    # The key may be present with a null value; treat that like "unknown".
    duration = info_dict.get('duration') or 0
    if duration < 10:
        return False

    ydl_opts = {
        'format': 'bestaudio/best',
        'geo_bypass': True,
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        },
        'outtmpl': f'{OUTPUT_PATH}/{output_filename}',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        # Resample to 16 kHz, single channel.
        'postprocessor_args': [
            '-ar', '16000',
            '-ac', '1'
        ],
        'quiet': True,       # Suppress all messages except errors
        'no_warnings': True  # Suppress warnings
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Replace the full-length file with just the labelled segment.
    audio = AudioSegment.from_file(wav_path, format="wav")
    audio[start:end].export(wav_path, format="wav")
    return True


def process_row(dataset, i, row, max_per_label=200):
    """Download the clip of one dataset row if it carries a negative label.

    The row's labels are matched against ``negative_labels`` in list order.
    The first matching label whose quota is not exhausted is the only one
    attempted: whether that attempt saves a clip, is skipped or fails, no
    further labels are tried for this row.

    A quota slot is reserved before the download starts and released again if
    no clip is saved, so concurrent workers cannot push a label past
    ``max_per_label``.

    Args:
        dataset: Table the row belongs to; must provide the ``YTID``,
            ``start_seconds`` and ``end_seconds`` columns, indexable by ``i``.
        i: Index label of the row inside ``dataset``.
        row: The row itself; only ``row['positive_labels']`` is read.
        max_per_label: Maximum number of clips recorded per label.

    Returns:
        None. Successful downloads are recorded in ``download_counts``.
    """
    row_labels = row['positive_labels'].replace('"', '').split(',')

    for negative_label in negative_labels:
        if negative_label not in row_labels:
            continue

        # Check the quota and reserve a slot inside one critical section.
        with count_lock:
            current = download_counts.get(negative_label, 0)
            if current >= max_per_label:
                continue
            download_counts[negative_label] = current + 1

        saved = False
        try:
            saved = _download_clip(dataset, i, negative_label)
        except Exception as exc:
            # Best effort: one failed row must not stop the batch, but the
            # failure is reported instead of being dropped silently.
            print(f"Download failed for row {i} (label {negative_label}): {exc}")
        finally:
            if not saved:
                # Give the reserved slot back; nothing was stored for it.
                with count_lock:
                    download_counts[negative_label] -= 1

        # Only one label is ever attempted per row.
        return


In [None]:
print("Downloading negative audios...")

with ThreadPoolExecutor(max_workers=8) as executor:
    # Queue one task per row, walking the datasets in list order.
    futures = [
        executor.submit(process_row, dataset, i, row)
        for dataset in datasets
        for i, row in dataset.iterrows()
    ]

    # Collect results as tasks finish so that any exception raised inside a
    # worker is reported here instead of being lost.
    for done in as_completed(futures):
        try:
            done.result()
        except Exception as e:
            print(f"Error during processing: {e}")

print("Download completed")
