In [4]:
import pandas as pd
import requests
import os

# Download every audio clip referenced in the annotations CSV into a local
# folder, one file per CSV row, named "audio_<row index>.wav".
#
# Rows whose 'Audio_url' cell is NaN are skipped, as are rows whose target
# file already exists, so the script can be re-run to resume an interrupted
# download.

# Define the path to your CSV file and the local folder where you want to save the audio files
csv_file_path = 'dataset_amazigh/annotations/words.csv'
local_folder = 'dataset_amazigh/wav/words_wav'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever on a stalled server and the whole loop hangs on a single URL.
request_timeout = (10, 60)

# Create the local folder if it doesn't exist
os.makedirs(local_folder, exist_ok=True)

# Read the CSV file
data = pd.read_csv(csv_file_path)

# Iterate over the rows in the CSV file
for index, row in data.iterrows():
    audio_url = row['Audio_url']

    # Check if audio_url is NaN
    if pd.isna(audio_url):
        print(f"Row {index} has NaN for audio URL, skipping...")
        continue

    audio_filename = os.path.join(local_folder, f"audio_{index}.wav")

    # Check if the audio file already exists
    if os.path.exists(audio_filename):
        print(f"{audio_filename} already exists, skipping...")
        continue

    # Write to a temporary sibling file first and rename it into place only
    # once it is complete. Otherwise an interrupted write would leave a
    # truncated .wav that the "already exists" check above would skip on
    # every later run.
    partial_filename = audio_filename + ".part"

    try:
        # Download the audio file
        response = requests.get(audio_url, timeout=request_timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        with open(partial_filename, 'wb') as f:
            f.write(response.content)
        os.replace(partial_filename, audio_filename)
    except requests.exceptions.RequestException as e:
        # Timeouts, connection errors and HTTP error statuses all land here;
        # report the failure and move on to the next row.
        print(f"Failed to download {audio_url}: {e}")
    else:
        print(f"Downloaded {audio_filename}")

print("Download complete.")


dataset_amazigh/wav/words_wav/audio_0.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_1.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_2.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_3.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_4.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_5.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_6.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_7.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_8.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_9.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_10.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_11.wav already exists, skipping...
dataset_amazigh/wav/words_wav/audio_12.wav already exists, skipping...
Row 13 has NaN for audio URL, skipping...
Row 14 has NaN for audio URL, skipping...
Dow