<a href="https://colab.research.google.com/github/phyllisnabangi/sunbirdai/blob/main/SpeechDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing dependencies

In [None]:
# !pip install pydub
# !pip install datasets

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m

## Importing Libraries

In [None]:
import os
import glob
import librosa
import numpy as np
from pydub import AudioSegment
import audioread


## Loading data in the speech-data folder

In [None]:
!ls -l "/content/drive/Shareddrives/Sunbird AI/Internships/internship-experiments/speech-data"


total 1141572
dr-x------ 2 root root       4096 Aug 24 12:32 Ateso
-r-------- 1 root root 1163387946 Aug 24 12:24 backup_uganda_raw_dataset.zip
-r-------- 1 root root    5568626 Aug 29 13:38 drive-download-20230829T133716Z-001.zip
dr-x------ 2 root root       4096 Aug 24 12:32 Luganda
dr-x------ 2 root root       4096 Aug 28 08:57 Runyankole


In [None]:
# !cp -r "/content/drive/Shareddrives/Sunbird AI/Projects/African Language Technology/Data/Runyankole Voice Over/Runyankole" "/content/drive/Shareddrives/Sunbird AI/Internships/internship-experiments/speech-data/"

# No write permissions - uploaded the files manually

In [None]:
!unzip '/content/drive/Shareddrives/Sunbird AI/Internships/internship-experiments/speech-data/backup_uganda_raw_dataset.zip'


Archive:  /content/drive/Shareddrives/Sunbird AI/Internships/internship-experiments/speech-data/backup_uganda_raw_dataset.zip
 extracting: backup_uganda_raw_dataset/audio_24.mp3  
 extracting: backup_uganda_raw_dataset/audio_20.mp3  
 extracting: backup_uganda_raw_dataset/audio_29.mp3  
 extracting: backup_uganda_raw_dataset/audio_23.mp3  
 extracting: backup_uganda_raw_dataset/audio_27.mp3  
 extracting: backup_uganda_raw_dataset/audio_13.mp3  
 extracting: backup_uganda_raw_dataset/audio_37.mp3  
 extracting: backup_uganda_raw_dataset/audio_8.mp3  
 extracting: backup_uganda_raw_dataset/audio_22.mp3  
 extracting: backup_uganda_raw_dataset/audio_10.mp3  
 extracting: backup_uganda_raw_dataset/audio_36.mp3  
 extracting: backup_uganda_raw_dataset/audio_7.mp3  
 extracting: backup_uganda_raw_dataset/audio_26.mp3  
 extracting: backup_uganda_raw_dataset/audio_30.mp3  
 extracting: backup_uganda_raw_dataset/audio_15.mp3  
 extracting: backup_uganda_raw_dataset/audio_6.mp3  
 extracting: 

In [None]:
# convert to wav
def convert_to_wav(input_folder, output_folder):
    try:
        # Ensure the output folder exists
        os.makedirs(output_folder, exist_ok=True)

        # Iterate over all files in the input folder
        for filename in os.listdir(input_folder):
            if filename.lower().endswith((".mp3", ".ogg", ".flac")):
                # Build paths for input and output files
                input_file = os.path.join(input_folder, filename)
                output_file = os.path.join(output_folder, os.path.splitext(filename)[0] + ".wav")

                # Load the input audio file using pydub
                audio = AudioSegment.from_file(input_file)

                # Export the audio to WAV format
                audio.export(output_file, format="wav")

        return True  # Conversion successful for all files
    except Exception as e:
        print(f"Error: {e}")
        return False  # Conversion failed


In [None]:
# converting the Acholi dataset from mp3 to wav
input_folder = "backup_uganda_raw_dataset"
output_folder = "Acholi_wav"

conversion_result = convert_to_wav(input_folder, output_folder)


In [None]:
# splitting audios into chuncks
def chunk_audio(audio, chunk_size_ms, overlap_ms):
  # chunk_size -> numnber of milliseconds for each chunk
  # overlap_ms -> overlap between each chunk so we do not lose some data
    sound = AudioSegment.from_wav(audio)
    duration = len(sound)

    chunks = []
    start_time = 0

    while start_time + chunk_size_ms <= duration:
        end_time = start_time + chunk_size_ms
        chunk = sound[start_time:end_time]
        chunks.append(chunk)
        start_time += chunk_size_ms - overlap_ms

    return chunks

In [None]:
# input_dir = "/content/drive/Shareddrives/Sunbird AI/Internships/internship-experiments/speech-data/Runyankole"
input_dir = "Acholi_wav"
output_dir = "/content/chunked_audio/"

In [None]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

audio_files = glob.glob(os.path.join(input_dir, "*.wav"))

for audio in audio_files:
    audio_name = os.path.splitext(os.path.basename(audio))[0]
    audio_dir = os.path.join(output_dir, audio_name)

    if not os.path.exists(audio_dir):
        os.makedirs(audio_dir)

    chunks = chunk_audio(audio, chunk_size_ms=30000, overlap_ms=1000)

    for i, chunk in enumerate(chunks):
        chunk_path = os.path.join(audio_dir, f"c{i+1}.wav")
        chunk.export(chunk_path, format="wav")

In [None]:
!ls "/content/drive/Shareddrives/Sunbird AI/Internships/internship-experiments/speech-data/Runyankole"

'RUNYANKOLE 0.wav'	'RUNYANKOLE 30.wav'	'RUNYANKOLE 64.wav'
'RUNYANKOLE 10(1).wav'	'RUNYANKOLE 31(1).wav'	'RUNYANKOLE 65.wav'
'RUNYANKOLE 10.wav'	'RUNYANKOLE 31.wav'	'RUNYANKOLE 66.wav'
'RUNYANKOLE 11(1).wav'	'RUNYANKOLE 32.wav'	'RUNYANKOLE 67.wav'
'RUNYANKOLE 11.wav'	'RUNYANKOLE 33.wav'	'RUNYANKOLE 68.wav'
'RUNYANKOLE 12(1).wav'	'RUNYANKOLE 34.wav'	'RUNYANKOLE 69.wav'
'RUNYANKOLE 12.wav'	'RUNYANKOLE 35(1).wav'	'RUNYANKOLE 6.wav'
'RUNYANKOLE 13(1).wav'	'RUNYANKOLE 35.wav'	'RUNYANKOLE 70.wav'
'RUNYANKOLE 13.wav'	'RUNYANKOLE 36(1).wav'	'RUNYANKOLE 71.wav'
'RUNYANKOLE 14(1).wav'	'RUNYANKOLE 36.wav'	'RUNYANKOLE 72.wav'
'RUNYANKOLE 14.wav'	'RUNYANKOLE 37(1).wav'	'RUNYANKOLE 73.wav'
'RUNYANKOLE 15.wav'	'RUNYANKOLE 37.wav'	'RUNYANKOLE 74.wav'
'RUNYANKOLE 16.wav'	'RUNYANKOLE 38(1).wav'	'RUNYANKOLE 75.wav'
'RUNYANKOLE 17.wav'	'RUNYANKOLE 38.wav'	'RUNYANKOLE 76.wav'
'RUNYANKOLE 18.wav'	'RUNYANKOLE 39.wav'	'RUNYANKOLE 77.wav'
'RUNYANKOLE 19.wav'	'RUNYANKOLE 3.wav'	'RUNYANKOLE 78.wav'
'RUNYANKOLE 1

In [None]:
!ls "/content/chunked_audio/"

audio_1   audio_15  audio_20  audio_26	audio_31  audio_37  audio_5
audio_10  audio_16  audio_21  audio_27	audio_32  audio_38  audio_6
audio_11  audio_17  audio_22  audio_28	audio_33  audio_39  audio_7
audio_12  audio_18  audio_23  audio_29	audio_34  audio_4   audio_8
audio_13  audio_19  audio_24  audio_3	audio_35  audio_40  audio_9
audio_14  audio_2   audio_25  audio_30	audio_36  audio_41


In [None]:
# chuncking on silence detection
from pydub import AudioSegment
from pydub.silence import split_on_silence

def chunk_audio_on_silence(audio, silence_threshold=-30):
    sound = AudioSegment.from_wav(audio)

    # Split audio based on silence detection
    audio_chunks = split_on_silence(
        sound,
        # min_silence_duration=min_silence_len,
        silence_thresh=silence_threshold,
        keep_silence=True
    )

    return audio_chunks


In [None]:
# Define the input directory containing audio files
input_dir = "Acholi_wav"
output_dir = "/content/chunked_audio/Acholi" # the output directory for the chunked audio

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# List all audio files in the input directory
audio_files = glob.glob(os.path.join(input_dir, "*.wav"))

# Process each audio file and export the chunks
for audio_file in audio_files:
    # Create a directory for each audio file's chunks within the output directory
    audio_name = os.path.splitext(os.path.basename(audio_file))[0]
    audio_dir = os.path.join(output_dir, audio_name)

    if not os.path.exists(audio_dir):
        os.makedirs(audio_dir)

    # Chunk the audio based on silence detection
    audio_chunks = chunk_audio_on_silence(audio_file)

    # Export or process audio chunks as needed
    for i, chunk in enumerate(audio_chunks):
        chunk_path = os.path.join(audio_dir, f"c{i+1}.wav")
        chunk.export(chunk_path, format="wav")


In [None]:
# checking pydub version
!pip show pydub

In [None]:
!ls


In [1]:
# # Needed imports
import numpy as np
from IPython.display import Audio
from scipy.io import wavfile

In [None]:
# Assuming you have already read the audio file using librosa
audio_data, sample_rate = librosa.load('/content/chunked_audio/Acholi/audio_1')

# Play the audio
Audio(data=audio_data, rate=sample_rate)


In [None]:
# Generate a player for mono sound
Audio(data,rate=framerate)