In [9]:
import gradio as gr
import re
import subprocess
from pydub import AudioSegment
from dataclasses import dataclass
import os


@dataclass
class AudioProcess_Config():
    """Class to store the necessary variables to processing the audio"""
    filepath: str
    audio: any
    input_format :str
    export_format: str
    output_format: str
    output_folder: str
    usable_folder: str
    not_usable_folder: str
    time_threshold: float
    whisper_model: str
    prefix : str
    
class AudioProcessor():
    ''' Class to process the audios'''

    def detect_silences(self, config, decibel="-23dB"):
        '''Function to detect silences in an audio'''

        # Executing ffmpeg to detect silences
        command = ["ffmpeg","-i",config.filepath,"-af",f"silencedetect=n={decibel}:d={str(config.time_threshold)}","-f","null","-"]
        out = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = out.communicate()

        # Decoding and splitting ffmpeg output
        output = stdout.decode("utf-8")
        silence_info = output.split('[silencedetect @')
        silence_starts = []
        silence_ends = []

        if len(silence_info) <= 1:
            return('No silence was detected')

            # Process each detected silence fragment
        for index, segment in enumerate(silence_info[1:], start=1):
            segment_details = segment.split(']')[1]
            if time_values := re.findall(r"[-+]?\d*\.\d+|\d+", segment_details):
                time = float(time_values[0])

                # Checking whether the time should be either the start or end time according to where we are in the iteration
                if index % 2 == 0 :
                    silence_ends.append(time)
                else:
                    silence_starts.append(time)

        return list(zip(silence_starts, silence_ends))

    def extract_midpoints(self, list):
        ''' Function to extract the midpoints where the audio must be sliced '''
        return [(start + end) / 2 for start, end in list]

    def process_segment(self, config, start_point, end_point):
        '''Extracts and exports a segment of the audio'''
        segment = config.audio[start_point * 1000 : end_point * 1000]
        temp_segment_name = f'temp_segment.{config.export_format}'
        temp_segment_path = os.path.join(config.output_folder, 'temp', temp_segment_name)
        segment.export(temp_segment_path, format=config.export_format)

        return temp_segment_path, len(segment)


def define_process_config(filepath, time_threshold, whisper_model, output_folder, prefix):
    usable_folder = os.path.join(output_folder, 'Usable_Audios')
    not_usable_folder = os.path.join(output_folder, 'Not_Usable_Audios')
    input_format = filepath.split('.')[-1].lower()
    audio = AudioSegment.from_file(filepath)

    return AudioProcess_Config(
        filepath=filepath,
        audio=audio,
        input_format=input_format,
        export_format=input_format,
        output_folder=output_folder,
        usable_folder=usable_folder,
        not_usable_folder=not_usable_folder,
        time_threshold=time_threshold,
        whisper_model=whisper_model,
        prefix=prefix
        
    )
    
def main(filepath, time_threshold, whisper_model, output_folder, prefix=None):
    process_config = define_process_config(filepath, time_threshold, whisper_model, output_folder, prefix)
    ap = AudioProcessor()

    if silence_list := ap.detect_silences(process_config):
        midpoints = ap.extract_midpoints(silence_list)
        start_point = 0
        transcriptions_dict = {}

        for end_point in midpoints:
            segment_path, segment_length = ap.process_segment(process_config, start_point, end_point)

        
       
    else:
        print('no silences detected')
    

    





In [10]:

demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Audio(sources="upload", 
                 type="filepath"),
        gr.Number(label = 'Time Threshold',
                 info = 'Choose the approximate duration of a silence in the audio'), 
        gr.Dropdown(
            [
                "Tiny",
                "Base",
                "Medium",
                "Large"
            ],
            label = "Whisper model",
            info = "Choose the Whisper model with which you want to do the retranscription"
        ),
     
          gr.Textbox(
            label = 'Output Folder',
            info = 'Type the path where you want to output the segmented audios)'
        ),
           gr.Textbox(
            label = 'Prefix',
            info = 'Choose a prefix for your extracted audio segments (like the name and chapter of the book)'
        )
        
        
    
    ],
    outputs=["text"],
)

demo.launch()

Running on local URL:  http://127.0.0.1:7912

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/home/maelys/anaconda3/envs/audio-dataset-manager/lib/python3.9/site-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
  File "/home/maelys/anaconda3/envs/audio-dataset-manager/lib/python3.9/site-packages/gradio/route_utils.py", line 232, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/maelys/anaconda3/envs/audio-dataset-manager/lib/python3.9/site-packages/gradio/blocks.py", line 1561, in process_api
    result = await self.call_function(
  File "/home/maelys/anaconda3/envs/audio-dataset-manager/lib/python3.9/site-packages/gradio/blocks.py", line 1179, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/maelys/anaconda3/envs/audio-dataset-manager/lib/python3.9/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/maelys/anaconda3/envs/audio-datas