In [None]:
import gradio as gr
import re
import subprocess
from pydub import AudioSegment
from dataclasses import dataclass
import os


@dataclass
class AudioProcess_Config:
    """Bundle of settings describing one audio-processing run.

    Instances are plain value holders; the fields are stored exactly as
    given, with no validation or conversion.
    """

    # Path of the audio file to analyse (handed to ffmpeg as its input).
    filepath: str
    # Base directory for results.
    output_folder: str
    # Sub-folder path for usable segments.
    usable_folder: str
    # Sub-folder path for unusable segments.
    not_usable_folder: str
    # Value passed as the ``d=`` option of ffmpeg's silencedetect filter.
    time_threshold: float
    # Name of the Whisper model chosen by the user.
    whisper_model: str
    # Prefix chosen by the user for the extracted audio segments.
    prefix: str
    
class AudioProcessor:
    """Audio analysis helpers that shell out to the ``ffmpeg`` executable."""

    # The silencedetect filter logs lines such as
    #   [silencedetect @ 0x...] silence_start: 1.5
    #   [silencedetect @ 0x...] silence_end: 3.25 | silence_duration: 1.75
    # Timestamps are read from the explicit labels rather than from the
    # position of a line in the log, so an unexpected or number-less
    # silencedetect line cannot swap starts and ends.
    _TIMESTAMP = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    _SILENCE_START_RE = re.compile(r"silence_start:\s*" + _TIMESTAMP)
    _SILENCE_END_RE = re.compile(r"silence_end:\s*" + _TIMESTAMP)

    def detect_silences(self, config, decibel="-23dB"):
        """Locate the silent stretches of ``config.filepath`` with ffmpeg.

        Args:
            config: Object exposing ``filepath`` (the audio file to read) and
                ``time_threshold`` (passed as the ``d=`` option of the
                silencedetect filter).
            decibel: Noise level passed as the ``n=`` option of the
                silencedetect filter.

        Returns:
            A list of ``(silence_start, silence_end)`` float tuples in the
            order ffmpeg reported them, or the string
            ``'No silence was detected'`` when the ffmpeg output contains no
            silencedetect line at all. The ffmpeg exit status is not
            inspected, so a failed run also yields that string.

        Raises:
            OSError: If the ``ffmpeg`` executable cannot be started
                (typically ``FileNotFoundError`` when it is not on PATH).
        """
        command = [
            "ffmpeg",
            "-i", config.filepath,
            "-af", f"silencedetect=n={decibel}:d={config.time_threshold}",
            "-f", "null",
            "-",
        ]

        # ffmpeg writes its log to stderr; merge it into stdout so a single
        # stream has to be parsed. stdin is detached so ffmpeg never tries
        # to read interactive commands from the parent process.
        completed = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )

        # The log can contain arbitrary bytes (for instance file names), so
        # undecodable bytes are replaced instead of raising.
        output = completed.stdout.decode("utf-8", errors="replace")

        if "[silencedetect @" not in output:
            return 'No silence was detected'

        silence_starts = [
            float(value) for value in self._SILENCE_START_RE.findall(output)
        ]
        silence_ends = [
            float(value) for value in self._SILENCE_END_RE.findall(output)
        ]

        # zip() drops a trailing start that never received a matching end.
        return list(zip(silence_starts, silence_ends))

def define_process_config(filepath, time_threshold, whisper_model, output_folder, prefix):
    """Assemble the AudioProcess_Config for one processing run.

    The usable / not-usable folder paths are derived from ``output_folder``
    by appending 'Usable_Audios' and 'Not_Usable_Audios'. Only the paths are
    computed here; nothing is created on disk.
    """
    settings = {
        "filepath": filepath,
        "output_folder": output_folder,
        "usable_folder": os.path.join(output_folder, 'Usable_Audios'),
        "not_usable_folder": os.path.join(output_folder, 'Not_Usable_Audios'),
        "time_threshold": time_threshold,
        "whisper_model": whisper_model,
        "prefix": prefix,
    }
    return AudioProcess_Config(**settings)
    
def main(filepath, time_threshold, whisper_model, output_folder, prefix=None):
    """Build the run configuration and return the silences found in the audio.

    The result is whatever ``AudioProcessor.detect_silences`` returns for the
    configuration built from these arguments.
    """
    run_settings = define_process_config(
        filepath=filepath,
        time_threshold=time_threshold,
        whisper_model=whisper_model,
        output_folder=output_folder,
        prefix=prefix,
    )
    return AudioProcessor().detect_silences(run_settings)





In [None]:

# Gradio front end: the input components are listed in the same order as the
# positional parameters of ``main`` (filepath, time_threshold, whisper_model,
# output_folder, prefix).
demo = gr.Interface(
    fn=main,
    inputs=[
        # Uploaded audio is handed to ``main`` as a path on disk.
        gr.Audio(sources="upload", type="filepath"),
        gr.Number(
            label='Time Threshold',
            info='Choose the approximate duration of a silence in the audio',
        ),
        gr.Dropdown(
            ["Tiny", "Base", "Medium", "Large"],
            label="Whisper model",
            info="Choose the Whisper model with which you want to do the retranscription",
        ),
        gr.Textbox(
            label='Output Folder',
            # The original help text ended with a stray, unmatched ")".
            info='Type the path where you want to output the segmented audios',
        ),
        gr.Textbox(
            label='Prefix',
            info='Choose a prefix for your extracted audio segments (like the name and chapter of the book)',
        ),
    ],
    # The return value of ``main`` is rendered as text.
    outputs=["text"],
)

demo.launch()