# Removal of leading and trailing silences

The goal of this code is to remove leading and trailing silences of a list of 461 audio files by performing the following:
1. Remove any leading or trailing silences in the duration of an entire audio file, with the silence threshold set at -40dBFS i.e. any segments with loudness of -40dBFS or lower will be regarded as a silent portion
2. Creating a new excel sheet with the (1) duration 

In [1]:
import pandas as pd
import numpy as np
import os
from pydub import AudioSegment
from pydub import AudioSegment, silence
from os.path import join



In [2]:
#Specify folder path where original audio files are stored 
audio_folder = '/Users/jannaha/Desktop/audio_files/'

In [3]:
#Specify folder path where manipulated audio files will be stored
output_folder = '/Users/jannaha/Desktop/audio_files/output/'

In [4]:
#Path of excel file where list of audio file names (.wav) are stored
excel_file = '/Users/jannaha/Desktop/audio_files/wavefiles.xlsx'

In [5]:
df = pd.read_excel(excel_file)

In [6]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,wavefile,word,token_no,new-wavefile
0,0,0,001 Saya 01.wav,saya,1,001 Saya 01-new.wav
1,1,1,002 kita 01.wav,kita,1,002 kita 01-new.wav
2,2,2,003 kalau 01.wav,kalau,1,003 kalau 01-new.wav
3,3,3,004 jadi 01.wav,jadi,1,004 jadi 01-new.wav
4,4,4,005 tapi 01.wav,tapi,1,005 tapi 01-new.wav


In [7]:
#Number of rows corresponds to number of audio (.wav) files in folder are stored in 
df.shape

(13, 6)

In [8]:
#Creating a new dataframe to store metadata or attributes of each audio file 
#The duration of original audio file, timestamp where silence is at is added as columns in df2 dataframe
df2 = pd.DataFrame(columns = ['wavefile','duration', 'silence','total_len', 'dbfs_loudness'])

In [9]:
def get_info():
    for i in df.index:
        audiofile = df.loc[i, 'wavefile']
        wavefile_path = os.path.join(audio_folder, audiofile)
        
        # checks if file path exists
        if not os.path.isfile(wavefile_path):
            print(f"{wavefile_path} does not exist.")
        
        sound = AudioSegment.from_file(wavefile_path)
        
        #search for sections with silences, silence threshold set at -50dBFS
        silence_dur = silence.detect_silence(sound, min_silence_len= 50, silence_thresh=-50, seek_step =1)
        total_dur = len(sound)
        list_count = len(silence_dur)
        
        #if else statement to remove only leading and trailing silences rather than silences in between word utterance
        if list_count == 1 and silence_dur[0][0] > 0:
            new_file = sound[:int(silence_dur[0][0])]
            
        elif list_count == 1 and silence_dur[0][0] == 0:
            new_file = sound
            
        elif list_count > 1:
            new_file = sound[int(silence_dur[0][1]):int(silence_dur[-1][0])]
        
        #writing new file name 
        new_file_path = output_folder + "/" + audiofile[:-4] + "-new.wav"
        new_file.export(new_file_path, format = "wav")
        new_sound = AudioSegment.from_file(new_file_path)
        
        #specifying contents to write in df2
        wavefile = audiofile
        total_len = len(new_sound)
        dbfs_loudness = new_sound.dBFS
        
        df2.loc[i] = [wavefile, total_dur, silence_dur, total_len, dbfs_loudness]


In [10]:
get_info()

In [11]:
df2

Unnamed: 0,wavefile,duration,silence,total_len,dbfs_loudness
0,001 Saya 01.wav,657,"[[0, 73], [592, 657]]",519,-31.47877
1,002 kita 01.wav,1200,"[[0, 81], [1013, 1073], [1124, 1200]]",1043,-36.793432
2,003 kalau 01.wav,776,"[[119, 177], [574, 776]]",397,-32.367459
3,004 jadi 01.wav,766,"[[623, 766]]",623,-31.528603
4,005 tapi 01.wav,604,"[[184, 270], [470, 523]]",200,-36.005652
5,006 tahu 01.wav,745,"[[0, 123], [604, 715]]",481,-28.871359
6,007 saja01.wav,810,"[[0, 50], [748, 810]]",698,-35.314272
7,008 lagi 01.wav,749,"[[0, 119], [605, 749]]",486,-33.970922
8,009 mana 01.wav,746,"[[0, 117], [618, 746]]",501,-32.225511
9,010 satu 01.wav,760,"[[0, 134], [406, 539], [635, 697]]",501,-33.150293


In [12]:
#creating excel writer
write_to_path = '/Users/jannaha/Desktop/audio_files/output/wavefiles_delsilence.xlsx'

#writing dataframe to excel sheet, named Sheet 1
df2.to_excel(write_to_path, index=False)