In [None]:
import os, datetime

import librosa
import numpy as np
import pandas as pd
import soundfile
import tensorflow as tf
from pyAudioAnalysis import audioBasicIO, ShortTermFeatures
from pydub import AudioSegment

### !!!! IF STARTING HERE:<br>
#### Move and unzip the datasets from http://www.openslr.org/12/ 
#### into the '\\Vocal_Replace\\Data\\Initial_Data' subdirectory!!!
#### RENAME THE MAIN FOLDERS TO: 
#### LibriSpeech - dev-clean
#### LibriSpeech - test-clean
#### LibriSpeech - train-clean-100
#### LibriSpeech - train-clean-360
#### (The original folder names do not follow this naming scheme.) <br><br>

##### The datasets used are: test-clean.tar.gz, train-clean-360.tar.gz, train-clean-100.tar.gz, and dev-clean.tar.gz
##### The datasets were unzipped using 7-Zip (free, open-source software).
##### Please read the README file included with the datasets; it explains how the files are named and ordered, 
##### which speaker each file belongs to, etc.<br><br>

This notebook extracts the audio properties from the audio recordings.

CHAPTERS_Modified_Final.csv serves as the guide for locating all directories and files.

SoundFile is used to convert the FLAC audio files to WAV.

pyAudioAnalysis is used as the extractor for audio properties. The audio properties extracted are:


In [None]:
# Loading in CHAPTERS_Modified_Final.csv (one row per chapter; used below to locate each
# speaker/chapter directory):
current_directory = os.getcwd()

# os.path.join inserts the separator of the running OS, so the path is no longer tied to
# Windows backslashes. On Windows the resulting path is the same as before.
directory_reference = pd.read_csv(os.path.join(current_directory, "Vocal_Replace", "Data", "Modified_Data",
                                               "CHAPTERS_Modified_Final.csv"))

directory_reference.head()

In [None]:
# Loading in the SPEAKER_Modified_Final.csv to obtain the gender for each speaker.
# The path is assembled with os.path.join so it works with any OS path separator
# (on Windows the resulting path is the same as before):
speaker_df = pd.read_csv(os.path.join(current_directory, "Vocal_Replace", "Data", "Modified_Data",
                                      "SPEAKER_Modified_Final.csv"))

speaker_df.head()

In [None]:
# Default locations of the audio databases. Each entry of databases_directories is a
# (top-level folder name, subset folder name) pair, built by pairing the two tuples below
# position by position:
database_folders = ('LibriSpeech - dev-clean',
                    'LibriSpeech - test-clean',
                    'LibriSpeech - train-clean-100',
                    'LibriSpeech - train-clean-360')
database_subsets = ('dev-clean',
                    'test-clean',
                    'train-clean-100',
                    'train-clean-360')

databases_directories = list(zip(database_folders, database_subsets))


# Splitting and cleaning text files that reference the audio, one database at a time:
for database_directory in databases_directories:
    ## Creating a DF that only contains the rows whose SUBSET matches the subset folder name (database_directory[1])
    directory_select_df = directory_reference[directory_reference.SUBSET == database_directory[1]].copy()

    ### Accessing each subdirectory that contains the data files using directory_select_df
    for index, row in directory_select_df.iterrows():
        speaker_ID = row.READER
        chapter_ID = row.ID

        #### Folder that holds this chapter's files:
        #### <cwd>/Vocal_Replace/Data/Initial_Data/<database folder>/<subset>/<speaker>/<chapter>/
        #### 'Vocal_Replace' is included so the location matches the other paths used in this
        #### notebook and the setup instructions. The trailing '' keeps a separator at the end,
        #### because filenames are appended to this string further down.
        directory_filepath = os.path.join(current_directory, 'Vocal_Replace', 'Data', 'Initial_Data',
                                          database_directory[0], database_directory[1],
                                          str(speaker_ID), str(chapter_ID), '')
        
        #### Creating a list of filenames:
        file_list = os.listdir(directory_filepath)

        #### Creating a list of the .flac filenames without their extension, and a variable assigned to
        #### the .txt file (which keeps the .txt extension in the string).
        #### os.path.splitext removes only the extension; str.strip('.flac') would instead remove any of
        #### the characters '.', 'f', 'l', 'a', 'c' from BOTH ends of the name.
        filename_list = [os.path.splitext(filename)[0] for filename in file_list if filename.endswith('.flac')]

        text_file = [filename for filename in file_list if filename.endswith('.txt')][0]

        #### Creating a DF of the text associated with each filename.
        #### Each line is split at its first space into an ID and the remaining text
        #### (assumes every line starts with the ID followed by one space -- TODO confirm against the README).
        #### Lines whose ID has no matching .flac file are skipped.
        known_filenames = set(filename_list)
        text_records = []

        with open(directory_filepath + text_file, 'r') as reader:
            for line in reader:
                utterance_ID, _separator, transcript = line.rstrip('\n').partition(' ')

                if utterance_ID in known_filenames:
                    text_records.append({'SPEAKER_CHAPTER_ORDER_ID': utterance_ID, 'TEXT': transcript})

        #### Building the DF once from the collected records avoids chained assignment
        #### (text_df[col].loc[i] = value), which does not reliably write back into the DF.
        text_df = pd.DataFrame(text_records, columns= ['SPEAKER_CHAPTER_ORDER_ID', 'TEXT'])
                    
        
        #### Creating subdirectories in the Modified_Data subdirectory to store text_df:
        #### <cwd>/Vocal_Replace/Data/Modified_Data/Text_And_Audio_Data/<speaker>/<chapter>
        text_audio_directory = os.path.join(current_directory, 'Vocal_Replace', 'Data', 'Modified_Data',
                                            'Text_And_Audio_Data', str(speaker_ID), str(chapter_ID))

        #### os.makedirs creates every missing level in one call, and exist_ok=True makes it a no-op
        #### when the folders are already there (e.g. when the cell is re-run).
        os.makedirs(text_audio_directory, exist_ok= True)

        #### Saving text_df as '<speaker>-<chapter> - Text DataFrame.csv' inside that folder:
        text_df.to_csv(os.path.join(text_audio_directory,
                                    str(speaker_ID) + '-' + str(chapter_ID) + ' - Text DataFrame.csv'))
        
        
        #### Extracting, for every .flac file of this chapter, the audio properties that will be used
        #### as the input variables:
        for file in file_list:
            if '.flac' in file:
                audio_filepath = directory_filepath + file
                audio_time_series, sample_rate = librosa.load(audio_filepath)

                ###### Number of samples in the recording; passed below as the size parameter
                ###### (n_mels, n_fft, n_chroma, hop_length) of the librosa calls.
                total_samples = len(audio_time_series)

                ###### Mel-scaled spectrogram:
                mel_spectrogram = librosa.feature.melspectrogram(
                    y=audio_time_series,
                    sr=sample_rate,
                    n_mels=total_samples,
                )

                ###### Pitch & magnitude:
                pitches, magnitudes = librosa.piptrack(
                    y=audio_time_series,
                    sr=sample_rate,
                    n_fft=total_samples,
                    hop_length=total_samples,
                )

                ###### Chroma:
                chroma = librosa.feature.chroma_stft(
                    y=audio_time_series,
                    sr=sample_rate,
                    n_chroma=total_samples,
                    hop_length=total_samples,
                )

                ###### Tempo (for entire audio):
                tempo = librosa.beat.tempo(y=audio_time_series, sr=sample_rate)

                ###### Short-time Fourier Transformation:
                short_time_fourier_transform = librosa.stft(
                    y=audio_time_series,
                    n_fft=total_samples,
                    hop_length=total_samples,
                )

                ###### Audio length/duration in seconds:
                duration = librosa.get_duration(y=audio_time_series, sr=sample_rate)
                
                ###### Time = one sample per time slot/row:
                ###### Every timestamp is computed directly from its sample index. Repeatedly adding a
                ###### fixed increment drifts, because datetime.timedelta rounds that increment to whole
                ###### microseconds and the rounding error is accumulated once per sample.
                ###### An empty recording yields an empty list (no division is performed).
                sample_count = len(audio_time_series)
                time_list = [datetime.timedelta(seconds= sample_index * duration / sample_count)
                             for sample_index in range(sample_count)]
                
                
                ###### Scaling/doubling the lists for pitches, magnitudes, and short_time_fourier_transform:
                ###### the first-frame value of every row is written twice, then the list is cut to the
                ###### number of samples so it lines up with one row per sample.
                ###### librosa documents these arrays as having 1 + n_fft // 2 rows, so doubling gives
                ###### N + 1 values for an odd sample count N and N + 2 for an even one; cutting to N
                ###### covers both cases (removing a single value only when N is odd left even-length
                ###### recordings two values too long).
                ###### Reading column 0 explicitly also works when a row holds more than one frame,
                ###### where float(row) would raise.
                target_length = len(audio_time_series)

                pitches_repeated = [float(value) for value in
                                    np.repeat(np.asarray(pitches)[:, 0], 2)[:target_length]]

                magnitudes_repeated = [float(value) for value in
                                       np.repeat(np.asarray(magnitudes)[:, 0], 2)[:target_length]]

                ###### Values are kept as returned by librosa.stft (not converted to float).
                short_time_fourier_transform_repeated = list(
                    np.repeat(np.asarray(short_time_fourier_transform)[:, 0], 2)[:target_length])
                    
                    
                ###### Obtaining the gender of the speaker (SEX of the first speaker_df row whose ID matches):
                gender = speaker_df[speaker_df['ID'] == speaker_ID].SEX.values[0]

                ###### Obtaining the text associated to the audio file.
                ###### os.path.splitext removes only the extension; str.strip('.flac') would instead remove
                ###### any of the characters '.', 'f', 'l', 'a', 'c' from BOTH ends of the filename.
                text = text_df[text_df.SPEAKER_CHAPTER_ORDER_ID == os.path.splitext(file)[0]].TEXT.values[0]

                ###### Combining all the audio properties, text, and speaker info into one DF.
                ###### List/array entries supply one value per row; scalar entries are repeated on every row.
                ###### np.ravel(tempo)[0] reads the first tempo value explicitly, which works whether tempo
                ###### is a scalar or an array (float() on an array is deprecated in recent NumPy).
                combine_df = pd.DataFrame({'TIME': time_list, 'AUDIO_TIME_SERIES': audio_time_series,
                                           'SAMPLE_RATE': sample_rate, 'MEL_SPECTROGRAM': list(mel_spectrogram),
                                           'PITCH': pitches_repeated, 'MAGNITUDE': magnitudes_repeated,
                                           'CHROMA': list(chroma), 'GLOBAL_TEMPO': float(np.ravel(tempo)[0]),
                                           'DURATION_SEC': duration, 'SPEAKER': speaker_ID,
                                           'GENDER': gender, 'TEXT': text})
                