In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
########## After running this cell, move and unzip the datasets from http://www.openslr.org/12/ 
########## into the '\\Vocal_Replace\\Data\\Initial_Data' subdirectory!!!
########## RENAME THE MAIN FOLDERS TO: 
############### LibriSpeech - dev-clean
############### LibriSpeech - test-clean
############### LibriSpeech - train-clean-100
############### LibriSpeech - train-clean-360
######### (The folder names as extracted do not follow this naming scheme.)

##### The datasets used are: test-clean.tar.gz, train-clean-360.tar.gz, train-clean-100.tar.gz, and dev-clean.tar.gz
##### Datasets were unzipped using 7-Zip (free, open-source software)
##### Please read the README file from the databases as it informs how the files are named, ordered, 
##### the speaker associated, etc.


# Creating/initializing all directories:
current_directory = os.getcwd()

# Paths are assembled with os.path.join instead of hard-coded '\\' separators, so the same
# directory tree is created on Windows, macOS and Linux.
## Main Directory:
project_root = os.path.join(current_directory, 'Vocal_Replace')

# Each entry is a directory path (split into its parts) relative to the main directory.
project_subdirectories = [
    ### Data Directory & Subdirectories:
    ('Data',),
    ('Data', 'Initial_Data'),    #### Initial_Data will not be modified
    ('Data', 'Modified_Data'),   #### Modified_Data will be all modifications to the Initial_Data;
                                 #### more subdirectories will be created later

    ### Vocal Profiles Directory & Subdirectories:
    ('Vocal_Profiles',),         #### Vocal_Profiles will contain the extracted vocal signatures for each speaker
    ('Vocal_Profiles', 'Male'),
    ('Vocal_Profiles', 'Female'),

    ### User Vocal Data Directory & Subdirectories:
    ('User_Vocal_Data',),                        #### All user data will be placed here
    ('User_Vocal_Data', 'Recordings'),           #### User vocal recordings
    ('User_Vocal_Data', 'Text'),                 #### The text of the vocal recordings
    ('User_Vocal_Data', 'User_Transformation'),  #### Modifications to the recordings and text
    ('User_Vocal_Data', 'User_Final'),           #### Final data to be input into the model
]

# os.makedirs(..., exist_ok=True) creates missing parent directories and does nothing when the
# directory already exists, so this cell can be re-run safely. Unlike the exists()-then-mkdir()
# pattern, it raises FileExistsError if a regular file is in the way instead of silently skipping.
os.makedirs(project_root, exist_ok=True)
for relative_parts in project_subdirectories:
    os.makedirs(os.path.join(project_root, *relative_parts), exist_ok=True)
    

Looking at the SPEAKER.txt file in any of the dataset folders (it is the same .txt file in all folders), it shows the speaker ID, the gender of the speaker, which dataset they are featured in, the total time they have recorded in the dataset (in minutes), and the speaker's identity.

From this file, I want to extract and create a CSV file with the ID, gender, subset, and minutes. The fields in this file are separated by "|". Unfortunately, there is extra text at the beginning of the file, and in the name column there is one name containing "|" and other names containing parentheses or punctuation. This makes the file harder to parse.

However, since there are only a few names with these notations, the easy & simple way to start is to manually copy-and-paste out the table information I want (i.e., delete the text at the beginning). Then, using Notepad++, I "mark"-ed/highlighted all "|" characters to examine which names contained this character (ID = 60 was the only speaker: |CBW|Simon --> CBW Simon). <br>
This was repeated for: <br>

Names with parentheses: <br>
        Cynthia Lyons (1946-2011) --> Cynthia Lyons<br>
        Alan Davis Drake (1945-2010) --> Alan Davis Drake<br><br>
        Zale Schafer (Rose May Chamberlin Memorial Foundat --> Zale Schafer<br>
        Nelly () --> Nelly<br>
        icyjumbo (1964-2010) --> icyjumbo<br>
        Gregg Margarite (1957-2012) --> Gregg Margarite<br>
        Jacqueline (Jacqui) Grady --> Jacqueline Grady<br><br>
    
Names with commas: <br>
    Cyril Law, Jr. --> Cyril Law<br>
    Sandra in Wales, United Kingdom --> Sandra<br>
    Carl Vonnoh, III --> Carl Vonnoh<br>
    George Deprez, PhD --> George Deprez<br>
    Priya, India --> Priya<br>
    Pete Williams, Pittsburgh, PA --> Pete Williams<br>
    Jamie Strassenburg, Cypress, California --> Jamie Strassenburg<br>
    LaraC, Louisville, KY --> LaraC<br>
    Lori Fuller Chugiak, AK --> Lori Fuller<br><br>

Then using the find-and-replace tool in Notepad++, I replaced all "|" with commas to create a comma-separated txt.
The txt file was saved as SPEAKER_Modified.txt in the \Vocal_Replace\Data\Modified_Data subdirectory.<br><br>

(I do not expect these datasets to be changed as it has been a few years since they were last modified according to the file properties.)

In [None]:
# Loading in SPEAKER_Modified.txt as a Pandas DataFrame:
# (The path is built with os.path.join so it is valid on any OS, not only on Windows.)
speaker_df = pd.read_csv(
    os.path.join(current_directory, "Vocal_Replace", "Data", "Modified_Data", "SPEAKER_Modified.txt")
)

print(speaker_df.head(5))
print(speaker_df.columns)
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
speaker_df.info()

In [None]:
# Looking at the columns and values, there are whitespaces in every column.
## Need to remove all irrelevant whitespaces:

# The header names are padded with spaces too (e.g. 'ID  ', ' SUBSET           ', ' NAME'),
# so normalise them first instead of hard-coding the exact amount of padding.
speaker_df_stripped = speaker_df.rename(columns=str.strip)

# Build the cleaned frame one whole column at a time. Assigning cell-by-cell through
# `frame['col'].loc[index] = value` is chained assignment: with pandas Copy-on-Write the
# value is written to a temporary object and the frame silently stays empty.
# Numeric columns go through str first so that stray spaces are removed whether pandas
# parsed them as numbers or as text.
speaker_df_modified = pd.DataFrame({
    'ID': speaker_df_stripped['ID'].astype(str).str.strip(' ').astype('int64'),
    'SEX': speaker_df_stripped['SEX'].str.strip(' '),
    'SUBSET': speaker_df_stripped['SUBSET'].str.strip(' '),
    'MINUTES': speaker_df_stripped['MINUTES'].astype(str).str.strip(' ').astype('float64'),
    'NAME': speaker_df_stripped['NAME'].str.strip(' '),
})

speaker_df_modified = speaker_df_modified.set_index('ID')

# os.path.join keeps the output path valid on any OS, not only on Windows.
speaker_df_modified.to_csv(
    os.path.join(current_directory, "Vocal_Replace", "Data", "Modified_Data", "SPEAKER_Modified_Final.csv")
)

Now repeating the same steps done on SPEAKER.txt but with CHAPTERS.txt

The beginning text was removed and all commas were deleted using the find-and-replace tool in Notepad++.

Then all "|" were replaced with commas.

The txt file was saved as CHAPTERS_Modified.txt in the \Vocal_Replace\Data\Modified_Data subdirectory.<br><br>


Since CHAPTERS.txt contains all the information needed to access each and every audio recording and the text associated with it, the DataFrame derived from it will be the most used one (saved as \Vocal_Replace\Data\Modified_Data\CHAPTERS_Modified_Final.csv).

In [None]:
# Loading in CHAPTERS_Modified.txt as a Pandas DataFrame:
# (The path is built with os.path.join so it is valid on any OS, not only on Windows.)
chapters_df = pd.read_csv(
    os.path.join(current_directory, "Vocal_Replace", "Data", "Modified_Data", "CHAPTERS_Modified.txt")
)

print(chapters_df.head(5))
print(chapters_df.columns)
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
chapters_df.info()

In [None]:
# Looking at the columns and values, there are whitespaces in every column.
# The book title and chapter titles were not taken since they are not important in the overall scope.
## Need to remove all irrelevant whitespaces:

# The header names are padded with spaces too (e.g. 'ID    ', ' SUBSET           ', ' PROJ.'),
# so normalise them first instead of hard-coding the exact amount of padding.
chapters_df_stripped = chapters_df.rename(columns=str.strip)


def _clean_chapter_ints(column_name):
    """Return the named column of chapters_df_stripped as int64, ignoring surrounding spaces."""
    return chapters_df_stripped[column_name].astype(str).str.strip(' ').astype('int64')


# Build the cleaned frame one whole column at a time. Assigning cell-by-cell through
# `frame['col'].loc[index] = value` is chained assignment: with pandas Copy-on-Write the
# value is written to a temporary object and the frame silently stays empty.
chapters_df_modified = pd.DataFrame({
    'ID': _clean_chapter_ints('ID'),
    'READER': _clean_chapter_ints('READER'),
    'MINUTES': chapters_df_stripped['MINUTES'].astype(str).str.strip(' ').astype('float64'),
    'SUBSET': chapters_df_stripped['SUBSET'].str.strip(' '),
    'PROJECT': _clean_chapter_ints('PROJ.'),
    'BOOK_ID': _clean_chapter_ints('BOOK ID'),
})

# A reader has many chapters, so READER contains duplicates; a stable sort keeps the
# chapters of each reader in their original file order, making the output reproducible.
chapters_df_modified = chapters_df_modified.set_index('READER').sort_index(kind='mergesort')

# os.path.join keeps the output path valid on any OS, not only on Windows.
chapters_df_modified.to_csv(
    os.path.join(current_directory, "Vocal_Replace", "Data", "Modified_Data", "CHAPTERS_Modified_Final.csv")
)