#### Data Collection: Part 3

This notebook contains both functions and explicit code written to download and extract audio files from VoxForge.org

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as BS

import tarfile
import os

In [2]:
def get_starting_num(abbr):
    """
    Finds the sample number to start downloading
    
    Parameters:
        abbr (str) : the two letter language abbreviation to have its samples counted
        
    Returns:
        start (int) : one more than the previous highest sample number downloaded
    """
    existing_samples = [f for f in os.listdir('../audio/4_voxforge/' + abbr + '/') if '.wav' in f]
    if len(existing_samples) == 0:
        start = 0
    else:
        sample_numbers = [int(f[2:6]) for f in existing_samples]
        start = max(sample_numbers) + 1
    return start

In [3]:
def save_tgz(url):
    """
    Saves a given tgz archive
    
    Parameters:
        url (str) : the url of the download
    
    Returns:
        None
    """
    # requests gets the sample download
    dl_res = requests.get(url)
    
    fname = url.split('/')[-1]
    # content is saved to downloads folder
    with open('../downloads/' + fname, 'wb') as f:
        f.write(dl_res.content)

In [4]:
def extract_readme(tar):
    """
    Extracts the gender, age, and dialect labels from an archive's README file
    
    Parameters:
        tar (tarfile) : the archive to extract from
        
    Returns:
        gender (str) : gender label from README
        age (str) : age label from README
        dialect (str) : dialect label from README
    """
    # find readme file
    ex = [m for m in tar.getmembers() if 'README' in m.name][0]
    
    # extract
    tar.extract(ex, '../downloads/')
    
    # open
    with open('../downloads/' + ex.name, 'r') as f:
        lines = f.readlines()
        
    # grab attributes    
    gender = [l for l in lines if 'Gender' in l][0].replace('\n', '')
    age = [l for l in lines if 'Age' in l][0].replace('\n', '')
    dialect = [l for l in lines if 'dialect' in l][0].replace('\n', '')
    
    # remove file
    os.remove('../downloads/' + ex.name)
    
    # remove directories
    dirname = ex.name.split('/')[0]
    submame = ex.name.split('/')[1]
    
    try:
        os.removedirs('../downloads/' + dirname + '/' + subname)
        os.removedirs('../downloads/' + dirname)
    except:
        try:
            os.remove('../downloads/' + dirname + '/.DS_Store')
            os.remove('../downloads/' + dirname + '/' + subname + '/.DS_Store')
            os.removedirs('../downloads/' + dirname + '/' + subname)
            os.removedirs('../downloads/' + dirname)
        except:
            pass
    
    # return attributes
    return (gender, age, dialect)

In [5]:
def sav_wavs(abbr, tar, m):
    """
    Saves the wave files from a given tar archive
    
    Parameters:
        abbr (str) : the two letter language abbreviation of the audio
        tar (tarfile) : the tar archive to extract audio from
        m (int) : the index number of the download, used for naming
        
    Returns:
        saved_files (list of str) : the file paths of the saved wave files
    """
    # list tar members, filter for .wav files
    wav_list = [member for member in tar.getmembers() if '.wav' in member.name]
    saved_files = []

    # extract and save each filtered file
    for i, wav in enumerate(wav_list):
        tar.extract(wav, '../downloads/')

        # using os.renames, move and rename .wav files (first four digits download n, last three sample n)
        fname = f'{abbr}{m:04}-{i:03}'
        path = '../audio/4_voxforge/' + abbr + '/' + fname + '.wav'
        os.renames('../downloads/' + wav.name, path)
        saved_files.append(fname)
    return saved_files

In [17]:
# inputs are lang abbr, number of samples to download, possibly starting number
def voxforge_download(abbr, n=1, start=0):
    """
    Downloads and saves audio files from VoxForge
    
    Parameters:
        abbr (str) : the two letter abbreviation of the language to download samples of
        n (int) : the number of samples to download, default 1
        start (int) : the VoxForge sample number to start with
    
    Returns:
        update (pandas DataFrame) : dataframe listing of the downloads with their attributes and a list of saved file names
    """
    # requests gets the download page, made into BS
    url = 'http://www.repository.voxforge1.org/downloads/' + abbr + '/Trunk/Audio/Main/16kHz_16bit/'
    page_res = requests.get(url)
    soup = BS(page_res.text)
    
    # find 'a' entries, filter for .tgz ending
    entries = soup.find_all(name = 'a')
    dl_names = [e['href'] for e in entries if '.tgz' in e['href']]
    
    start = get_starting_num(abbr)
    saved_file_dict = {}
    
    # for n samples:
    for m in range(start, start + n):
        save_tgz(url + dl_names[m])
        
        # tarfile opens archive
        tar = tarfile.open('../downloads/' + dl_names[m])
        
        # save files
        saved_files = sav_wavs(abbr, tar, m)
        
        # extract attributes from readme
        gender, age, dialect = extract_readme(tar, m)
        
        saved_file_dict.update({f'{abbr}{m:04}' : {
            'language' : abbr,
            'samples' : saved_files,
            'gender' : gender,
            'age' : age,
            'dialect' : dialect,
        }})
        
        update = pd.DataFrame(saved_file_dict).T
    
    # update file listing
    try:
        df = pd.read_csv('../audio/4_voxforge/filelisting.csv')
    except:
        df = pd.DataFrame(columns=['age', 'dialect', 'gender', 'language', 'samples'])
    df = pd.concat([df, update])
    df.reset_index(inplace=True)
    df.rename(columns={'index' : 'sample_number'})
    df.to_csv('../audio/4_voxforge/filelisting.csv', index=False)
    
    # remove tgz file
    try:
        os.remove('../downloads/' + dl_names[m])
    except:
        pass
    
    return update

Downloading Spanish, French, Russian, and English:

(VoxForge currently does not have any Mandarin Chinese audio files)

In [8]:
voxforge_download('es', 500, get_starting_num('es'))

Unnamed: 0,language,samples,gender,age,dialect
es0100,es,"[es0100-000, es0100-001, es0100-002, es0100-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español Latinoamerica
es0101,es,"[es0101-000, es0101-001, es0101-002, es0101-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
es0102,es,"[es0102-000, es0102-001, es0102-002, es0102-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
es0103,es,"[es0103-000, es0103-001, es0103-002, es0103-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
es0104,es,"[es0104-000, es0104-001, es0104-002, es0104-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
...,...,...,...,...,...
es0595,es,"[es0595-000, es0595-001, es0595-002, es0595-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español Latinoamerica
es0596,es,"[es0596-000, es0596-001, es0596-002, es0596-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
es0597,es,"[es0597-000, es0597-001, es0597-002, es0597-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
es0598,es,"[es0598-000, es0598-001, es0598-002, es0598-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español Argentina


In [10]:
voxforge_download('fr', 600)

Unnamed: 0,language,samples,gender,age,dialect
fr0000,fr,"[fr0000-000, fr0000-001, fr0000-002, fr0000-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français Suisse
fr0001,fr,"[fr0001-000, fr0001-001, fr0001-002, fr0001-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français Belgique
fr0002,fr,"[fr0002-000, fr0002-001, fr0002-002, fr0002-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français Belgique
fr0003,fr,"[fr0003-000, fr0003-001, fr0003-002, fr0003-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français Belgique
fr0004,fr,"[fr0004-000, fr0004-001, fr0004-002, fr0004-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français Belgique
...,...,...,...,...,...
fr0595,fr,"[fr0595-000, fr0595-001, fr0595-002, fr0595-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français France
fr0596,fr,"[fr0596-000, fr0596-001, fr0596-002, fr0596-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français France
fr0597,fr,"[fr0597-000, fr0597-001, fr0597-002, fr0597-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français France
fr0598,fr,"[fr0598-000, fr0598-001, fr0598-002, fr0598-00...",Gender: Masculin,Age Range: Adulte,Pronunciation dialect: Français France


In [11]:
voxforge_download('ru', 600)

IndexError: list index out of range

In [12]:
voxforge_download('en', 600)

Unnamed: 0,language,samples,gender,age,dialect
en0000,en,"[en0000-000, en0000-001, en0000-002, en0000-00...",Gender: Male,Age Range: Adult,Pronunciation dialect: American English
en0001,en,"[en0001-000, en0001-001, en0001-002, en0001-00...",Gender: Female,Age Range: Adult,Pronunciation dialect: Other
en0002,en,"[en0002-000, en0002-001, en0002-002, en0002-00...",Gender: Female,Age Range: Adult,Pronunciation dialect: Other
en0003,en,"[en0003-000, en0003-001, en0003-002, en0003-00...",Gender: Weiblich,Age Range: Erwachsener,Pronunciation dialect: Westdeutschland
en0004,en,"[en0004-000, en0004-001, en0004-002, en0004-00...",Gender: Male,Age Range: Adult,Pronunciation dialect: American English
...,...,...,...,...,...
en0595,en,"[en0595-000, en0595-001, en0595-002, en0595-00...",Gender: Male,Age Range: Youth,Pronunciation dialect: American English
en0596,en,"[en0596-000, en0596-001, en0596-002, en0596-00...",Gender: Masculino,Age Range: Adulto,Pronunciation dialect: Español España
en0597,en,"[en0597-000, en0597-001, en0597-002, en0597-00...",Gender: Male,Age Range: Adult,Pronunciation dialect: Canadian English
en0598,en,"[en0598-000, en0598-001, en0598-002, en0598-00...",Gender: Male,Age Range: Adult,Pronunciation dialect: Canadian English


Examining the dataframe of samples

In [34]:
df = pd.read_csv('../audio/4_voxforge/filelisting.csv')
df

Unnamed: 0,sample_number,age,dialect,gender,language,samples
0,es0000,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,es,"['es0000-000', 'es0000-001', 'es0000-002', 'es..."
1,es0001,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,es,"['es0001-000', 'es0001-001', 'es0001-002', 'es..."
2,es0002,Age Range: Adulto,Pronunciation dialect: Español Argentina,Gender: Masculino,es,"['es0002-000', 'es0002-001', 'es0002-002', 'es..."
3,es0003,Age Range: Adulto,Pronunciation dialect: Español Mexicano,Gender: Masculino,es,"['es0003-000', 'es0003-001', 'es0003-002', 'es..."
4,es0004,Age Range: Adulto,Pronunciation dialect: Español Argentina,Gender: Masculino,es,"['es0004-000', 'es0004-001', 'es0004-002', 'es..."
...,...,...,...,...,...,...
1795,en0595,Age Range: Youth,Pronunciation dialect: American English,Gender: Male,en,"['en0595-000', 'en0595-001', 'en0595-002', 'en..."
1796,en0596,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,en,"['en0596-000', 'en0596-001', 'en0596-002', 'en..."
1797,en0597,Age Range: Adult,Pronunciation dialect: Canadian English,Gender: Male,en,"['en0597-000', 'en0597-001', 'en0597-002', 'en..."
1798,en0598,Age Range: Adult,Pronunciation dialect: Canadian English,Gender: Male,en,"['en0598-000', 'en0598-001', 'en0598-002', 'en..."


I noticed that not all of the samples labelled as English were indeed English. To fix this, I manually deleted the files listed on those entries and dropped them from the dataframe.

In [35]:
df['test'] = [d.split(' ')[-1] for d in df['dialect']]

In [44]:
german = df[(df['language'] == 'en') & (df['test'] == 'Westdeutschland')]
german

Unnamed: 0,sample_number,age,dialect,gender,language,samples,test
1203,en0003,Age Range: Erwachsener,Pronunciation dialect: Westdeutschland,Gender: Weiblich,en,"['en0003-000', 'en0003-001', 'en0003-002', 'en...",Westdeutschland


In [45]:
spanish = df[(df['language'] == 'en') & (df['test'] == 'España')]
spanish

Unnamed: 0,sample_number,age,dialect,gender,language,samples,test
1796,en0596,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,en,"['en0596-000', 'en0596-001', 'en0596-002', 'en...",España


In [48]:
df = df.drop(german.index)
df

Unnamed: 0,sample_number,age,dialect,gender,language,samples,test
0,es0000,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,es,"['es0000-000', 'es0000-001', 'es0000-002', 'es...",España
1,es0001,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,es,"['es0001-000', 'es0001-001', 'es0001-002', 'es...",España
2,es0002,Age Range: Adulto,Pronunciation dialect: Español Argentina,Gender: Masculino,es,"['es0002-000', 'es0002-001', 'es0002-002', 'es...",Argentina
3,es0003,Age Range: Adulto,Pronunciation dialect: Español Mexicano,Gender: Masculino,es,"['es0003-000', 'es0003-001', 'es0003-002', 'es...",Mexicano
4,es0004,Age Range: Adulto,Pronunciation dialect: Español Argentina,Gender: Masculino,es,"['es0004-000', 'es0004-001', 'es0004-002', 'es...",Argentina
...,...,...,...,...,...,...,...
1795,en0595,Age Range: Youth,Pronunciation dialect: American English,Gender: Male,en,"['en0595-000', 'en0595-001', 'en0595-002', 'en...",English
1796,en0596,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,en,"['en0596-000', 'en0596-001', 'en0596-002', 'en...",España
1797,en0597,Age Range: Adult,Pronunciation dialect: Canadian English,Gender: Male,en,"['en0597-000', 'en0597-001', 'en0597-002', 'en...",English
1798,en0598,Age Range: Adult,Pronunciation dialect: Canadian English,Gender: Male,en,"['en0598-000', 'en0598-001', 'en0598-002', 'en...",English


In [49]:
df = df.drop(spanish.index)
df

Unnamed: 0,sample_number,age,dialect,gender,language,samples,test
0,es0000,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,es,"['es0000-000', 'es0000-001', 'es0000-002', 'es...",España
1,es0001,Age Range: Adulto,Pronunciation dialect: Español España,Gender: Masculino,es,"['es0001-000', 'es0001-001', 'es0001-002', 'es...",España
2,es0002,Age Range: Adulto,Pronunciation dialect: Español Argentina,Gender: Masculino,es,"['es0002-000', 'es0002-001', 'es0002-002', 'es...",Argentina
3,es0003,Age Range: Adulto,Pronunciation dialect: Español Mexicano,Gender: Masculino,es,"['es0003-000', 'es0003-001', 'es0003-002', 'es...",Mexicano
4,es0004,Age Range: Adulto,Pronunciation dialect: Español Argentina,Gender: Masculino,es,"['es0004-000', 'es0004-001', 'es0004-002', 'es...",Argentina
...,...,...,...,...,...,...,...
1794,en0594,Age Range: Adult,Pronunciation dialect: European English,Gender: Male,en,"['en0594-000', 'en0594-001', 'en0594-002', 'en...",English
1795,en0595,Age Range: Youth,Pronunciation dialect: American English,Gender: Male,en,"['en0595-000', 'en0595-001', 'en0595-002', 'en...",English
1797,en0597,Age Range: Adult,Pronunciation dialect: Canadian English,Gender: Male,en,"['en0597-000', 'en0597-001', 'en0597-002', 'en...",English
1798,en0598,Age Range: Adult,Pronunciation dialect: Canadian English,Gender: Male,en,"['en0598-000', 'en0598-001', 'en0598-002', 'en...",English


In [50]:
df.drop(columns='test', inplace=True)

In [52]:
df.to_csv('../audio/4_voxforge/filelisting.csv', index=False)