#### Data Collection: Part 1

This notebook contains the functions that were used to mass-download audio files from audio-lingua.eu.

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
def writefile(res, language, filename):
    """
    Saves mp3 audio files

    The file is written to ``../audio/1_audiolingua/<language>/<filename>.mp3``.
    The language folder is created if it does not exist yet.

    Args:
        res: response object of the audio file request; its ``content``
            bytes are written to disk unchanged
        language: language spoken in sample (used as the sub-folder name)
        filename: file name to be saved as, without the ``.mp3`` extension

    Returns:
        None

    Raises:
        OSError: if the folder cannot be created or the file cannot be written
    """
    target_dir = '../audio/1_audiolingua/' + language
    # open() does not create missing parent folders; without this the very
    # first download for a new language fails with FileNotFoundError
    os.makedirs(target_dir, exist_ok=True)
    with open(target_dir + '/' + filename + '.mp3', 'wb') as f:
        f.write(res.content)

In [3]:
def pagelist(url, language):
    """
    Pulls, saves, and documents all the samples linked on a given audio-lingua webpage

    Each entry is handled on a best-effort basis: an entry without a download
    link is skipped, and an entry whose download or save fails is reported
    with a printed message and skipped, so one bad entry does not abort the page.

    Args:
        url: string of the webpage address
        language: language of the samples on the page

    Returns:
        samples: a pandas dataframe of the new samples saved and added to the
            master list, with columns ``file_name``, ``language`` and ``labels``
            (empty, but with the same columns, when nothing was saved)
    """
    base_url = 'https://www.audio-lingua.eu/'
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run
    page_dict = {}

    page_res = requests.get(url, timeout=request_timeout)
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning)
    soup = BS(page_res.text, 'html.parser')

    # find the first 5 article entries on the page--beyond that have no download links
    entries = soup.find_all(name='article', attrs={'class' : 'entry article hentry'})[:5]

    # iterate through page posts
    for entry in entries:
        # the anchor holding the download link; absent on entries without audio
        entry_res = entry.find(name='a', attrs={'title' : 'Download'})
        if entry_res is None:
            continue

        try:
            href = entry_res['href']
            # grab the name prior to the extension
            fname = href.split('/')[2].split('.')[0]
            # go to direct download page
            audio_res = requests.get(base_url + href, timeout=request_timeout)
            # do not save an HTTP error page as if it were an mp3
            audio_res.raise_for_status()
            # save audio files
            writefile(audio_res, language, fname)
        except (KeyError, IndexError, requests.RequestException, OSError) as err:
            # skip this entry but say why, instead of silently swallowing it
            print('Skipped an entry on ' + url + ': ' + repr(err))
            continue

        # grab the labels on the entry
        labels = [label.text.strip() for label in entry.find_all(
            name='a', attrs={'class' : 'label'}
        )]

        # add audio entries to dictionary (keyed by name, so a repeated name
        # on the same page keeps only its latest details)
        page_dict[fname] = {'language' : language, 'labels' : labels}

    # samples to be updated; explicit columns keep the schema stable even
    # when no sample was saved from this page
    samples = pd.DataFrame(
        [
            {'file_name' : fname, 'language' : info['language'], 'labels' : info['labels']}
            for fname, info in page_dict.items()
        ],
        columns=['file_name', 'language', 'labels'],
    )

    # add to file listing
    update_listing(samples)

    # show added samples
    return samples


In [4]:
def update_listing(df):
    """
    Updates and saves the master list of samples.

    The new rows are appended to ``../audio/1_audiolingua/file_listing.csv``.
    When a ``file_name`` occurs more than once, the most recently added row
    is kept.

    Args:
        df: a pandas dataframe of the new samples to be added

    Returns:
        None

    Raises:
        pandas.errors.ParserError: if the existing listing cannot be parsed.
            The file is left untouched in that case rather than being
            overwritten with only the new rows.
    """
    listing_path = '../audio/1_audiolingua/file_listing.csv'

    try:
        file_listing = pd.read_csv(listing_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # only a missing or empty file means "no listing yet"; any other
        # read failure must propagate so the existing data is not clobbered
        file_listing = pd.DataFrame(columns=['file_name', 'language', 'labels'])

    file_listing = pd.concat([file_listing, df])
    file_listing = file_listing.drop_duplicates(subset='file_name', keep='last')

    # to_csv does not create missing parent folders
    os.makedirs(os.path.dirname(listing_path), exist_ok=True)
    file_listing.to_csv(listing_path, index=False)

In [5]:
def count_samples():
    """
    Tallies the collected samples per language from the master listing.

    Args:
        None

    Returns:
        pd.Series indexed by language, holding the number of listed samples
        for each one
    """
    listing = pd.read_csv('../audio/1_audiolingua/file_listing.csv')
    languages = listing['language']
    return languages.value_counts()

The only languages with samples collected from audio-lingua were English and Mandarin: during the course of the project the website's certificate changed or expired, so the site was no longer entirely safe to use.

In [10]:
# listing page for the samples collected as 'en'
en_base = 'https://www.audio-lingua.eu/spip.php?rubrique2&lang=en'
# article offsets 0, 5, ..., 500 -- one request per offset
pages = list(range(0, 501, 5))

for p in pages:
    page_url = '{}&debut_articles={}#pagination_articles'.format(en_base, p)
    pagelist(page_url, 'en')

In [11]:
# listing page for the samples collected as 'zh'
zh_base = 'https://audio-lingua.eu/spip.php?rubrique9&lang=en'
# article offsets 0, 5, ..., 500 -- one request per offset
pages = list(range(0, 501, 5))

for p in pages:
    page_url = '{}&debut_articles={}#pagination_articles'.format(zh_base, p)
    pagelist(page_url, 'zh')