# PodcastMix subset

In this notebook I compile the dataset: I create the whole directory structure and the metadata for it. The dataset is built from a subset of the VCTK dataset (10 speakers, ~7,000 audio files) plus the ~1,700 most popular and featured songs from the Jamendo app.

This notebook downloads the already compiled subset of VCTK and does all the work of querying the Jamendo API to obtain the metadata and then downloading the ~1,700 mp3 files. Because of this, executing this notebook takes about 4 hours just to prepare the dataset. You are free to run all the cells of the notebook to compile the dataset on your own and to tweak the parameters as you prefer. However, if you want to save time, I have uploaded the compiled PodcastMix, and in the notebook called 'FinalProject_audio_process' I do all the training and evaluation tasks, assuming that the dataset already exists.

Feel free to download the compiled version directly (~XX GB) or to run this notebook to compile it yourself, and then go to the notebook mentioned above.

# Connect to Drive 
The data this notebook downloads is about 2 GB.

In [None]:
# Change this path to the folder in your Drive where PodcastMix should be created.
# All relative paths used by the following cells are resolved from here.
%cd /content/drive/MyDrive/Colab Notebooks/MIR/FinalProject

In [None]:
# Download the VCTK subset (collected by the author) as vctk-subset.zip.
# The inner wget fetches the download page and sed extracts its `confirm` token;
# the outer wget sends that token back, together with the saved cookies, to get
# the archive itself. The temporary cookie file is removed afterwards.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VWqB07HIE-r3xngMBgZq2E-d0upKAaVE' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1VWqB07HIE-r3xngMBgZq2E-d0upKAaVE" -O vctk-subset.zip && rm -rf /tmp/cookies.txt

In [None]:
# Download the speaker metadata of the subset and save it as speaker-info-subset.txt.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ldYpoW5HTdpFkgJmyddHHKVzkKYuqpsw' -O speaker-info-subset.txt

In [None]:
# Unzip the downloaded VCTK subset archive into the current directory.
!unzip vctk-subset.zip

In [None]:
# Install the Python `wget` package, used below to download the songs from Jamendo.
!pip install wget

In [6]:
import json
import os.path
import wget
import requests

In [7]:
# Directory that holds everything fetched from Jamendo.
%mkdir Jamendo

In [None]:
# Work inside Jamendo/ so the download functions below write metadata.json,
# errors.json and music/ relative to it.
%cd Jamendo

In [9]:
# Folder that receives the downloaded audio files (written as music/<id>.mp3).
%mkdir music

In [10]:
def download_metadata_from_jamendo(n_pages = 10):
    """
    Query the Jamendo API for the most popular & featured songs and save
    their metadata to 'metadata.json' in the current directory.

    Only songs whose 'audiodownload_allowed' flag is true are kept. The
    generated file maps each song id to the song's metadata as returned by
    the API and is used to download the files later. Files are requested in
    mp3 (audiodlformat=mp31), otherwise the dataset gets too big.

    n_pages: number of pages to retrieve. Each page requests up to 200 songs.

    Raises requests.HTTPError when the API answers with an HTTP error status.
    """
    # your Jamendo API token here. You can get your own from https://devportal.jamendo.com/admin
    client_id = '08bac555'
    page_size = 200

    json_keys = {}
    for page in range(n_pages):
        # Let requests build and escape the query string instead of
        # concatenating it by hand.
        query = {
            'client_id': client_id,
            'format': 'json',
            'boost': 'popularity_total',
            'audiodlformat': 'mp31',
            'featured': 'true',
            'limit': page_size,
            'offset': page * page_size,
            'include': 'licenses',
        }
        # The timeout keeps a stalled connection from hanging the notebook.
        r = requests.get('https://api.jamendo.com/v3.0/tracks/', params=query, timeout=60)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        r.raise_for_status()
        songs = json.loads(r.text)['results']
        if not songs:
            # An empty page means there is nothing further to fetch.
            break
        for song in songs:
            if song['audiodownload_allowed']:
                json_keys[song['id']] = song

    file_name = 'metadata.json'
    with open(file_name, 'w') as outfile:
        json.dump(json_keys, outfile)

In [11]:
def download_tracks_from_jamendo():
    """
    Download every song listed in 'metadata.json' into the 'music' folder.

    Each song is stored as 'music/<song id>.mp3'. Songs whose file already
    exists are skipped, so an interrupted run can simply be started again.
    The metadata of every song whose download failed is written to
    'errors.json' (an empty object when nothing failed).
    """
    with open('metadata.json') as file:
        json_file = json.load(file)

    errors = {}
    for song_id, song in json_file.items():
        target = 'music/' + song_id + '.mp3'
        url = song['audiodownload']
        if os.path.isfile(target):
            continue
        try:
            wget.download(url, out=target)
        except Exception as error:
            # Catch Exception rather than everything so that interrupting the
            # kernel still stops the loop; report why the download failed.
            print('download failed for', song_id, '->', repr(error))
            errors[song['id']] = song

    with open('errors.json', 'w') as outfile:
        json.dump(errors, outfile)


In [12]:
# Fetch 10 pages of song metadata and write them to metadata.json.
download_metadata_from_jamendo(n_pages = 10)

In [None]:
# Download the audio of every song listed in metadata.json into music/.
download_tracks_from_jamendo()

In [None]:
# Leave Jamendo/ and go back to its parent directory.
%cd ..

In [None]:
# torchaudio is used below to inspect, load, resample and save audio files.
!pip install torchaudio

In [3]:
from shutil import copyfile
from os import listdir
from os.path import isfile, join
import random
import numpy as np
import re
import sys
import json
import csv
import torchaudio

In [4]:
# Locations of the source speech and music material.
# Modify them if your files live elsewhere.

# Speech: folder with the audio files (presumably what unzipping
# vctk-subset.zip produces) and the speaker metadata text file.
speech_metadata_path = "speaker-info-subset.txt"
speech_path = "subset"

# Music: metadata json and folder with the downloaded songs.
music_metadata_path = "Jamendo/metadata.json"
music_path = "Jamendo/music"

In [None]:
# Create the root folder of the dataset.
# exist_ok=True makes re-running the cell a no-op without a separate
# "does it exist?" check.
os.makedirs('podcastmix', exist_ok=True)

def create_folder_structure(path):
    """
    Create `path` together with its 'music' and 'speech' subfolders.

    Folders that already exist are left untouched, so the function can be
    called repeatedly on the same path.

    path: root folder of one split (e.g. the train folder).
    """
    for folder in (path, path + '/music', path + '/speech'):
        # exist_ok avoids the check-then-create pattern.
        os.makedirs(folder, exist_ok=True)

# Root folder of each split; each one gets its music/ and speech/ subfolders.
train_path = "podcastmix/train"
val_path = "podcastmix/val"
test_path = "podcastmix/test"
for split_root in (train_path, val_path, test_path):
    create_folder_structure(split_root)

# Metadata folder, with one subfolder per split.
for metadata_dir in (
    'podcastmix/metadata',
    'podcastmix/metadata/train',
    'podcastmix/metadata/val',
    'podcastmix/metadata/test',
):
    if not os.path.exists(metadata_dir):
        os.makedirs(metadata_dir)



In [None]:
def create_csv_metadata(csv_path, headers):
    """
    Start a new csv file at `csv_path` containing only the header row.

    An existing file at that path is overwritten.

    csv_path: path of the csv file to create.
    headers: sequence with the column names.
    """
    with open(csv_path, 'w', newline='') as header_file:
        csv.writer(header_file).writerow(headers)

# Column names of the speech metadata csv files.
speech_headers = [
    "speech_ID", "speaker_id", "speaker_age", "speaker_gender",
    "speaker_accent", "speech_path", "length",
]

# Column names of the music metadata csv files.
music_headers = [
    "music_ID", "jamendo_id", "name", "artist_name", "album_name",
    "license_ccurl", "releasedate", "music_path", "length",
]

# Metadata csv files, grouped by kind: tr/va/te = train/val/test,
# s/m = speech/music.
csv_path_tr_s = 'podcastmix/metadata/train/speech.csv'
csv_path_va_s = 'podcastmix/metadata/val/speech.csv'
csv_path_te_s = 'podcastmix/metadata/test/speech.csv'

csv_path_tr_m = 'podcastmix/metadata/train/music.csv'
csv_path_va_m = 'podcastmix/metadata/val/music.csv'
csv_path_te_m = 'podcastmix/metadata/test/music.csv'

In [None]:
# Write the header row of the six metadata files (train, val, test; speech
# first, then music). Any existing content of those files is overwritten.
for speech_csv, music_csv in (
    (csv_path_tr_s, csv_path_tr_m),
    (csv_path_va_s, csv_path_va_m),
    (csv_path_te_s, csv_path_te_m),
):
    create_csv_metadata(speech_csv, speech_headers)
    create_csv_metadata(music_csv, music_headers)

In [None]:
# Fix the seed of Python's `random` module.
random.seed(1)

# Fraction of the files assigned to the train / val / test partitions.
train_prop, val_prop, test_prop = 0.8, 0.1, 0.1

In [None]:
# File counter used while distributing files over the partitions.
counter = 0

# One list per partition and kind, filled while the files are processed.
speech_train_set, speech_val_set, speech_test_set = [], [], []
music_train_set, music_val_set, music_test_set = [], [], []

In [None]:
# process music files
# Load the music metadata: a json object keyed by song id.
with open(music_metadata_path) as metadata_file:
    json_file = json.load(metadata_file)

# Ids of the songs that could not be processed.
errors = []

# Shuffle the song ids in place.
keys = list(json_file)
random.shuffle(keys)

In [None]:
# Read the json and copy every usable song to the directory of its partition.
# At the same time the music csv of that partition is being filled.
n_songs = len(keys)
music_train_end = int(train_prop * n_songs)
music_val_end = int((train_prop + val_prop) * n_songs)

# enumerate() restarts the position at 0 every time this cell runs, so
# re-running the cell cannot send every song to the test partition the way a
# counter left over from a previous run would.
for counter, song_id in enumerate(keys):
    song = json_file.get(song_id)
    try:
        source = music_path + '/' + song['id'] + '.mp3'
        audio_info = torchaudio.info(source)
        print(counter, '/', n_songs)
        if audio_info.num_channels != 2:
            # only stereo songs are kept
            errors.append(song_id)
            continue

        # The partition depends on the position of the song in the shuffled
        # list; skipped songs still occupy their position.
        if counter < music_train_end:
            split_path, split_set = train_path, music_train_set
            csv_path = 'podcastmix/metadata/train/music.csv'
        elif counter < music_val_end:
            split_path, split_set = val_path, music_val_set
            csv_path = 'podcastmix/metadata/val/music.csv'
        else:
            split_path, split_set = test_path, music_test_set
            csv_path = 'podcastmix/metadata/test/music.csv'

        destination = split_path + '/music/' + song['id'] + '.mp3'
        copyfile(source, destination)
        song['local_path'] = destination
        split_set.append(song)

        with open(csv_path, 'a', newline='') as file:
            writer = csv.writer(file)
            song_length = audio_info.num_frames
            # commas are stripped from the free-text fields, as before
            writer.writerow([
                song['id'],
                song['id'],
                song['name'].replace(',', ''),
                song['artist_name'].replace(',', ''),
                song['album_name'].replace(',', ''),
                song['license_ccurl'],
                song['releasedate'],
                song['local_path'],
                song_length,
            ])
    except Exception as error:
        # Catch Exception rather than everything so that interrupting the
        # kernel still stops the loop; report why the song was skipped
        # (e.g. its file is missing or cannot be read).
        print('skipping', song_id, '->', repr(error))
        errors.append(song_id)
print('errores',errors)
if keys:
    # guard: keys[0] raises IndexError when the metadata is empty
    print('keys',keys[0])

In [None]:
# NOTE(review): this installs the PyPI package named "ffmpeg". If the intent is
# to make the ffmpeg command-line tool / codecs available for reading and
# writing mp3 below, confirm that this package actually provides them.
!pip install ffmpeg

In [None]:
# process speech files
# Collect every file below speech_path. The list is built in one go (no
# repeated np.append, which copies the whole array each time) and sorted so
# the result does not depend on the order in which the filesystem lists files.
speech_files = np.array(sorted(
    os.path.join(path, name)
    for path, subdirs, files in os.walk(speech_path)
    for name in files
))

# Shuffle speech with a fixed seed (the same value the notebook uses for
# `random`) so that the train/val/test partition is reproducible.
np.random.RandomState(1).shuffle(speech_files)

# Read the speakers' metadata: whitespace-separated columns, one speaker per
# line, first line is the header.
speaker_params = {}
with open(speech_metadata_path, 'r') as s_m:
    next(s_m, None)  # skip headers
    for line in s_m:
        cols = re.split(r'\s+', line)
        if len(cols) < 4:
            # blank or incomplete line: nothing to record
            continue
        speaker_params[cols[0]] = {
            'speaker_id': cols[0],
            'speaker_age': cols[1],
            'speaker_gender': cols[2],
            'speaker_accent': cols[3],
        }

# Iterate over the speech files, resampling each one to 44.1 kHz.
# The new 44.1 kHz versions are then written to the respective directory
# inside the podcastmix. The speech csv of the podcastmix is also filled.
n_speech = len(speech_files)
speech_train_end = int(train_prop * n_speech)
speech_val_end = int((train_prop + val_prop) * n_speech)

for counter, speech_path_dir in enumerate(speech_files):
    print(counter, '/', n_speech)
    # The partition depends on the position of the file in the shuffled list.
    if counter < speech_train_end:
        split_path, split_set = train_path, speech_train_set
        csv_path = 'podcastmix/metadata/train/speech.csv'
    elif counter < speech_val_end:
        split_path, split_set = val_path, speech_val_set
        csv_path = 'podcastmix/metadata/val/speech.csv'
    else:
        split_path, split_set = test_path, speech_test_set
        csv_path = 'podcastmix/metadata/test/speech.csv'

    # <split>/speech/<file name up to its first dot>.mp3
    file_stem = speech_path_dir.split('/')[-1].split('.')[0]
    destination = split_path + '/speech/' + file_stem + '.mp3'

    # resample from the original rate (48kHz per the original note) -> 44.1kHz
    audio, original_sr = torchaudio.load(speech_path_dir, normalize=True)
    resampled_audio = torchaudio.transforms.Resample(original_sr, 44100)(audio)
    torchaudio.save(filepath=destination, src=resampled_audio, sample_rate=44100)
    split_set.append(destination)

    # The length is measured on the file that was just written.
    element_length = torchaudio.info(destination).num_frames
    # The file name is split on '_': the first part is the speaker id and the
    # next two parts form the speech id.
    speech_cmp = destination.split('/')[-1].split('_')
    params = speaker_params[speech_cmp[0]]
    with open(csv_path, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
            speech_cmp[1] + '_' + speech_cmp[2].split('.')[0],
            speech_cmp[0],
            params['speaker_age'],
            params['speaker_gender'],
            params['speaker_accent'].replace(',', ''),
            destination,
            element_length,
        ])



# Check dataset consistency

In [None]:
import os
from csv import reader

def check_files_against_csv(csv_path, files_path, index_of_path_in_csv = 7):
    """
    Compare the files listed in a metadata csv with the files in a directory
    and print the differences.

    csv_path: metadata csv; its first row is treated as the header.
    files_path: directory whose files are compared against the csv.
    index_of_path_in_csv: index of the column that holds each file's path.

    Prints every csv path that is not an existing file, then the file names
    found in the directory but not in the csv, then the file names found in
    the csv but not in the directory. Returns nothing.
    """
    listed = set()

    with open(csv_path, 'r', newline='') as read_obj:
        csv_reader = reader(read_obj)
        # next(..., None) lets an empty csv be handled as "no rows" instead of
        # raising StopIteration.
        header = next(csv_reader, None)
        if header is not None:
            # Iterate over each row after the header in the csv
            for row in csv_reader:
                path = row[index_of_path_in_csv]
                if os.path.isfile(path):
                    # Compare by file name, however deep the path is.
                    listed.add(os.path.basename(path))
                else:
                    print("im not file", path)

    on_disk = {
        name for name in os.listdir(files_path)
        if os.path.isfile(os.path.join(files_path, name))
    }
    print('Check diff between:', files_path, csv_path)
    # sorted() keeps the printed lists in a stable order
    print('List of files minus list in csv:', sorted(on_disk - listed))
    print('List in csv minus list of files:', sorted(listed - on_disk))

In [None]:
# check consistency of the dataset:
# one entry per partition and kind:
# (directory with the files, metadata csv, index of the path column)
consistency_checks = (
    ('podcastmix/test/music', 'podcastmix/metadata/test/music.csv', 7),
    ('podcastmix/val/music', 'podcastmix/metadata/val/music.csv', 7),
    ('podcastmix/train/music', 'podcastmix/metadata/train/music.csv', 7),
    ('podcastmix/train/speech', 'podcastmix/metadata/train/speech.csv', 5),
    ('podcastmix/val/speech', 'podcastmix/metadata/val/speech.csv', 5),
    ('podcastmix/test/speech', 'podcastmix/metadata/test/speech.csv', 5),
)
for files_path, csv_path, path_column in consistency_checks:
    check_files_against_csv(csv_path, files_path, path_column)

# Dataset ready!
Now go to the FinalProject_audio_processing notebook to train and evaluate the models over this dataset.
