Skip to content

Commit

Permalink
Merge pull request #58 from marl/drive-access
Browse files Browse the repository at this point in the history
google drive api
  • Loading branch information
rabitt committed Jan 12, 2017
2 parents 29f4383 + 226d7ad commit 2b6c577
Show file tree
Hide file tree
Showing 9 changed files with 673 additions and 2 deletions.
16 changes: 16 additions & 0 deletions medleydb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@
for line in fhandle.readlines():
TRACK_LIST_V1.append(line.strip('\n'))

# Track ids listed in resources/tracklist_v2.txt, one per line, with the
# trailing newline removed.
with open(path.join(path.dirname(__file__), 'resources',
                    'tracklist_v2.txt'), 'r') as fhandle:
    TRACK_LIST_V2 = [line.strip('\n') for line in fhandle.readlines()]

# Track ids listed in resources/tracklist_extra.txt, one per line, with the
# trailing newline removed.
with open(path.join(path.dirname(__file__), 'resources',
                    'tracklist_extra.txt'), 'r') as fhandle:
    TRACK_LIST_EXTRA = [line.strip('\n') for line in fhandle.readlines()]

# Instrument taxonomy read from resources/taxonomy.yaml.
# yaml.safe_load is used instead of yaml.load: calling yaml.load without an
# explicit Loader is deprecated and fails on recent PyYAML releases.
# NOTE(review): assumes taxonomy.yaml contains only plain YAML data (no
# custom Python tags) -- confirm against the resource file.
with open(path.join(path.dirname(__file__), 'resources',
                    'taxonomy.yaml'), 'r') as fhandle:
    INST_TAXONOMY = yaml.safe_load(fhandle)
Expand All @@ -68,6 +80,10 @@
'artist_index.json'), 'r') as fhandle:
ARTIST_INDEX = json.load(fhandle)

# Path to the client_secrets.json file stored in this package's resources
# directory.
# NOTE(review): "GRDIVE" looks like a typo for "GDRIVE"; the spelling is
# left untouched here because other modules may import this name as-is.
GRDIVE_CONFIG_PATH = path.join(
path.dirname(__file__), 'resources', 'client_secrets.json'
)

# Audio is downloaded separately and is not version controlled :'(.
# This is the motivation for requesting the user to set the MEDLEYDB_PATH
if AUDIO_AVAILABLE:
Expand Down
319 changes: 319 additions & 0 deletions medleydb/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Methods for downloading audio from google drive."""
from medleydb import MEDLEYDB_PATH
from medleydb import AUDIO_PATH
from medleydb import GRDIVE_CONFIG_PATH

import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# pydrive handles; both start as None and are assigned by
# authorize_google_drive() the first time it runs.
GAUTH = None
DRIVE = None

# MIME type that is_folder() compares against a file object's 'mimeType'.
FOLDER_MIME = 'application/vnd.google-apps.folder'
# Flipped to True by check_basedir_writeable() once it has completed.
BASEDIR_WRITEABLE = False
# Local paths appended by the download_* functions after each download;
# purge_downloaded_files() removes the files listed here.
DOWNLOADED_FILEPATHS = []

# Google Drive folder ids, looked up by a multitrack's dataset_version.
GDRIVE_FOLDERS = {
'V1': '0B72xIeDqCfuUdFhhWUJOb0l2eDg',
'V2': '0B72xIeDqCfuURlo2M3U4eXhiRmM',
'EXTRA': '0B72xIeDqCfuULUkySDVQUXhIWGs'
}


def authorize_google_drive():
    """Make sure the module-level GAUTH and DRIVE handles are set.

    When either handle is still None, a new GoogleAuth object is created,
    the client configuration file is loaded, the local-webserver
    authentication is run and a GoogleDrive object is built from it.
    Otherwise nothing is done.

    Returns
    -------
    status : bool
        Always True
    """
    global GAUTH
    global DRIVE
    handles_ready = GAUTH is not None and DRIVE is not None
    if not handles_ready:
        GAUTH = GoogleAuth()
        GAUTH.LoadClientConfigFile(client_config_file=GRDIVE_CONFIG_PATH)
        # Creates local webserver and auto handles authentication.
        GAUTH.LocalWebserverAuth()
        DRIVE = GoogleDrive(GAUTH)
    return True


def purge_downloaded_files():
    """Delete all files downloaded this session.

    Every path recorded in DOWNLOADED_FILEPATHS is removed from disk and
    the record is then emptied, so calling this function repeatedly (or
    after re-downloading a purged file) is safe.

    Raises
    ------
    OSError
        If a recorded file still exists but could not be removed.
    """
    for fpath in DOWNLOADED_FILEPATHS:
        try:
            os.remove(fpath)
        except OSError:
            # A file that is already gone is not an error; anything else
            # (e.g. a permissions problem) is re-raised.
            if os.path.exists(fpath):
                raise
    # Clear in place so the module-level list object keeps its identity.
    del DOWNLOADED_FILEPATHS[:]


def check_basedir_writeable():
    """Make sure the MEDLEYDB_PATH and AUDIO_PATH directories exist.

    Missing directories are created. On success the module-level flag
    BASEDIR_WRITEABLE is set to True.

    Returns
    -------
    status : bool
        True on success

    Raises
    ------
    EnvironmentError
        If MEDLEYDB_PATH is not set, or if it does not exist and cannot be
        created.
    """
    global BASEDIR_WRITEABLE

    if MEDLEYDB_PATH is None:
        raise EnvironmentError(
            "The environment variable MEDLEYDB_PATH must be set "
            "to use the download module."
        )

    if not os.path.exists(MEDLEYDB_PATH):
        try:
            os.mkdir(MEDLEYDB_PATH)
        except OSError:
            # Only filesystem failures are translated; a bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            raise EnvironmentError(
                "The value set for the MEDLEYDB_PATH does not exist and "
                "cannot be created."
            )

    if not os.path.exists(AUDIO_PATH):
        os.mkdir(AUDIO_PATH)

    BASEDIR_WRITEABLE = True
    return True


def make_mtrack_basedir(mtrack):
    """Create the local audio directories of a multitrack when missing.

    The base directories are verified first (via check_basedir_writeable)
    unless that has already succeeded. Then the multitrack's audio
    directory, stem directory and raw directory are created, in that
    order, skipping any that already exist.

    Parameters
    ----------
    mtrack : MultiTrack
        A multitrack object

    Returns
    -------
    status : bool
        True on success
    """
    if not BASEDIR_WRITEABLE:
        check_basedir_writeable()

    # Order matters: the audio directory is created before the stem and
    # raw directories.
    for attr_name in ('audio_path', '_stem_dir_path', '_raw_dir_path'):
        directory = getattr(mtrack, attr_name)
        if os.path.exists(directory):
            continue
        os.mkdir(directory)

    return True


def download_mix(mtrack):
    """Fetch a multitrack's mix from Google Drive into ``mtrack.mix_path``.

    Nothing is downloaded when a file already exists at that path. After a
    download, the path is recorded in DOWNLOADED_FILEPATHS.

    Parameters
    ----------
    mtrack : MultiTrack
        A multitrack object

    Returns
    -------
    status : bool
        True on success

    Raises
    ------
    IOError
        If there is no Drive folder for the multitrack's dataset version,
        or the multitrack folder or its mix cannot be found.
    """
    if os.path.exists(mtrack.mix_path):
        return True

    try:
        version_root_id = GDRIVE_FOLDERS[mtrack.dataset_version]
    except KeyError:
        raise IOError("Unable to find data in Google Drive for this version.")

    # The Drive query is a "contains" match, so keep only entries whose
    # title is exactly the track id.
    candidates = get_named_child(version_root_id, mtrack.title)
    exact_matches = [
        entry for entry in candidates if entry['title'] == mtrack.track_id
    ]
    if not exact_matches:
        raise IOError("Could not find multitrack")
    track_folder = exact_matches[0]

    mix_matches = get_named_child(track_folder['id'], 'MIX')
    if not mix_matches:
        raise IOError("Could not find Mix")
    remote_mix = mix_matches[0]

    make_mtrack_basedir(mtrack)
    download_file(remote_mix['id'], mtrack.mix_path)

    DOWNLOADED_FILEPATHS.append(mtrack.mix_path)

    return True


def download_stem(mtrack, stemid):
    """Fetch one stem of a multitrack from Google Drive.

    The file is saved to the stem's ``audio_path``. Nothing is downloaded
    when a file already exists there. After a download, the path is
    recorded in DOWNLOADED_FILEPATHS.

    Parameters
    ----------
    mtrack : MultiTrack
        A multitrack object
    stemid : int
        The stem id to download

    Returns
    -------
    status : bool
        True on success

    Raises
    ------
    IOError
        If there is no Drive folder for the multitrack's dataset version,
        or the multitrack folder, its stems folder or the stem file cannot
        be found.
    """
    stem = mtrack.stems[stemid]

    if os.path.exists(stem.audio_path):
        return True

    try:
        version_root_id = GDRIVE_FOLDERS[mtrack.dataset_version]
    except KeyError:
        raise IOError("Unable to find data in Google Drive for this version.")

    # The Drive query is a "contains" match, so keep only entries whose
    # title is exactly the track id.
    candidates = get_named_child(version_root_id, mtrack.title)
    exact_matches = [
        entry for entry in candidates if entry['title'] == mtrack.track_id
    ]
    if not exact_matches:
        raise IOError("Could not find multitrack")
    track_folder = exact_matches[0]

    stems_folder_matches = get_named_child(track_folder['id'], 'STEMS')
    if not stems_folder_matches:
        raise IOError("Could not find stems folder")
    stems_folder = stems_folder_matches[0]

    stem_filename = os.path.basename(stem.audio_path)
    stem_file_matches = get_named_child(stems_folder['id'], stem_filename)
    if not stem_file_matches:
        raise IOError("Could not find stem file")
    remote_stem = stem_file_matches[0]

    make_mtrack_basedir(mtrack)
    download_file(remote_stem['id'], stem.audio_path)

    DOWNLOADED_FILEPATHS.append(stem.audio_path)

    return True


def download_raw(mtrack, stemid, rawid):
    """Fetch one raw track of a multitrack from Google Drive.

    The file is saved to the raw track's ``audio_path``. Nothing is
    downloaded when a file already exists there. After a download, the
    path is recorded in DOWNLOADED_FILEPATHS.

    Parameters
    ----------
    mtrack : MultiTrack
        A multitrack object
    stemid : int
        The raw track's stem id
    rawid : int
        The raw track's id

    Returns
    -------
    status : bool
        True on success

    Raises
    ------
    IOError
        If there is no Drive folder for the multitrack's dataset version,
        or the multitrack folder, its raw folder or the raw file cannot
        be found.
    """
    raw_track = mtrack.raw_audio[stemid][rawid]

    if os.path.exists(raw_track.audio_path):
        return True

    try:
        version_root_id = GDRIVE_FOLDERS[mtrack.dataset_version]
    except KeyError:
        raise IOError("Unable to find data in Google Drive for this version.")

    # The Drive query is a "contains" match, so keep only entries whose
    # title is exactly the track id.
    candidates = get_named_child(version_root_id, mtrack.title)
    exact_matches = [
        entry for entry in candidates if entry['title'] == mtrack.track_id
    ]
    if not exact_matches:
        raise IOError("Could not find multitrack")
    track_folder = exact_matches[0]

    raw_folder_matches = get_named_child(track_folder['id'], 'RAW')
    if not raw_folder_matches:
        raise IOError("Could not find raws folder")
    raw_folder = raw_folder_matches[0]

    raw_filename = os.path.basename(raw_track.audio_path)
    raw_file_matches = get_named_child(raw_folder['id'], raw_filename)
    if not raw_file_matches:
        raise IOError("Could not find raw file")
    remote_raw = raw_file_matches[0]

    make_mtrack_basedir(mtrack)
    download_file(remote_raw['id'], raw_track.audio_path)

    DOWNLOADED_FILEPATHS.append(raw_track.audio_path)

    return True


def get_named_child(parent_id, child_name):
    """List the children of a Drive folder whose title contains a name.

    The search uses the Drive ``title contains`` operator, so the result
    may include files whose title is not exactly ``child_name``; callers
    that need an exact match must filter the result. Trashed files are
    excluded.

    Parameters
    ----------
    parent_id : str
        Google drive id of parent folder
    child_name : str
        File name of the child to find.

    Returns
    -------
    file_list : list
        List of files matching the query.
    """
    authorize_google_drive()
    # Backslashes and single quotes must be escaped inside a quoted value
    # of a Drive query; otherwise a title such as "Don't" yields a
    # malformed query. Backslashes are escaped first so the escapes added
    # for quotes are not doubled.
    escaped_name = child_name.replace('\\', '\\\\').replace("'", "\\'")
    query = "'{}' in parents and title contains '{}' and trashed=false".format(
        parent_id, escaped_name
    )
    file_list = DRIVE.ListFile(
        {'q': query}
    ).GetList()
    return file_list


def get_files_in_folder(folderid):
    """List the non-trashed files whose parent is the given Drive folder.

    Parameters
    ----------
    folderid : str
        Google drive id of the folder

    Returns
    -------
    file_list : list
        List of files matching the query.
    """
    authorize_google_drive()
    query = "'{}' in parents and trashed=false".format(folderid)
    listing = DRIVE.ListFile({'q': query})
    return listing.GetList()


def is_folder(file_object):
    """Tell whether a google drive file object is a folder.

    Returns
    -------
    status : bool
        True when the object's 'mimeType' equals FOLDER_MIME
    """
    mime_type = file_object['mimeType']
    return mime_type == FOLDER_MIME


def download_file(fileid, save_path):
    """Save the content of a google drive file to a local path.

    Parameters
    ----------
    fileid : str
        Google drive id of the file to fetch
    save_path : str
        Local path the content is written to

    Returns
    -------
    status : bool
        True on success
    """
    authorize_google_drive()
    remote_file = DRIVE.CreateFile({'id': fileid})
    remote_file.GetContentFile(save_path)
    return True

16 changes: 16 additions & 0 deletions medleydb/multitrack.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
from . import ANNOT_PATH
from . import METADATA_PATH
from . import AUDIO_PATH
from . import TRACK_LIST_V1
from . import TRACK_LIST_V2
from . import TRACK_LIST_EXTRA

_YESNO = dict(yes=True, no=False)
_TRACKID_FMT = "%s_%s"
Expand Down Expand Up @@ -114,6 +117,10 @@ class MultiTrack(object):
True if multitrack has at least one melody stem
predominant_stem : Track or None
Track object for the predominant stem if availalbe, otherwise None
dataset_version : string
Iteration a multitrack came from.
(E.g. "V1" for MedleyDB dataset_version 1,
"V2" for MedleyDB dataset_version 2)
_stem_activations : np.array
Matrix of stem activations
_stem_activations_idx : dictionary
Expand Down Expand Up @@ -153,6 +160,15 @@ def __init__(self, track_id):
self.title = track_id.split('_')[1]
self.track_id = track_id

if track_id in TRACK_LIST_V1:
self.dataset_version = 'V1'
elif track_id in TRACK_LIST_V2:
self.dataset_version = 'V2'
elif track_id in TRACK_LIST_EXTRA:
self.dataset_version = 'EXTRA'
else:
self.dataset_version = ''

# Filenames and Filepaths #
self._meta_path = os.path.join(
METADATA_PATH, _METADATA_FMT % self.track_id
Expand Down
1 change: 1 addition & 0 deletions medleydb/resources/client_secrets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"web":{"client_id":"378301188007-c65lbvu39daldp0nfe5rpl20undan4ii.apps.googleusercontent.com","project_id":"civic-depth-154600","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://accounts.google.com/o/oauth2/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"18aPs1V-8ylgBTT4ph0McCeO","redirect_uris":["http://localhost:8080/"],"javascript_origins":["http://localhost:8080"]}}
Loading

0 comments on commit 2b6c577

Please sign in to comment.