Skip to content

Commit

Permalink
Merge 469cc61 into cb7096e
Browse files Browse the repository at this point in the history
  • Loading branch information
auroracramer committed Jul 10, 2019
2 parents cb7096e + 469cc61 commit 79cf6f9
Show file tree
Hide file tree
Showing 23 changed files with 1,668 additions and 312 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ before_install:
- pip install python-coveralls
- pip install "pytest-faulthandler>=1.5.0,<2.0.0"
- pip install pytest-cov
# - pip install numpydoc
- sudo add-apt-repository -y ppa:mc3man/trusty-media
- sudo apt-get update
- sudo apt-get install -y ffmpeg
- ffmpeg -hwaccels

install:
- pip install "tensorflow>=1.12.0,<1.14.0"
Expand All @@ -31,4 +34,4 @@ script:
- kill %1

after_success:
- coveralls
- coveralls
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# OpenL3

OpenL3 is an open-source Python library for computing deep audio and (eventually) image embeddings.
OpenL3 is an open-source Python library for computing deep audio and image embeddings.

[![PyPI](https://img.shields.io/badge/python-2.7%2C%203.5%2C%203.6-blue.svg)](https://pypi.python.org/pypi/openl3)
[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://choosealicense.com/licenses/mit/)
Expand Down
4 changes: 3 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def getattr(cls, name):
MOCK_MODULES = [
'numpy', 'soundfile', 'resampy', 'keras', 'tensorflow',
'kapre', 'kapre.time_frequency', 'keras.layers', 'keras.models',
'keras.regularizers', 'sklearn', 'sklearn.decomposition'
'keras.regularizers', 'sklearn', 'sklearn.decomposition', 'skimage',
'moviepy', 'skimage.io', 'moviepy.video', 'moviepy.video.io',
'moviepy.video.io.VideoFileClip'
]

sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ OpenL3
tutorial


OpenL3 is an open-source Python library for computing deep audio and (eventually) image embeddings.
OpenL3 is an open-source Python library for computing deep audio and image embeddings.

The audio and image embedding models provided here are published as part of [1], and are based on the Look, Listen and Learn approach [2]. For details about the embedding models and how they were trained, please see:

Expand Down
3 changes: 2 additions & 1 deletion openl3/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .version import version as __version__
from .core import get_embedding, get_output_path, process_file
from .core import get_audio_embedding, get_image_embedding, get_output_path, \
process_audio_file, process_image_file, process_video_file
127 changes: 92 additions & 35 deletions openl3/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import os
import sys
import sklearn.decomposition
from openl3 import process_file
from openl3.models import load_embedding_model
from openl3 import process_audio_file, process_image_file, process_video_file
from openl3.models import load_audio_embedding_model, load_image_embedding_model
from openl3.openl3_exceptions import OpenL3Error
from argparse import ArgumentParser, RawDescriptionHelpFormatter, ArgumentTypeError
from collections import Iterable
Expand Down Expand Up @@ -41,13 +41,17 @@ def get_file_list(input_list):
return file_list


def run(inputs, output_dir=None, suffix=None, input_repr="mel256", content_type="music",
embedding_size=6144, center=True, hop_size=0.1, verbose=False):
def run(modality, inputs, output_dir=None, suffix=None,
input_repr="mel256", content_type="music",
audio_embedding_size=6144, audio_center=True, audio_hop_size=0.1,
image_embedding_size=8192, verbose=False):
"""
Computes and saves L3 embedding for given inputs.
Parameters
----------
    modality : str
        String to specify the modality to be processed: audio, image, or video
inputs : list of str, or str
File/directory path or list of file/directory paths to be processed
output_dir : str or None
Expand All @@ -60,15 +64,17 @@ def run(inputs, output_dir=None, suffix=None, input_repr="mel256", content_type=
Spectrogram representation used for model.
content_type : "music" or "env"
Type of content used to train embedding.
embedding_size : 6144 or 512
Embedding dimensionality.
center : boolean
audio_embedding_size : 6144 or 512
Audio embedding dimensionality.
audio_center : boolean
If True, pads beginning of signal so timestamps correspond
to center of window.
hop_size : float
audio_hop_size : float
Hop size in seconds.
    image_embedding_size : 8192 or 512
        Image embedding dimensionality.
    verbose : boolean
        If True, print verbose messages.
Returns
-------
Expand All @@ -82,23 +88,59 @@ def run(inputs, output_dir=None, suffix=None, input_repr="mel256", content_type=
raise OpenL3Error('Invalid input: {}'.format(str(inputs)))

if len(file_list) == 0:
print('openl3: No WAV files found in {}. Aborting.'.format(str(inputs)))
print('openl3: No files found in {}. Aborting.'.format(str(inputs)))
sys.exit(-1)

# Load model
model = load_embedding_model(input_repr, content_type, embedding_size)

# Process all files in the arguments
for filepath in file_list:
if verbose:
print('openl3: Processing: {}'.format(filepath))
process_file(filepath,
output_dir=output_dir,
suffix=suffix,
model=model,
center=center,
hop_size=hop_size,
verbose=verbose)
if modality == 'audio':
model = load_audio_embedding_model(input_repr, content_type,
audio_embedding_size)

# Process all files in the arguments
for filepath in file_list:
if verbose:
print('openl3: Processing: {}'.format(filepath))
process_audio_file(filepath,
output_dir=output_dir,
suffix=suffix,
model=model,
center=audio_center,
hop_size=audio_hop_size,
verbose=verbose)
elif modality == 'image':
model = load_image_embedding_model(input_repr, content_type,
image_embedding_size)

# Process all files in the arguments
for filepath in file_list:
if verbose:
print('openl3: Processing: {}'.format(filepath))
process_image_file(filepath,
output_dir=output_dir,
suffix=suffix,
model=model,
verbose=verbose)
elif modality == 'video':
audio_model = load_audio_embedding_model(input_repr, content_type,
audio_embedding_size)
image_model = load_image_embedding_model(input_repr, content_type,
image_embedding_size)

# Process all files in the arguments
for filepath in file_list:
if verbose:
print('openl3: Processing: {}'.format(filepath))
process_video_file(filepath,
output_dir=output_dir,
suffix=suffix,
audio_model=audio_model,
image_model=image_model,
audio_embedding_size=audio_embedding_size,
audio_center=audio_center,
audio_hop_size=audio_hop_size,
image_embedding_size=image_embedding_size,
verbose=verbose)

if verbose:
print('openl3: Done!')

Expand All @@ -107,6 +149,11 @@ def parse_args(args):
parser = ArgumentParser(sys.argv[0], description=main.__doc__,
formatter_class=RawDescriptionHelpFormatter)

parser.add_argument('modality',
choices=['audio', 'image', 'video'],
help='String to specify the modality to the '
'embedding model, audio, image, or video.')

parser.add_argument('inputs', nargs='+',
help='Path or paths to files to process, or path to '
'a directory of files to process.')
Expand All @@ -124,21 +171,29 @@ def parse_args(args):
parser.add_argument('--input-repr', '-i', default='mel256',
choices=['linear', 'mel128', 'mel256'],
help='String specifying the time-frequency input '
'representation for the embedding model.')
'representation for the audio embedding model.')

parser.add_argument('--content-type', '-c', default='music',
choices=['music', 'env'],
help='Content type used to train embedding model.')

parser.add_argument('--embedding-size', '-s', type=int, default=6144,
help='Embedding dimensionality.')
parser.add_argument('--audio-embedding-size', '-as', type=int, default=6144,
choices=[6144, 512],
help='Audio embedding dimensionality.')

parser.add_argument('--no-centering', '-n', action='store_true', default=False,
help='Do not pad signal; timestamps will correspond to '
parser.add_argument('--no-audio-centering', '-n', action='store_true',
default=False,
help='Used for audio embeddings. Do not pad signal; '
'timestamps will correspond to '
'the beginning of each analysis window.')

parser.add_argument('--hop-size', '-t', type=positive_float, default=0.1,
help='Hop size in seconds for processing audio files.')
parser.add_argument('--audio-hop-size', '-t', type=positive_float, default=0.1,
help='Used for audio embeddings. '
'Hop size in seconds for processing audio files.')

parser.add_argument('--image-embedding-size', '-is', type=int, default=8192,
choices=[8192, 512],
help='Image embedding dimensionality.')

parser.add_argument('--quiet', '-q', action='store_true', default=False,
help='Suppress all non-error messages to stdout.')
Expand All @@ -152,12 +207,14 @@ def main():
"""
args = parse_args(sys.argv[1:])

run(args.inputs,
run(args.modality,
args.inputs,
output_dir=args.output_dir,
suffix=args.suffix,
input_repr=args.input_repr,
content_type=args.content_type,
embedding_size=args.embedding_size,
center=not args.no_centering,
hop_size=args.hop_size,
audio_embedding_size=args.audio_embedding_size,
audio_center=not args.no_audio_centering,
audio_hop_size=args.audio_hop_size,
image_embedding_size=args.image_embedding_size,
verbose=not args.quiet)

0 comments on commit 79cf6f9

Please sign in to comment.