In [1]:
import IPython, numpy as np, scipy as sp, matplotlib.pyplot as plt, matplotlib, sklearn, librosa
from IPython.display import Audio
%matplotlib inline

import audio_utils
import vamp

import os

# Point the VAMP_PATH environment variable at a project-local directory.
# NOTE(review): presumably this is where the Vamp host looks for the Melodia
# plugin ("mtg-melodia:melodia") used by this notebook — confirm.
os.environ["VAMP_PATH"] = "./melodia_plugins/"

In [2]:
def melodic_frequency_filepath(filepath):
    """
    Deprecated.

    Loads the audio file at `filepath` as a mono signal, runs the
    "mtg-melodia:melodia" Vamp plugin on it and returns the second element
    of the plugin's 'vector' output (the per-frame melody values).
    """
    signal, sample_rate = librosa.load(filepath, mono=True)
    plugin_output = vamp.collect(signal, sample_rate, "mtg-melodia:melodia")
    _hop, melody = plugin_output['vector']
    return melody

def melodic_frequency(audio, sr):
    """
    Runs the "mtg-melodia:melodia" Vamp plugin on a signal to estimate the
    main, monophonic melody frequency at each analysis frame.

    Returns the second element of the plugin's 'vector' output, i.e. the
    sequence of per-frame melody values (the first element, the hop, is
    discarded).
    """
    hop_and_melody = vamp.collect(audio, sr, "mtg-melodia:melodia")['vector']
    _hop, melody = hop_and_melody
    return melody

In [3]:
# A clearer option is to get rid of the negative values before plotting
def plot_melody(melody):
    """
    Plots a melody contour (frequency in Hz against time in seconds).

    Frames whose value is <= 0 are replaced by NaN so that matplotlib draws
    them as gaps instead of negative/zero frequencies.

    melody: array-like of per-frame frequency values.

    The input is never modified: the masking is done on a private float
    copy. (Slicing a NumPy array with [:] only creates a view, so masking
    the slice would overwrite the caller's data.)
    """
    # Private float copy: accepts lists and integer arrays, and can hold NaN.
    melody_pos = np.array(melody, dtype=float)
    timestamps = 8 * 128/44100.0 + np.arange(len(melody_pos)) * (128/44100.0)
    melody_pos[melody_pos <= 0] = np.nan
    plt.figure(figsize=(18,6))
    plt.plot(timestamps, melody_pos)
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.show()

In [4]:
# Time step in seconds between consecutive melody frames as used throughout
# this notebook: 128 samples at 44100 Hz.
# NOTE(review): presumably Melodia's fixed hop size — confirm against the plugin docs.
VAMP_CONSTANT = (128/44100.0)

def timestamp_to_index(timestamp):
    """
    Used to index into output of melodic_frequency function.

    Converts a time in seconds into a frame index, assuming frame 0 sits at
    8 * VAMP_CONSTANT seconds and frames are VAMP_CONSTANT seconds apart.
    The fractional part is truncated.

    Timestamps earlier than the first frame are clamped to index 0; a
    negative index would otherwise be interpreted by Python as counting
    from the end of the melody.
    """
    index = (timestamp - (8 * VAMP_CONSTANT)) / VAMP_CONSTANT
    return max(int(index), 0)

def index_to_timestamp(idx):
    """
    Maps a melody frame index back to a time in seconds (the inverse of
    timestamp_to_index).

    The result is the start time of frame `idx`, so it is generally not the
    exact timestamp that produced the index: timestamp_to_index truncates
    with int().
    """
    first_frame_offset = 8 * VAMP_CONSTANT
    return (idx * VAMP_CONSTANT) + first_frame_offset

def find_interval_freq(start_time, end_time, melody):
    """
    Estimates the fundamental frequency played between two timestamps.

    Both timestamps are converted to melody frame indices with
    timestamp_to_index; the frames in [start, end) are then summarised by
    find_freq (the median of the voiced frames).
    """
    first, last = (timestamp_to_index(moment) for moment in (start_time, end_time))
    return find_freq(melody[first:last])

def find_freq(melody):
    """
    Returns the median of the voiced (strictly positive) frequencies in `melody`.

    melody: array-like of per-frame frequency values; values <= 0 are
    treated as unvoiced and ignored.

    Returns NaN when there is no voiced frame at all (empty input or only
    non-positive values). This is the value np.median would produce for an
    empty array, but returned explicitly so no RuntimeWarning is emitted.
    """
    frames = np.asarray(melody)
    voiced_frequencies = frames[frames > 0]
    if voiced_frequencies.size == 0:
        return np.nan
    return np.median(voiced_frequencies)

In [5]:
def plot_and_play(y, sr, label=''):
    """
    Prints `label`, displays an inline audio player for the signal `y` at
    sample rate `sr`, then plots the signal with audio_utils.plot_audio.
    """
    print(label)
    player = Audio(y, rate=sr)
    IPython.display.display(player)
    audio_utils.plot_audio(y, sr=sr)

In [6]:
from pydub import AudioSegment
import subprocess

# WAV file that tts_word loads after the script has run.
# NOTE(review): presumably written by the script below — confirm.
SPEECH_FILEPATH = 'temp/tts_dump.wav'
# Text file that tts_word writes the text to be spoken into.
TEXT_FILEPATH = 'temp/text.txt'
# Shell script that tts_word runs between writing the text and loading the WAV.
SCRIPT_FILEPATH = "./text_gen.sh"

def tts(text):
    """
    Deprecated.

    Splits `text` on single spaces and synthesizes every token separately
    with tts_word, returning the results in token order.
    """
    return list(map(tts_word, text.split(' ')))

def tts_word(text):
    """
    Synthesizes `text` to speech and returns it as a single AudioSegment.

    Steps:
      1. writes `text` to TEXT_FILEPATH,
      2. runs SCRIPT_FILEPATH through the shell,
      3. loads SPEECH_FILEPATH as a WAV file.

    NOTE(review): the script presumably reads TEXT_FILEPATH and writes
    SPEECH_FILEPATH — confirm against the script itself.

    Raises subprocess.CalledProcessError if the script exits with a non-zero
    status. Without this check a failed run would silently return whatever
    audio a previous run left in SPEECH_FILEPATH.
    """

    with open(TEXT_FILEPATH, 'w') as f:
        f.write(text)

    # check_call raises on failure instead of returning an ignored status code
    subprocess.check_call(SCRIPT_FILEPATH, shell=True)

    return AudioSegment.from_wav(SPEECH_FILEPATH)



In [7]:
from pydub.silence import split_on_silence

def get_syllable_signals(text, sr):
    """
    Synthesizes `text`, resamples the speech to `sr` and cuts it into chunks
    wherever pydub's split_on_silence detects silence.

    Returns a list of float NumPy arrays, one per chunk, holding the chunk's
    raw sample values.
    """
    silence_options = dict(min_silence_len=40, silence_thresh=-18, keep_silence=10)

    speech = tts_word(text).set_frame_rate(sr)
    chunks = split_on_silence(speech, **silence_options)

    signals = []
    for chunk in chunks:
        signals.append(np.array(chunk.get_array_of_samples(), dtype=float))
    return signals

In [22]:
def onset_intervals(y, y_melody, sr):
    """
    Detects note onsets in `y` and pairs each with the time its note ends.

    Returns a list with one (onset_start, onset_end) tuple per detected
    onset, in onset order; each tuple is computed by onset_interval.
    """
    onset_times = librosa.onset.onset_detect(y=y, sr=sr, units='time')

    # duration of the whole signal in seconds
    signal_end = len(y) / sr

    intervals = []
    for position, onset in enumerate(onset_times):
        intervals.append(onset_interval(position, onset, onset_times, signal_end, y_melody))
    return intervals

def onset_interval(i, onset_start, onset_times, last_timestamp, y_melody):
    """
    Helper for onset_intervals: finds when the note starting at onset `i` ends.

    i:              position of this onset within `onset_times`.
    onset_start:    time (s) of this onset.
    onset_times:    times (s) of all detected onsets.
    last_timestamp: time (s) at which the signal ends.
    y_melody:       per-frame melody values; values <= 0 mean no melody.

    Returns (onset_start, onset_end). The note ends at the first unvoiced
    melody frame found between this onset and the next one. If there is no
    such frame, it ends at the next onset, or at `last_timestamp` when this
    is the last onset.
    """
    is_last_onset = i + 1 >= len(onset_times)

    # converting to indices so we may index into y_melody (sourced from melodia);
    # both ends are clamped so the scan never leaves the melody array
    start_idx = max(timestamp_to_index(onset_start), 0)
    if is_last_onset:
        end_idx = len(y_melody)
    else:
        end_idx = timestamp_to_index(min(onset_times[i+1], last_timestamp))
        end_idx = min(end_idx, len(y_melody))

    # iterating over interval between this and the next onset
    for idx in range(start_idx, end_idx):
        # if no melody playing (zero or negative value), the note must have ended
        if y_melody[idx] <= 0.0:
            return onset_start, index_to_timestamp(idx)

    # means that there was no silence in the melody during the interval duration
    if is_last_onset:
        # there is no following onset to fall back on
        return onset_start, last_timestamp
    return onset_start, onset_times[i+1]
    
def onset_frequencies(y, sr):
    """
    Given a signal and sample rate, extracts the melody, splits it into note
    intervals at the detected onsets and estimates one frequency per interval.

    Returns a tuple of two parallel lists: (intervals, frequencies), where
    each interval is an (onset_start, onset_end) pair.
    """
    melody = melodic_frequency(y, sr)
    intervals = onset_intervals(y, melody, sr)

    frequencies = []
    for start, end in intervals:
        frequencies.append(find_interval_freq(start, end, melody))

    return intervals, frequencies

In [23]:
def pitch_difference(fn, f0):
    """
    Returns the distance from frequency `f0` to frequency `fn` in semitones
    (12 per octave), rounded to the nearest whole number. Positive when
    `fn` is higher than `f0`.
    """
    ratio = float(fn) / f0
    semitones = np.log2(ratio) * 12
    return np.round(semitones)

In [24]:
def autotune(signals, fn, sr):
    """
    Pitch-corrects each signal to the frequency at the same position in `fn`.

    Signals and frequencies are paired with zip, so the result has as many
    entries as the shorter of the two inputs.
    """
    shifted_signals = []
    for signal, target_frequency in zip(signals, fn):
        shifted_signals.append(pitch_correct(y=signal, fn=target_frequency, sr=sr))
    return shifted_signals

def pitch_correct(y, fn, sr, f0=150):
    """
    Given a signal and a frequency, autotunes the signal to the frequency.

    y:  signal to shift.
    fn: target frequency in Hz.
    sr: sample rate of `y`.
    f0: frequency in Hz the signal is assumed to have before shifting.
        Defaults to 150, the value previously hard-coded here.
        NOTE(review): presumably the base pitch of the TTS voice — confirm.

    Returns the signal shifted by the whole number of semitones between
    `f0` and `fn`. If `fn` is NaN, infinite or not positive (e.g. no voiced
    melody frame was found for the note), the signal is returned unchanged
    rather than passing a NaN step count on to librosa.
    """
    if not np.isfinite(fn) or fn <= 0:
        return y

    n_steps = pitch_difference(fn=fn, f0=f0)

    shifted_y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps)

    return shifted_y
    

In [25]:
def stretch_signals(signals, intervals, sr):
    """
    Time-stretches each signal to the duration of the interval at the same
    position in `intervals`. Pairs are formed with zip, so the result is as
    long as the shorter input.
    """
    stretched = []
    for signal, interval in zip(signals, intervals):
        stretched.append(stretch_signal(y=signal, interval=interval, sr=sr))
    return stretched

def stretch_signal(y, interval, sr):
    """
    Time-stretches `y` so that it lasts as long as `interval`.

    interval: (start, end) pair in seconds; the target duration is
    end - start. The stretch rate handed to librosa is the signal's current
    duration divided by that target duration.
    """
    start, end = interval[0], interval[1]
    current_duration = librosa.get_duration(y=y, sr=sr)
    target_duration = end - start
    return librosa.effects.time_stretch(y, rate=current_duration/target_duration)

In [26]:
def e2e(filename, text, sr):
    """
    End-to-end pipeline: makes synthesized speech follow a song's melody.

    1. loads the song from `filename` at sample rate `sr`,
    2. extracts its note intervals and per-note frequencies,
    3. synthesizes `text` and splits it into chunks,
    4. pitch-shifts and then time-stretches the chunks to match the notes,
    5. concatenates the chunks and overlays them on the song.

    Returns the mixed signal produced by overlay_signals.
    """
    song, sr = librosa.load(filename, sr=sr)
    intervals, frequencies = onset_frequencies(song, sr)

    voice_pieces = get_syllable_signals(text, sr)
    voice_pieces = autotune(voice_pieces, fn=frequencies, sr=sr)
    voice_pieces = stretch_signals(voice_pieces, intervals, sr=sr)

    return overlay_signals(song, np.concatenate(voice_pieces), sr)
    

In [27]:
def overlay_signals(song_signal, vocal_signal, sr):
    """
    Mixes a vocal signal on top of a song, starting at the song's first onset.

    The song is trimmed so that it begins at its first detected onset, both
    signals are cut to the length of the shorter one, each is normalized
    with librosa.util.normalize, and the two are summed.

    If no onset is detected in the song, it is used from its first sample
    instead of failing with an IndexError.
    """
    onset_samples = librosa.onset.onset_detect(y=song_signal, sr=sr, units='samples')
    # fall back to the very beginning when the detector finds nothing
    first_onset_sample = onset_samples[0] if len(onset_samples) > 0 else 0

    song_trimmed = song_signal[first_onset_sample:]

    min_length = min(len(song_trimmed), len(vocal_signal))

    both = librosa.util.normalize(song_trimmed[:min_length]) + librosa.util.normalize(vocal_signal[:min_length])

    return both

In [None]:
# Alternative demo input, currently disabled:
# sr = 44100
# text = 'happy birthday to you'
# y = e2e(filename='audio/hbd_snip.wav', text=text, sr=sr)
# plot_and_play(y, sr)

# Run the whole pipeline on the trumpet recording with a single word of text,
# then play and plot the resulting mix.
sr = 44100
text = 'Trumpet'
y = e2e(filename='audio/trumpet.wav', text=text, sr=sr)
plot_and_play(y, sr)