In [None]:
import IPython, numpy as np, scipy as sp, matplotlib.pyplot as plt, matplotlib, sklearn, librosa
from IPython.display import Audio
%matplotlib inline

import audio_utils
import vamp

import os

# Set the VAMP_PATH environment variable to a local directory
# (presumably where the Melodia Vamp plugin lives -- confirm against the repo layout).
os.environ["VAMP_PATH"] = "./melodia_plugins/"

In [None]:
def melodic_frequency_filepath(filepath):
    """Load an audio file as mono and return its Melodia melody vector.

    Parameters
    ----------
    filepath : str
        Path handed to ``librosa.load`` (loaded with ``mono=True`` and
        librosa's default sample rate).

    Returns
    -------
    The second element of the ``'vector'`` entry produced by
    ``vamp.collect`` for the ``"mtg-melodia:melodia"`` plugin.
    """
    samples, sample_rate = librosa.load(filepath, mono=True)
    result = vamp.collect(samples, sample_rate, "mtg-melodia:melodia")
    # 'vector' unpacks into a (hop, values) pair; only the values are returned.
    _, contour = result['vector']
    return contour

def melodic_frequency(audio, sr):
    """Run the Melodia Vamp plugin on an in-memory signal.

    Parameters
    ----------
    audio :
        Audio samples passed straight to ``vamp.collect``.
    sr :
        Sample rate of ``audio``.

    Returns
    -------
    The values half of the plugin's ``'vector'`` output (the hop half is
    discarded).
    """
    result = vamp.collect(audio, sr, "mtg-melodia:melodia")
    _, contour = result['vector']
    return contour

In [None]:
# A clearer option is to get rid of the negative values before plotting
def plot_melody(melody):
    """Plot a melody contour over time, leaving gaps at non-positive frames.

    Parameters
    ----------
    melody : array-like of numbers
        Per-frame frequency values. Frames whose value is <= 0 are replaced
        by NaN in a private copy so matplotlib draws a gap there.

    Notes
    -----
    The input is never modified. A plain ``melody[:]`` on a NumPy array is a
    view, so masking it would overwrite the caller's data; an explicit float
    copy is taken instead.

    The time axis is hard-coded to a hop of 128 samples at 44100 Hz with the
    first frame placed 8 hops in.
    """
    # np.array(...) copies by default, so the caller's contour stays intact.
    contour = np.array(melody, dtype=float)
    timestamps = 8 * 128/44100.0 + np.arange(len(contour)) * (128/44100.0)
    contour[contour <= 0] = np.nan
    plt.figure(figsize=(18,6))
    plt.plot(timestamps, contour)
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.show()

In [None]:
# Duration of one melody frame in seconds: 128 samples at 44100 Hz.
VAMP_CONSTANT = (128/44100.0)

def timestamp_to_index(timestamp):
    """Convert a time in seconds to an index into the melody vector.

    Frame ``k`` is taken to sit at ``8 * VAMP_CONSTANT + k * VAMP_CONSTANT``
    seconds; this function inverts that mapping and truncates to an integer.

    Timestamps earlier than the first frame are clamped to index 0. Without
    the clamp they produce a negative index (``timestamp=0`` gives -8), which
    wraps around to the end of the array when used as a slice bound.

    Parameters
    ----------
    timestamp : float
        Time in seconds.

    Returns
    -------
    int
        Non-negative frame index.
    """
    index = (timestamp - (8 * VAMP_CONSTANT)) / VAMP_CONSTANT
    return max(0, int(index))

def find_interval_freq(start_time, end_time, melody):
    """Return the representative frequency of ``melody`` within a time span.

    Both times are converted to frame indices with ``timestamp_to_index``;
    the frames in ``[start, end)`` are then summarised by ``find_freq``.
    """
    first = timestamp_to_index(start_time)
    last = timestamp_to_index(end_time)
    return find_freq(melody[first:last])

def find_freq(melody):
    """Return the median of the strictly positive values in ``melody``.

    Parameters
    ----------
    melody : array-like of numbers
        Frequency values; entries <= 0 are ignored.

    Returns
    -------
    float
        Median of the positive entries, or NaN when there are none. The
        empty case is handled explicitly so it does not rely on the
        RuntimeWarning path of ``np.median`` on an empty array.
    """
    frames = np.asarray(melody)
    # Boolean mask instead of a Python-level loop over the array.
    voiced_frequencies = frames[frames > 0]
    if voiced_frequencies.size == 0:
        return np.nan
    return np.median(voiced_frequencies)

In [None]:
def plot_and_play(y, sr, label=''):
    """Print a label, show an audio player for ``y`` and plot the signal.

    Parameters
    ----------
    y :
        Audio samples.
    sr :
        Sample rate used for both playback and plotting.
    label : str, optional
        Text printed before the player (an empty line by default).
    """
    print(label)
    player = Audio(y, rate=sr)
    IPython.display.display(player)
    audio_utils.plot_audio(y, sr=sr)

In [None]:
from pydub import AudioSegment
import subprocess

# WAV file that tts_word reads back after running the script below.
SPEECH_FILEPATH = 'temp/tts_dump.wav'
# Text file that tts_word writes its input text to before running the script.
TEXT_FILEPATH = 'temp/text.txt'
# Shell script that tts_word runs between writing the text and reading the WAV.
SCRIPT_FILEPATH = "./text_gen.sh"

def tts(text):
    """
        returns list of AudioSegments corresponding to each word in text

        The text is split on any run of whitespace, so repeated spaces, tabs
        or leading/trailing blanks never produce empty tokens (an empty token
        would otherwise be sent to the speech script as an empty string).
    """
    return [tts_word(token) for token in text.split()]

def tts_word(text):
    """Synthesise ``text`` via the external script and return the audio.

    The text is written to ``TEXT_FILEPATH``, ``SCRIPT_FILEPATH`` is run
    through the shell, and ``SPEECH_FILEPATH`` is loaded as a WAV file
    (presumably the script reads the former and writes the latter -- the
    script itself is not visible here).

    Parameters
    ----------
    text : str
        Text to synthesise.

    Returns
    -------
    pydub.AudioSegment
        Audio loaded from ``SPEECH_FILEPATH``.

    Raises
    ------
    subprocess.CalledProcessError
        If the script exits with a non-zero status. Ignoring the status would
        silently return whatever stale audio is left in ``SPEECH_FILEPATH``.
    """
    with open(TEXT_FILEPATH, 'w') as f:
        f.write(text)

    # The command is a fixed module constant, not caller input, so shell=True
    # is kept exactly as before; only the exit status is now checked.
    subprocess.check_call(SCRIPT_FILEPATH, shell=True)

    return AudioSegment.from_wav(SPEECH_FILEPATH)

In [None]:
from pydub.silence import split_on_silence

def get_syllable_signals(text, sr, min_silence_len=40, silence_thresh=-18, keep_silence=10):
    """Synthesise ``text`` and cut the speech into chunks at silent gaps.

    Parameters
    ----------
    text : str
        Text to synthesise in a single ``tts_word`` call.
    sr : int
        Frame rate the synthesised audio is converted to before splitting.
    min_silence_len, silence_thresh, keep_silence : optional
        Forwarded unchanged to ``pydub.silence.split_on_silence``. The
        defaults are the values that used to be hard-coded here, so existing
        calls behave as before.

    Returns
    -------
    list of numpy.ndarray
        One float array per chunk, built from the chunk's raw sample values
        (no rescaling is applied).

    Notes
    -----
    Chunks are found purely by silence detection; whether each chunk is a
    syllable depends on how the speech script paces its output.
    """
    speech = tts_word(text).set_frame_rate(sr)

    audio_chunks = split_on_silence(speech,
                                    min_silence_len=min_silence_len,
                                    silence_thresh=silence_thresh,
                                    keep_silence=keep_silence)

    return [np.array(chunk.get_array_of_samples(), dtype=float) for chunk in audio_chunks]

In [None]:
def onset_intervals(y, sr):
    """Detect onsets in ``y`` and return the time span following each one.

    An onset-strength envelope (median-aggregated) is computed and handed to
    ``librosa.onset.onset_detect`` with fixed peak-picking parameters. Each
    onset opens an interval that ends at the next onset; the last interval
    ends at the end of the signal.

    As a side effect the envelope is plotted with a vertical line at every
    detected onset, and the shape of the onset array is printed. The lines
    mark the onsets actually returned by ``onset_detect``. They used to come
    from a separate ``peak_pick`` call on the raw envelope, which can
    disagree with ``onset_detect`` because the latter may normalise the
    envelope before picking peaks.

    Parameters
    ----------
    y : numpy.ndarray
        Audio samples.
    sr : int
        Sample rate of ``y``.

    Returns
    -------
    list of tuple
        ``(start_time, end_time)`` pairs in seconds, one per detected onset.
    """
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median)
    times = librosa.frames_to_time(np.arange(len(onset_env)), sr=sr)
    onset_times = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time',
                                             pre_max=50, post_max=50, pre_avg=100, post_avg=100,
                                             delta=0.03, wait=60)

    plt.figure()
    plt.plot(times, onset_env, alpha=0.8, label='Onset strength')
    plt.vlines(onset_times, 0, onset_env.max(), color='r', alpha=0.8, label='Selected peaks')
    plt.legend(frameon=True, framealpha=0.8)
    plt.axis('tight')
    plt.tight_layout()
    print(onset_times.shape)

    final_time = len(y)/ sr

    intervals = [onset_interval(i, onset, onset_times, final_time) for i, onset in enumerate(onset_times)]

    return intervals

def onset_interval(i, onset_start, onset_times, last_timestamp):
    """Build the ``(start, end)`` pair for the onset at position ``i``.

    The end is the onset that follows position ``i`` in ``onset_times``, or
    ``last_timestamp`` when ``i`` is the final position.
    """
    following = i + 1
    has_next = following < len(onset_times)
    onset_end = onset_times[following] if has_next else last_timestamp
    return (onset_start, onset_end)
    
def onset_frequencies(y, sr):
    """Return the onset intervals of ``y`` and one frequency per interval.

    Intervals come from ``onset_intervals``; the melody contour comes from
    ``melodic_frequency``; each interval is summarised with
    ``find_interval_freq``.

    Returns
    -------
    tuple
        ``(intervals, frequencies)``, two lists of equal length.
    """
    intervals = onset_intervals(y, sr)
    contour = melodic_frequency(y, sr)
    y_freqs = []
    for start, end in intervals:
        y_freqs.append(find_interval_freq(start, end, contour))
    return intervals, y_freqs

In [None]:
def pitch_difference(fn, f0):
    """Return the rounded number of semitones from ``f0`` up to ``fn``.

    Computed as ``12 * log2(fn / f0)`` rounded with ``np.round``; the result
    is negative when ``fn`` is below ``f0``.
    """
    ratio = float(fn) / f0
    semitones = np.log2(ratio) * 12
    return np.round(semitones)

In [None]:
def read_voice_frequency(filepath='espeak-data/voices/robot'):
    """Read the first number on the ``pitch`` line of a voice file.

    The file is scanned line by line; the first line whose first
    whitespace-separated field is ``pitch`` supplies the value. Splitting on
    any whitespace means repeated spaces, tabs or leading indentation no
    longer break the lookup.

    Parameters
    ----------
    filepath : str, optional
        Voice file to read. Defaults to the path that used to be hard-coded.

    Returns
    -------
    int
        The second field of the ``pitch`` line.

    Raises
    ------
    IndexError
        If the file has no ``pitch`` line, or that line has no value. The
        exception type matches what the previous list-indexing code raised.
    ValueError
        If the value is not an integer.
    """
    with open(filepath) as f:
        for raw_line in f:
            fields = raw_line.split()
            if fields and fields[0] == 'pitch':
                return int(fields[1])
    raise IndexError("no 'pitch' line found in %r" % (filepath,))

In [None]:
def autotune(signals, fn, sr):
    """Pitch-correct each signal towards its paired target frequency.

    ``signals`` and ``fn`` are walked in lockstep with ``zip`` (so the
    shorter of the two decides how many signals are processed) and each pair
    is passed to ``pitch_correct``.

    Returns
    -------
    list
        The pitch-corrected signals, in input order.
    """
    shifted_signals = []
    for signal, target in zip(signals, fn):
        shifted_signals.append(pitch_correct(y=signal, fn=target, sr=sr))
    return shifted_signals

def pitch_correct(y, fn, sr):
    """Shift ``y`` by the whole number of semitones that moves the voice's
    base pitch onto ``fn``.

    The base pitch is read with ``read_voice_frequency`` and the semitone
    count comes from ``pitch_difference``.

    Parameters
    ----------
    y : numpy.ndarray
        Audio samples to shift.
    fn : float
        Target frequency.
    sr : int
        Sample rate of ``y``.

    Returns
    -------
    numpy.ndarray
        The pitch-shifted signal. When ``fn`` is NaN, infinite or not
        positive, a warning is issued and an unshifted copy of ``y`` is
        returned, because such a value yields a NaN or infinite semitone
        count that cannot be applied.
    """
    import warnings

    if not np.isfinite(fn) or fn <= 0:
        warnings.warn("pitch_correct: no usable target frequency (%r); "
                      "leaving the signal unshifted" % (fn,))
        return np.array(y)

    f0 = read_voice_frequency()

    n_steps = pitch_difference(fn=fn, f0=f0)

    shifted_y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps)

    return shifted_y
    

In [None]:
def stretch_signals(signals, intervals, sr):
    """Time-stretch each signal so it lasts as long as its paired interval.

    ``signals`` and ``intervals`` are paired with ``zip`` and each pair is
    handed to ``stretch_signal``.

    Returns
    -------
    list
        The stretched signals, in input order.
    """
    stretched = []
    for signal, span in zip(signals, intervals):
        stretched.append(stretch_signal(y=signal, interval=span, sr=sr))
    return stretched

def stretch_signal(y, interval, sr):
    """Time-stretch ``y`` to the duration spanned by ``interval``.

    Parameters
    ----------
    y : numpy.ndarray
        Audio samples.
    interval : sequence
        ``interval[0]`` is the start time and ``interval[1]`` the end time.
    sr : int
        Sample rate of ``y``.

    Returns
    -------
    numpy.ndarray
        ``y`` passed through ``librosa.effects.time_stretch`` with rate
        ``duration(y) / (end - start)``.
    """
    target_duration = interval[1] - interval[0]
    source_duration = librosa.get_duration(y=y, sr=sr)
    return librosa.effects.time_stretch(y, rate=source_duration / target_duration)

In [None]:
import nussl

def get_melody(y, sr):
    """Separate ``y`` with nussl's Melodia and return the foreground as mono.

    The separator's ``make_audio_signals()`` result is indexed so that item 0
    is taken as the background and item 1 as the foreground. The mono
    background is played and plotted via ``plot_and_play`` as a side effect;
    the mono foreground is returned.

    Parameters
    ----------
    y : numpy.ndarray
        Audio samples of the full mix.
    sr : int
        Sample rate of ``y``.
    """
    mixture = nussl.AudioSignal(audio_data_array=y, sample_rate=sr)

    separator = nussl.separation.melodia.Melodia(mixture)
    separator.run()

    separated = separator.make_audio_signals()
    foreground = separated[1]
    background = separated[0]

    # Side effect: audition the accompaniment before handing back the melody.
    plot_and_play(background.to_mono(), sr)

    return foreground.to_mono()

In [None]:
def normalize_length(lst1, lst2, lst3):
    """Truncate three sequences to the length of the shortest one.

    Returns
    -------
    tuple
        The three truncated sequences, in argument order.
    """
    shortest = min(map(len, (lst1, lst2, lst3)))
    return tuple(seq[:shortest] for seq in (lst1, lst2, lst3))

In [None]:
def overlay_signals(song_signal, vocal_signal, sr):
    """Mix the vocal onto the song, starting at the song's first onset.

    The song is trimmed so it begins at its first detected onset, both
    signals are cut to the shorter length, each is passed through
    ``librosa.util.normalize`` and the two are summed.

    Parameters
    ----------
    song_signal : numpy.ndarray
        Samples of the song.
    vocal_signal : numpy.ndarray
        Samples of the vocal track.
    sr : int
        Sample rate used for onset detection.

    Returns
    -------
    numpy.ndarray
        The summed mix. Each input is normalised separately before summing
        and the sum itself is not re-normalised.

    Notes
    -----
    If no onset is detected the song is used from sample 0. Indexing the
    empty onset array would otherwise raise ``IndexError``.
    """
    onset_samples = librosa.onset.onset_detect(y=song_signal, sr=sr, units='samples')
    first_onset_sample = onset_samples[0] if len(onset_samples) else 0

    song_trimmed = song_signal[first_onset_sample:]

    min_length = min(len(song_trimmed), len(vocal_signal))

    both = librosa.util.normalize(song_trimmed[:min_length]) + librosa.util.normalize(vocal_signal[:min_length])

    return both

In [None]:
def e2e(filename, text, sr):
    """Run the whole pipeline: make synthesised speech follow a song's melody.

    Steps, in order:
      1. load the song at ``sr``;
      2. extract its melody with ``get_melody``;
      3. find onset intervals and a frequency per interval
         (``onset_frequencies``), printing the count and every pair;
      4. synthesise ``text`` and split it into chunks
         (``get_syllable_signals``);
      5. truncate chunks, intervals and frequencies to a common length;
      6. pitch-correct, then time-stretch, every chunk;
      7. concatenate the chunks and overlay them on the song.

    Parameters
    ----------
    filename : str
        Path of the song to load.
    text : str
        Text to synthesise.
    sr : int
        Sample rate used for loading and all later processing.

    Returns
    -------
    numpy.ndarray
        The song mixed with the tuned voice.
    """
    song, sr = librosa.load(filename, sr=sr)
    melody = get_melody(song, sr)

    note_spans, note_freqs = onset_frequencies(melody, sr)
    print(len(note_spans))
    for pair in zip(note_spans, note_freqs):
        print(*pair)

    chunks = get_syllable_signals(text, sr)
    chunks, note_spans, note_freqs = normalize_length(chunks, note_spans, note_freqs)

    tuned = autotune(chunks, fn=note_freqs, sr=sr)
    timed = stretch_signals(tuned, note_spans, sr=sr)

    voice = np.concatenate(timed)
    return overlay_signals(song, voice, sr)
    

In [None]:
# Sample rate handed to e2e (which loads the song at this rate) and to the final player.
sr = 44100
# Text that e2e synthesises and fits onto the song's melody.
text = 'happy birthday to you'
# Run the full pipeline on the sample clip; y is the mixed result.
y = e2e(filename='audio/hbd_snip.wav', text=text, sr=sr)


# Show an audio player and a plot for the mixed result.
plot_and_play(y, sr)