In [51]:
!python3 -m pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs
!python3 -m pip install requests
!python3 -m pip install pydub
!python3 -m pip install scipy
!python3 -m pip install librosa

Collecting demucs
  Cloning https://github.com/facebookresearch/demucs to /tmp/pip-install-wbiojqyl/demucs_2643e27b2265487b80031e5ef1ffbfe8
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/demucs /tmp/pip-install-wbiojqyl/demucs_2643e27b2265487b80031e5ef1ffbfe8
  Resolved https://github.com/facebookresearch/demucs to commit e976d93ecc3865e5757426930257e200846a520a
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [16]:
# Mount Google Drive; the input and output folders configured below live under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /gdrive


In [None]:
import os

# Customize the following options!
model = "htdemucs"
extensions = ["mp3", "wav", "ogg", "flac"]  # we will look for all those file types.
two_stems = "vocals"   # only separate this one stem from the rest (passed as --two-stems)
# two_stems = None     # use this instead to skip the --two-stems option

# Options for the output audio.
mp3 = True
mp3_rate = 320
float32 = False  # output as float 32 wavs, unused if 'mp3' is True.
int24 = False    # output as int24 wavs, unused if 'mp3' is True.
# You cannot set both `float32 = True` and `int24 = True` !!

in_path = '/gdrive/MyDrive/demucs/'
out_path = '/gdrive/MyDrive/demucs_separated/'

# For conversion.
input_path = "/gdrive/MyDrive/demucs_separated/htdemucs/original_jp/vocals.mp3"
# Never commit the API token to the notebook: anyone who can read the file can use it.
# Provide it through the environment before running this cell, for example with
# `%env KITS_API_TOKEN=...` in a scratch cell that is not saved.
api_token = os.environ.get("KITS_API_TOKEN", "")
if not api_token:
    print("KITS_API_TOKEN is not set; the voice conversion requests will be rejected.")

# For final recombination.
input_converted_path = 'converted.wav'
background_path = "/gdrive/MyDrive/demucs_separated/htdemucs/original_jp/no_vocals.mp3"
final_output_path = 'output.mp3'

In [None]:
#@title Useful functions, don't forget to execute
import io
from pathlib import Path
import select
from shutil import rmtree
import subprocess as sp
import sys
from typing import Dict, Tuple, Optional, IO

from google.colab import files

def find_files(in_path):
    """Return the entries directly inside `in_path` whose extension is accepted.

    An entry matches when its suffix, lower-cased and without the leading dot,
    appears in the notebook-level `extensions` list. Sub-folders are not searched.
    """
    return [
        entry
        for entry in Path(in_path).iterdir()
        if entry.suffix.lower().lstrip(".") in extensions
    ]

def copy_process_streams(process: sp.Popen):
    """Forward a child process's stdout and stderr to this process's own streams.

    Blocks until both pipes have reached end-of-file. The process must have been
    started with `stdout=PIPE` and `stderr=PIPE`; a missing pipe trips the assert.

    Args:
        process: the running child whose `stdout` / `stderr` pipes are drained.
    """
    import codecs  # local import: only this function needs it

    def raw(stream: Optional[IO[bytes]]) -> IO[bytes]:
        assert stream is not None
        # Read from the unbuffered layer so that `select` and `read` agree on
        # whether data is available.
        if isinstance(stream, io.BufferedIOBase):
            stream = stream.raw
        return stream

    p_stdout, p_stderr = raw(process.stdout), raw(process.stderr)
    # One incremental decoder per pipe: a multi-byte UTF-8 character (for example the
    # block glyphs of a progress bar) can be split across two reads, and decoding each
    # chunk on its own would then raise UnicodeDecodeError.
    new_decoder = codecs.getincrementaldecoder("utf-8")
    stream_by_fd: Dict[int, Tuple[IO[bytes], IO[str], codecs.IncrementalDecoder]] = {
        p_stdout.fileno(): (p_stdout, sys.stdout, new_decoder(errors="replace")),
        p_stderr.fileno(): (p_stderr, sys.stderr, new_decoder(errors="replace")),
    }
    fds = list(stream_by_fd.keys())

    while fds:
        # `select` syscall will wait until one of the file descriptors has content.
        ready, _, _ = select.select(fds, [], [])
        for fd in ready:
            p_stream, std, decoder = stream_by_fd[fd]
            raw_buf = p_stream.read(2 ** 16)
            at_eof = not raw_buf
            # `final=True` at end-of-file flushes any incomplete trailing bytes.
            text = decoder.decode(raw_buf or b"", final=at_eof)
            if text:
                std.write(text)
                std.flush()
            if at_eof:
                fds.remove(fd)

def separate(inp=None, outp=None):
    """Run `python3 -m demucs.separate` on every matching audio file in a folder.

    The model and output format come from the notebook-level options (`model`,
    `mp3`, `mp3_rate`, `float32`, `int24`, `two_stems`). The child's output is
    streamed to this notebook while it runs.

    Args:
        inp: folder scanned with `find_files`; falls back to the global `in_path`.
        outp: folder given to demucs with `-o`; falls back to the global `out_path`.

    Returns:
        None. A message is printed when no file matches or when the command exits
        with a non-zero status.
    """
    inp = inp or in_path
    outp = outp or out_path
    cmd = ["python3", "-m", "demucs.separate", "-o", str(outp), "-n", model]
    if mp3:
        cmd += ["--mp3", f"--mp3-bitrate={mp3_rate}"]
    if float32:
        cmd += ["--float32"]
    if int24:
        cmd += ["--int24"]
    if two_stems is not None:
        cmd += [f"--two-stems={two_stems}"]
    # Named `audio_files` so the `files` module imported from google.colab is not shadowed.
    audio_files = [str(f) for f in find_files(inp)]
    if not audio_files:
        # Report the folder that was actually scanned, which is not the global
        # default when the caller passed `inp`.
        print(f"No valid audio files in {inp}")
        return
    print("Going to separate the files:")
    print('\n'.join(audio_files))
    print("With command: ", " ".join(cmd))
    p = sp.Popen(cmd + audio_files, stdout=sp.PIPE, stderr=sp.PIPE)
    copy_process_streams(p)
    p.wait()
    if p.returncode != 0:
        print("Command failed, something went wrong.")


def from_upload():
    """Prompt for files with the Colab upload dialog and separate them.

    Uploaded files are written to a fresh `tmp_in` folder and the results go to a
    fresh `separated` folder; both are relative to the current working directory
    and any previous content is removed first.
    """
    upload_dir = Path('tmp_in')
    result_dir = Path('separated')

    # Recreate both folders from scratch, inputs first and then outputs.
    for directory in (upload_dir, result_dir):
        if directory.exists():
            rmtree(directory)
        directory.mkdir()

    for filename, payload in files.upload().items():
        (upload_dir / filename).write_bytes(payload)
    separate(upload_dir, result_dir)


In [None]:
# Mount Drive at the location the configured paths expect: in_path, out_path,
# input_path and background_path all start with /gdrive.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Separate every matching file using the configured defaults: called without
# arguments, separate() falls back to the global in_path and out_path.
separate()

Going to separate the files:
/gdrive/MyDrive/demucs/original_jp.mp3
With command:  python3 -m demucs.separate -o /gdrive/MyDrive/demucs_separated/ -n htdemucs --mp3 --mp3-bitrate=320 --two-stems=vocals
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /gdrive/MyDrive/demucs_separated/htdemucs
Separating track /gdrive/MyDrive/demucs/original_jp.mp3


100%|██████████████████████████████████████████████████████████████████████████| 93.6/93.6 [00:04<00:00, 18.81seconds/s]


### **Vocal Conversion**

In [39]:
import requests
import json
from pydub import AudioSegment

# Re-encode the separated vocal stem as WAV; this is the file uploaded for conversion.
audio = AudioSegment.from_mp3(input_path)
audio.export("target.wav", format="wav")

# Without a timeout a stalled connection would block the cell forever.
REQUEST_TIMEOUT_S = 60

url = "https://arpeggi.io/api/kits/v1/voice-conversions"
headers = {"Authorization": f"Bearer {api_token}"}
params = {
    "order" : "asc",
    "page" : 1,
    "perPage" : 10,
    "myModels" : "true"
}
# Listing request: only its status is used here, as a check that the token is accepted.
response = requests.get(url=url, headers=headers, params=params, timeout=REQUEST_TIMEOUT_S)
if response.status_code == 200:
  print("Success")
  # Uncomment to inspect the returned entries and look up the ID of another voice model:
  # print(json.dumps(response.json(),indent=4))
  # instruments = [{voice["model"]["title"] : voice["model"]["id"]} for voice in response.json()["data"]]
  # print(instruments)
else:
  print("Failed")


# ID of the voice model to convert with (the API reports it as "Overdriven Guitar").
# Named `voice_model_id` rather than `id` so the `id()` builtin stays usable.
voice_model_id = 221129

url_conv = 'https://arpeggi.io/api/kits/v1/voice-conversions'

data = {
    'voiceModelId': voice_model_id,
    'conversionStrength': 1,
    'modelVolumeMix': 0.8,
    'pitchShift' : 0
}

# Open the upload inside `with` so the file handle is closed once the request is done.
with open("target.wav", "rb") as sound_file:
  response = requests.post(
      url=url_conv,
      headers=headers,
      data=data,
      files={'soundFile': ("target.wav", sound_file)},
      timeout=REQUEST_TIMEOUT_S,
  )

if response.status_code == 200:
  print("Success")
  conversion_data = response.json()
  print(json.dumps(conversion_data, indent=4))
  # The job ID is what the next cell uses to fetch the result.
  job_id = conversion_data["id"]
  print(job_id)
else:
  print("Failed")
  print(response.status_code)
  # An error body is not guaranteed to be JSON; fall back to the raw text.
  try:
    print(response.json())
  except ValueError:
    print(response.text)

Success
Success
{
    "id": 29723065,
    "createdAt": "2024-06-19T20:16:01.242+00:00",
    "type": "infer",
    "status": "running",
    "voiceModelId": 221129,
    "jobStartTime": null,
    "jobEndTime": null,
    "outputFileUrl": null,
    "lossyOutputFileUrl": null,
    "recombinedAudioFileUrl": null,
    "model": {
        "id": 221129,
        "title": "Overdriven Guitar",
        "tags": [],
        "twitterLink": null,
        "instagramLink": null,
        "tiktokLink": null,
        "spotifyLink": null,
        "youtubeLink": null,
        "imageUrl": "https://arpeggi-prod-public.s3.us-west-2.amazonaws.com/rvc/user_uploaded_images/cllgunasjqzc601mlditz0rsi.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=ASIAXSSI5BUE4AFAY6VT%2F20240619%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20240619T201602Z&X-Amz-Expires=900&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEIv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIQD7%2BvWErkXjg%2FY8uPuXjWnsa

In [46]:
import time

# The conversion runs asynchronously: right after submission the job reports
# status "running" and has no output URL yet, so poll instead of asking once.
POLL_INTERVAL_S = 5
MAX_POLLS = 60
REQUEST_TIMEOUT_S = 60

url_get = f'https://arpeggi.io/api/kits/v1/voice-conversions/{job_id}'
output_file_url = None
for _ in range(MAX_POLLS):
  response = requests.get(url=url_get, headers=headers, timeout=REQUEST_TIMEOUT_S)
  if response.status_code != 200:
    print(response.status_code)
    print(response)
    break
  job_data = response.json()
  output_file_url = job_data.get("outputFileUrl")
  # Stop once a result is available or the job has left the "running" state.
  # NOTE(review): "running" is the only in-progress status seen so far; confirm
  # against the API documentation whether other pending states exist.
  if output_file_url or job_data.get("status") != "running":
    print("Success")
    print(json.dumps(job_data, indent=4))
    break
  print(f"Job {job_id} is still running, checking again in {POLL_INTERVAL_S}s")
  time.sleep(POLL_INTERVAL_S)
else:
  print(f"Job {job_id} did not finish after {MAX_POLLS} checks")

if output_file_url:
  response = requests.get(output_file_url, timeout=REQUEST_TIMEOUT_S)
  if response.status_code == 200:
    # Write to the path the recombination step reads from.
    with open(input_converted_path, "wb") as f:
      f.write(response.content)
  else:
    print(f"Download failed with status {response.status_code}")
else:
  print("No output file available; nothing was downloaded.")


Success
{
    "id": 29723065,
    "createdAt": "2024-06-19T20:16:01.242+00:00",
    "type": "infer",
    "status": "success",
    "voiceModelId": 221129,
    "jobStartTime": "2024-06-19T20:16:01.967+00:00",
    "jobEndTime": "2024-06-19T20:16:10.660+00:00",
    "outputFileUrl": "https://arpeggi-prod-private.s3.us-west-2.amazonaws.com/rvc/output_audio/clxm9wnor0qji01mv99v2gcz8.wav?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=ASIAXSSI5BUETPKCIAVL%2F20240619%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20240619T202005Z&X-Amz-Expires=900&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEIn%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDWhHOc6ZX6%2B2%2FcW1I%2Fj2YCWN3Yse9x8D%2F70vXX2AhPawIhAJuUgNEvavtYqbIIBvk%2FMEGFFaDqa4Jbn5ekTeA4732BKvMDCDIQAhoMNTIwOTE3NjgzNDY1IgxbTfrZOp5Z5PTJGwwq0APWjgSd1x9%2BbjG9WTNfVQm4afH8Edj5nXV2gN%2FUE11TbM1ZONyyfQkAA38GsoJFbnnIzyOD9t0LEu9R%2Bqx7C2KfqaoJqcVj54EomxaQH7DRCLnTnyszocAU0jhb2dgYpBZDXfKJAEIuBuDpOBzJ7X26fEg%2FdDN92lX%2B85

### **Final Recombination**

In [58]:
import scipy.io.wavfile as wav
import numpy as np
import librosa


def _peak_normalized_mono(samples):
    """Return `samples` as a 1-D float32 array scaled so its largest magnitude is 1."""
    # Convert before taking abs(): abs() of the most negative integer PCM value
    # overflows in the integer dtype.
    samples = np.asarray(samples).astype(np.float32)
    if samples.ndim > 1:
        # scipy returns multi-channel audio as (n_samples, n_channels); average the channels.
        samples = samples.mean(axis=1)
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    # Leave silent (or empty) audio untouched instead of dividing by zero.
    return samples / peak if peak > 0 else samples


def combine_audio_files(input_path, background_path, output_file, volume_db1=0, volume_db2=0):
    """Mix two audio files into a single mono MP3.

    Both signals are peak-normalized, scaled by their gain, trimmed to the shorter
    of the two, summed, and normalized again so the sum does not clip.

    Args:
        input_path: WAV file read with scipy; its sample rate is used for the output.
        background_path: audio file read with librosa, resampled to the rate of
            `input_path` when the two differ.
        output_file: path of the MP3 file to write.
        volume_db1: gain in dB applied to the `input_path` signal.
        volume_db2: gain in dB applied to the `background_path` signal.
    """
    # Load audio files
    fs1, audio1 = wav.read(input_path)
    audio2, fs2 = librosa.load(background_path, sr=None)

    print(f"input sampling rate: {fs1}")
    print(f"background sampling rate: {fs2}")
    # Ensure the sample rates match: from here on both signals are at fs1.
    if fs1 != fs2:
        print("Files not the same sample rate")
        audio2 = librosa.resample(audio2, orig_sr=fs2, target_sr=fs1)

    # Normalize to [-1, 1], then apply the gains (dB converted to a linear factor).
    audio1 = _peak_normalized_mono(audio1) * 10 ** (volume_db1 / 20.0)
    audio2 = _peak_normalized_mono(audio2) * 10 ** (volume_db2 / 20.0)

    # Ensure both audios are of the same length
    min_length = min(len(audio1), len(audio2))
    combined_audio = audio1[:min_length] + audio2[:min_length]

    # Normalize combined audio to prevent clipping
    combined_peak = np.max(np.abs(combined_audio)) if combined_audio.size else 0.0
    if combined_peak > 0:
        combined_audio = combined_audio / combined_peak

    # Scale back to integer PCM values
    combined_audio = (combined_audio * 32767).astype(np.int16)

    audio_segment = AudioSegment(
        combined_audio.tobytes(),
        # The samples are at fs1 (the background was resampled to it), so tagging
        # them with fs2 would change playback speed and pitch.
        frame_rate=fs1,
        sample_width=combined_audio.dtype.itemsize,
        channels=1)
    # Write combined audio to file
    audio_segment.export(output_file, format="mp3")
    print(f"Combined audio saved to {output_file}")

# Mix the converted vocal, attenuated by 5 dB, with the backing track at its
# normalized level and export the result as MP3.
combine_audio_files(input_converted_path, background_path, final_output_path, volume_db1 = -5, volume_db2 = 0)


input sampling rate: 40000
background sampling rate: 44100
Files not the same sample rate
Combined audio saved to output.mp3
