<a href="https://colab.research.google.com/github/nschmidtg/thesis/blob/main/Test1_as_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# In Google Colab: Be sure to select a GPU runtime (Runtime → Change runtime type → Hardware accelarator).


In [1]:
# First off, install asteroid
!pip install git+https://github.com/asteroid-team/asteroid --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for asteroid (PEP 517) ... [?25l[?25hdone


## After installing requirements, you need to Restart Runtime (Ctrl + M).
Else it will fail to import asteroid

In [2]:
!pip install pytorch-lightning --quiet

In [3]:
# Asteroid is based on PyTorch and PyTorch-Lightning.
from torch import optim
from pytorch_lightning import Trainer

In [4]:
# We train the same model architecture that we used for inference above.
from asteroid import DPRNNTasNet

In [5]:
# In this example we use Permutation Invariant Training (PIT) and the SI-SDR loss.
from asteroid.losses import pairwise_neg_sisdr, PITLossWrapper

In [6]:
# install musdb:
!pip install musdb --quiet

In [7]:
# install ffmpeg (stems are mp4 by default)
!sudo apt-get install ffmpeg

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [8]:
# MiniLibriMix is a tiny version of LibriMix (https://github.com/JorisCos/LibriMix),
# which is a free speech separation dataset.
from asteroid.data import LibriMix

# import musdb to create the mixtures: https://github.com/sigsep/sigsep-mus-db
import musdb
# Asteroid's System is a convenience wrapper for PyTorch-Lightning.
from asteroid.engine import System

from IPython.display import display, Audio


In [9]:
# download the musdb library
mus = musdb.DB(download=True)

# To use the full dataset, set a dataset root directory
# mus = musdb.DB(root="/path/to/musdb)

# To work directly with wav: https://github.com/sigsep/sigsep-mus-db#using-wav-files-optional

In [11]:
# This will automatically download MiniLibriMix from Zenodo on the first run.
train_loader, val_loader = LibriMix.loaders_from_mini(task="sep_clean", batch_size=8)

HBox(children=(FloatProgress(value=0.0, max=640547371.0), HTML(value='')))


Drop 0 utterances from 800 (shorter than 3 seconds)
Drop 0 utterances from 200 (shorter than 3 seconds)


# Create the augmented dataset

using the LibriMix and the Musdb18 datasets, an augmented podcast/radioshow like dataset is created

In [12]:
import librosa, os

In [13]:
def create_folder_structure(path):
    if not os.path.exists(path):
        os.makedirs(path)
    if not os.path.exists(path + '/linear_mono'):
        os.makedirs(path + '/linear_mono')
    if not os.path.exists(path + '/linear_stereo'):
        os.makedirs(path + '/linear_stereo')
    if not os.path.exists(path + '/sidechain_mono'):
        os.makedirs(path + '/sidechain_mono')
    if not os.path.exists(path + '/sidechain_stereo'):
        os.makedirs(path + '/sidechain_stereo')
    if not os.path.exists(path + '/track_mono'):
        os.makedirs(path + '/track_mono')
    if not os.path.exists(path + '/track_stereo'):
        os.makedirs(path + '/track_stereo')
    if not os.path.exists(path + '/speech_mono'):
        os.makedirs(path + '/speech_mono')

In [14]:
# create files structure
train_path = "augmented_dataset/train"
create_folder_structure(train_path)

test_path = "augmented_dataset/test"
create_folder_structure(test_path)

if not os.path.exists('augmented_dataset/metadata'):
    os.makedirs('augmented_dataset/metadata')

In [20]:
from os import listdir
from os.path import isfile, join
import random
import numpy as np
import re
import csv

In [17]:
speech_path = "MiniLibriMix/val/s1/"
    
speech_array = [f for f in listdir(speech_path) if isfile(join(speech_path, f))]

print(speech_array)


['652-130726-0018_3536-8226-0001.wav', '6313-76958-0012_2035-147960-0001.wav', '3536-23268-0027_6295-244435-0008.wav', '3081-166546-0048_2035-152373-0018.wav', '6295-244435-0025_8297-275154-0007.wav', '1919-142785-0055_2412-153954-0015.wav', '3576-138058-0037_1462-170142-0013.wav', '1919-142785-0042_1988-148538-0002.wav', '5895-34615-0000_1988-24833-0017.wav', '2412-153954-0018_3536-23268-0004.wav', '5536-43359-0005_8842-302201-0007.wav', '2078-142845-0011_5536-43363-0010.wav', '5536-43358-0010_777-126732-0037.wav', '251-136532-0017_6295-64301-0005.wav', '5536-43363-0013_3536-8226-0021.wav', '2078-142845-0030_6241-61946-0013.wav', '1462-170138-0018_1988-24833-0012.wav', '1988-24833-0023_777-126732-0066.wav', '6241-66616-0022_6313-66129-0005.wav', '1993-147966-0001_2428-83699-0025.wav', '1988-24833-0015_3536-8226-0021.wav', '84-121123-0008_6319-57405-0000.wav', '84-121550-0019_6345-93302-0018.wav', '5338-24640-0004_6241-61943-0017.wav', '1462-170142-0009_5694-64029-0009.wav', '652-13072

In [27]:
def mix_audio_sources(track_path, speech_path, output_path, music_to_speech_ratio = 0.2):
    """
    Creates 4 mixes for the a music and a speech track and locates it in the output_path
    the 4 mixes are: linear_mono, linear_stereo, sidechain_mono, sidechain_stereo
    librimix is mono and musdb stereo
    """
    # read the files
    track, fs_track = librosa.load(track_path, sr=44100, mono=False)
    speech, fs_speech = librosa.load(speech_path, sr=44100)
    # match the length of the files
    min_lenght = min(len(track[0]), len(speech))
    
    # crop the files to match in length
    cropped_track_stereo = np.array([track[0][0:min_lenght], track[1][0:min_lenght]])
    cropped_track_mono = cropped_track_stereo[0] + cropped_track_stereo[1]
    cropped_speech = speech[0:min_lenght]
    
    linear_stereo = cropped_track_stereo * music_to_speech_ratio + cropped_speech
    linear_mono = cropped_track_mono * music_to_speech_ratio + cropped_speech
    
    # write the files

    
    file_name = re.sub("[^0-9a-zA-Z]+", "-", track_path.split('/')[-1]) + '_' + speech_path.split('/')[-1]
    librosa.output.write_wav(output_path + "/linear_mono/" + file_name, linear_mono, 44100, norm=True)
    librosa.output.write_wav(output_path + "/linear_stereo/" + file_name, linear_stereo, 44100, norm=True)
    librosa.output.write_wav(output_path + "/speech_mono/" + file_name, cropped_speech, 44100, norm=True)
    librosa.output.write_wav(output_path + "/track_mono/" + file_name, cropped_track_mono, 44100, norm=True)
    librosa.output.write_wav(output_path + "/track_stereo/" + file_name, cropped_track_stereo, 44100, norm=True)

    csv_path = 'augmented_dataset/metadata/mixture_train_linear_stereo.csv'
    with open(csv_path, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
                        file_name,
                        file_name,
                        output_path + "/linear_stereo/" + file_name,
                        output_path + "/track_stereo/" + file_name,
                        output_path + "/speech_mono/" + file_name,
                        min_lenght
             ])
    
    csv_path = 'augmented_dataset/metadata/mixture_train_linear_mono.csv'
    with open(csv_path, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
                        file_name,
                        file_name,
                        output_path + "/linear_mono/" + file_name,
                        output_path + "/track_mono/" + file_name,
                        output_path + "/speech_mono/" + file_name,
                        min_lenght
             ])

    return file_name

In [28]:
i=0
random.seed(1)
csv_path = 'augmented_dataset/metadata/mixture_train_linear_stereo.csv'
with open(csv_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["","mixture_ID","mixture_path","track_path","speech_path","length"])

csv_path = 'augmented_dataset/metadata/mixture_train_linear_mono.csv'
with open(csv_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["","mixture_ID","mixture_path","track_path","speech_path","length"])

for track in mus:
    
    track_file = track.path
    # get the speech name for the csv
    speech_name = speech_array[random.randint(0,len(speech_array))]

    
    # path of the speech
    speech_file = speech_path + speech_name
    
    # mixes = mix_audio_sources(mono_track, speech, train_path, music_to_speech_ratio= 0.01)
    file_path = mix_audio_sources(track_file, speech_file, train_path, music_to_speech_ratio= 0.1)

    if i == 1:
      break
    else:
      i += 1

# Train the network

In [None]:
# Tell DPRNN that we want to separate to 2 sources.
model = DPRNNTasNet(n_src=2)

In [None]:
# PITLossWrapper works with any loss function.
loss = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx")

optimizer = optim.Adam(model.parameters(), lr=1e-3)

system = System(model, optimizer, loss, train_loader, val_loader)

In [None]:
# Train for 1 epoch using a single GPU. If you're running this on Google Colab,
# be sure to select a GPU runtime (Runtime → Change runtime type → Hardware accelarator).
trainer = Trainer(max_epochs=1, gpus=1)
trainer.fit(system)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | DPRNNTasNet    | 3 M   
1 | loss_func | PITLossWrapper | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Saving latest checkpoint..





1

In [None]:
!pip install librosa --quiet

In [None]:
%%capture
!wget https://www.merl.com/demos/deep-clustering/media/female-female-mixture.wav

In [None]:
import librosa

# You can pass a NumPy array:
mixture, _ = librosa.load("female-female-mixture.wav", sr=8000)
model.separate(mixture)

# Or simply a file name:
model.separate("female-female-mixture.wav")



In [None]:
from IPython.display import display, Audio

display(Audio("female-female-mixture_est1.wav"))

display(Audio("female-female-mixture_est2.wav"))

ValueError: ignored

In [None]:
s1, _ = librosa.load("female-female-mixture_es1.wav", sr=8000)


# Try to use ConvTasNet


In [None]:
from asteroid import ConvTasNet

In [None]:
# Tell DPRNN that we want to separate to 2 sources.
model = ConvTasNet(n_src=2)

In [None]:
# PITLossWrapper works with any loss function.
loss = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx")

optimizer = optim.Adam(model.parameters(), lr=1e-3)

system = System(model, optimizer, loss, train_loader, val_loader)

In [None]:
# Train for 1 epoch using a single GPU. If you're running this on Google Colab,
# be sure to select a GPU runtime (Runtime → Change runtime type → Hardware accelarator).
trainer = Trainer(max_epochs=1, gpus=1)
trainer.fit(system)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | ConvTasNet     | 5 M   
1 | loss_func | PITLossWrapper | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Saving latest checkpoint..





1