# Neural audio coding for speech enhancement

To run this file, AudioDec needs to be installed so that the imports work correctly: `pip install -e ./AudioDec`

In [None]:
# %pip install -e ./AudioDec
# %clearml-init
#imports
from preprocessing.DataEncoder import DataEncoder
from train.LatentTrainer import LatentTrainer

## Generate encoded dataset

Here we can create the pre-encoded versions of the audio files. The following parameters are expected:
- `noise_files`: root directory of the noise files
- `speech_files`: root directory of the speech files
- `encoded_mixed_files`: output directory for the pre-encoded noisy audio files
- `encoded_speech_files`: output directory for the pre-encoded clean audio files

In [None]:
# Root directories of the raw noise and clean-speech audio files.
noise_files = './'
speech_files = './'

# Output directories for the pre-encoded data. Each variable is bound to the
# directory whose name matches its content (mixed -> `mixed_code`,
# speech -> `speech_code`).
# NOTE(review): datasets encoded while these two paths were swapped live in
# the opposite directories -- re-encode or move them before training.
encoded_mixed_files = '/dtu/blackhole/15/203189/data/train/mixed_code'
encoded_speech_files = '/dtu/blackhole/15/203189/data/train/speech_code'

# Build the encoder from the four directories and write the encoded files.
data_encoder = DataEncoder(
    speech_files=speech_files,
    noise_files=noise_files,
    encoded_speech_files=encoded_speech_files,
    encoded_mixed_files=encoded_mixed_files,
)
data_encoder.encode_data()


## Training the model

Here we can train the network. The following parameters are expected:
- `model_dir`: output directory for saving the model checkpoints
- `encoded_mixed_files`: directory containing the pre-encoded noisy audio files
- `encoded_speech_files`: directory containing the pre-encoded clean audio files

In [None]:
# Output directory for the model checkpoints written during training.
model_dir = '/dtu/blackhole/09/203081/saved_models/'

# The trainer receives the pre-encoded clean speech as `data_dir` and the
# pre-encoded noisy mixtures as `noise_dir`.
trainer_options = {
    'model_dir': model_dir,
    'data_dir': encoded_speech_files,
    'noise_dir': encoded_mixed_files,
}
trainer = LatentTrainer(**trainer_options)

# Run the training loop for ten epochs.
trainer.train(num_epochs=10)


## Making predictions

Here we can make predictions for a given input file. The prerequisites are the following:
- Downloaded weights for AudioDec autoencoder `symAD_vctk_48000_hop300/checkpoint-700000steps.pkl` (available from [AudioDec's GitHub](https://github.com/facebookresearch/AudioDec)) which is expected to be placed in `./AudioDec/exp/autoencoder/symAD_vctk_48000_hop300/checkpoint-700000steps.pkl`
- `PATH_TO_LATENT_NET_PARAMS`: path to the trained model parameters
- `PATH_TO_INPUT`: path to an input file to be denoised

In [None]:
# Path to the trained latent-network parameters that are loaded for inference.
PATH_TO_LATENT_NET_PARAMS = (
    "/Users/madiistvan/Dev/DTU/Fall23/02456-Deep-learning/Poster/Project/"
    "Neural-audio-coding-for-speech-enhancement/34 (3).pt"
)
# Path to the audio file that should be denoised.
PATH_TO_INPUT = "x.wav"

In [None]:
# Project-local model classes: the AudioDec autoencoder generator and the
# latent network.
from AudioDec.models.autoencoder.AudioDec import Generator
from train.LatentNetwork2 import LatentNetwork
# Instantiate both models with their default constructor arguments. The
# trained parameters are loaded separately via `load_state_dict` before
# inference.
generator = Generator()
latent = LatentNetwork()

In [None]:
import os
import torch
import torchaudio
from AudioDec.models.autoencoder.AudioDec import Generator

# Sample rate at which the prediction is written; it matches the `48000` in
# the checkpoint directory name, so the input is brought to this rate as well.
SAMPLE_RATE = 48000
tx_steps = 700000
encoder_checkpoint = os.path.join('./AudioDec','exp', 'autoencoder', 'symAD_vctk_48000_hop300', f"checkpoint-{tx_steps}steps.pkl")

# NOTE: `torch.load` unpickles the file -- only load checkpoints that come
# from a trusted source.
generator = Generator()
generator.load_state_dict(torch.load(encoder_checkpoint, map_location='cpu')['model']['generator'])

# Keep the file's sample rate instead of discarding it, and resample when it
# differs from the rate used for the output; otherwise the result would be
# saved with a wrong sample-rate header.
data, input_sample_rate = torchaudio.load(PATH_TO_INPUT, backend="soundfile")
if input_sample_rate != SAMPLE_RATE:
    data = torchaudio.functional.resample(data, input_sample_rate, SAMPLE_RATE)
# NOTE(review): `data` is (channels, time); the models presumably expect a
# single channel -- confirm before feeding multi-channel files.

# `latent` is the LatentNetwork instance created in an earlier cell.
latent.load_state_dict(torch.load(PATH_TO_LATENT_NET_PARAMS, map_location=torch.device('cpu')))
generator.eval()
latent.eval()

# Pure inference: disable autograd so no computation graph is kept in memory.
with torch.no_grad():
    x = generator.encoder(data.unsqueeze(0))  # add a leading batch dimension
    x = generator.projector(x)
    # The latent network transforms the projected code before quantisation.
    x, _, _ = generator.quantizer(latent(x))
    x = generator.decoder(x)

# Drop dimension 1 (when it has size 1) before writing the result to disk.
torchaudio.save("pred.wav", x.cpu().squeeze(1), SAMPLE_RATE)

