In [1]:
import sys
sys.path.append("../")

In [2]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pandas as pd
import numpy as np
from numpy import random

import IPython.display as ipd
import torchaudio, yaml, torch
from copy import deepcopy
from augmentations import AugmentArguments, transform_dict
from helper import DataArguments, ModelArguments
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open("../config.yml") as file:
    train_config = yaml.safe_load(file)
data_args = DataArguments(**train_config["data_args"])
model_args = ModelArguments(**train_config["model_args"])
augment_args = AugmentArguments(**train_config["augment_args"])

In [4]:
df = pd.read_csv("../csv/finnish_df.csv")
random_path = random.choice(df.recording_path)
random_speech, sr = torchaudio.load(random_path)

ipd.Audio(random_speech, rate=sr)

In [5]:
# create examples
example = {"speech":random_speech.squeeze()}

In [8]:
def plot_signal(transform_name: str, 
                example, 
                zoom: int = 1,
                height: int = 600,
                plot_time: bool = True,
                plot_freq: bool = True, 
                overlap: bool = False):
    
    names = ["Original", "Augmented"]
    transformed_example = deepcopy(example)
    transform = transform_dict[transform_name]
    transformed_example = transform(data_args, 
                                    getattr(augment_args, transform_name), 
                                    transformed_example)
    
    duration = max(len(example["speech"]), 
                   len(transformed_example["speech"]))

    fig = make_subplots(rows=1 if overlap else 2, 
                        cols=1, 
                        vertical_spacing=0.1, shared_yaxes=True, shared_xaxes=True)
    if plot_time:
        time = np.arange(duration)/sr
        for i, speech in enumerate([example["speech"], transformed_example["speech"]]):   
            fig.add_trace(
                go.Scatter(x=time, y=speech, name=names[i]), 
                row=i+1, col=1
            )
    
        fig.update_xaxes(title_text="Time (s)", row=2, col=1)
        fig.update_yaxes(title_text="Amplitude")
        fig.update_traces(line=dict(width=1.2))
        fig.update_layout(height=600, width=800)
        fig.show()
    
    if plot_freq:
        n_sample = len(example["speech"])
        
        magnitudes = torch.abs(torch.fft.fft(example["speech"]))[:n_sample//2//zoom]
        transformed_magnitudes = torch.abs(torch.fft.fft(transformed_example["speech"]))[:n_sample//2//zoom]
        print(transformed_magnitudes[-1])
        
        frequencies = torch.fft.fftfreq(n_sample, 1/sr)[:n_sample//2//zoom]
        
        for i, m in enumerate([magnitudes, transformed_magnitudes]):
            x_axis = []
            y_axis = []
            for x, y in zip(frequencies, m):
                x_axis.extend([x.item(), x.item(), None])
                y_axis.extend([0, y.item(), None])
            fig.add_trace(go.Scatter(x=x_axis, y=y_axis, name=names[i], opacity=.7 if overlap else 1), 
                         col=1, 
                         row=1 if overlap else i+1)
        
        fig.update_xaxes(title_text="Frequency (Hz)", col=1, 
                        row=1 if overlap else 2)
        fig.update_yaxes(title_text="Magnitude")
        fig.update_traces(line=dict(width=1.2))
        fig.update_layout(height=height)
        fig.show()
        
    return transformed_example

In [9]:
transformed_example = plot_signal("pitch_shift", example, zoom=10, plot_time=False, overlap=True)

tensor(20.9837)


In [127]:
ipd.Audio(transformed_example["speech"], rate=sr)

In [24]:
import glob
from WavAugment import augment



In [43]:
def noise_generator():
    noise_path = glob.glob(augment_args.additive_noise.noise_dir + "/*.wav")[10]
    noise, sr = torchaudio.load(noise_path)
    speech_len = random_speech.size(1)

    noise_len = noise.size(1)

    if speech_len > noise_len: 
        repeat_factor = speech_len//noise_len + 1
        noise = noise.repeat(1, repeat_factor)
    noise = noise[:, :speech_len]
    return noise 

In [57]:
noise = noise_generator()
snr = 10
snr_linear = 10**(snr/10)
signal_ratio = snr_linear / (snr_linear+1)
speech_power = (random_speech**2).mean()
noise_power = (noise**2).mean()
noise_factor = np.sqrt(speech_power/(noise_power*snr_linear))
noised1 = random_speech + noise_factor * noise

In [58]:
noised2 = augment.EffectChain().additive_noise(noise_generator, 10).apply(random_speech, {'rate':sr})

In [59]:
ipd.Audio(noised1, rate=sr)

In [47]:
ipd.Audio(random_speech, rate=sr)

In [48]:
ipd.Audio(noised2, rate=sr)

In [60]:
noised1.max()

tensor(0.9102)

In [61]:
noised2.max()

tensor(0.8263)

In [111]:
def freq2mel(f): return 2595*np.log10(1 + (f/700))
mel_max = freq2mel(16000/2) - 27
mel_max

2813.023046708319

In [112]:
start_mel = np.random.uniform(0, mel_max, 2)
start_mel

array([2107.48094055,  660.09781757])

In [113]:
def mel2freq(m): return 700*(10**(m/2595) - 1)
start_f = mel2freq(start_mel)
start_f

array([3841.81482599,  557.39287403])

In [114]:
freq2mel(332)

437.46940063364815

In [115]:
mel2freq(437.5+20)

350.5061954756869