In [19]:
import sys
sys.path.append("../")

In [20]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pandas as pd
import numpy as np
from numpy import random

import IPython.display as ipd
import torchaudio, yaml, torch
from copy import deepcopy
from augmentations import AugmentArguments, transform_dict
from helper import DataArguments, ModelArguments
from transformers import TrainingArguments
import plotly.express as px

In [21]:
with open("../config.yml") as file:
    train_config = yaml.safe_load(file)
data_args = DataArguments(**train_config["data_args"])
model_args = ModelArguments(**train_config["model_args"])
augment_args = AugmentArguments(**train_config["augment_args"])

In [22]:
df = pd.read_csv(data_args.csv_fi)
recording_path = df.recording_path[520]
speech, sr = torchaudio.load(recording_path)
kiitos = speech[:,41485:41485+16000]
ipd.Audio(kiitos, rate=sr)

In [23]:
def plot_signal(transform_name: str, 
                example, 
                height: int = 600,
                plot_time: bool = True,
                plot_freq: bool = True, 
                overlap: bool = False):
    
    names = ["Original", "Augmented"]
    transformed_example = deepcopy(example)
    transform = transform_dict[transform_name]
    transformed_example = transform(data_args, 
                                    getattr(augment_args, transform_name), 
                                    transformed_example)
    colors = ["#6096B4", "#F2D388"]
    duration = max(len(example["speech"]), 
                   len(transformed_example["speech"]))

    fig = make_subplots(rows=1 if overlap else 2, 
                        cols=1, 
                        vertical_spacing=0.1, shared_yaxes=True, shared_xaxes=True)
    if plot_time:
        time = np.arange(duration)/sr
        for i, speech in enumerate([example["speech"], transformed_example["speech"]]):   
            fig.add_trace(
                go.Scatter(x=time, y=speech, name=names[i], line=dict(color=colors[i])), 
                row=i+1, col=1
            )
    
        fig.update_xaxes(title_text="Time (s)", row=2, col=1)
        fig.update_yaxes(title_text="Amplitude")
        fig.update_traces(line=dict(width=1.2))
        fig.update_layout(height=600, width=800, plot_bgcolor="whitesmoke")
        fig.show()
    
    if plot_freq:
        n_sample = len(example["speech"])

        speech_fft = torch.fft.rfft(example["speech"])
        mag = torch.abs(speech_fft)
        mag_db = 20 * torch.log10(mag)

        transformed_speech_fft = torch.fft.fft(transformed_example["speech"])
        transformed_mag = torch.abs(transformed_speech_fft)
        transformed_mag_db = 20 * torch.log10(transformed_mag)
        
        frequencies = torch.fft.rfftfreq(n_sample, 1/sr)
        for i, m in enumerate([mag_db, transformed_mag_db]):
            fig.add_trace(go.Scatter(x=frequencies, y=m, name=names[i], opacity=.7 if overlap else 1), 
                         col=1, 
                         row=1 if overlap else i+1)
        
        fig.update_xaxes(title_text="Frequency (Hz)", col=1, 
                        row=1 if overlap else 2)
        fig.update_yaxes(title_text="Magnitude (dB)")
        fig.update_traces(line=dict(width=1.2))
        fig.update_layout(height=height, plot_bgcolor="whitesmoke")
        fig.show()
        
    return transformed_example

In [24]:
# create examples
example = {"speech":kiitos.squeeze()}
augmented_example = deepcopy(example)
transformed_example = plot_signal("tempo_perturbation", augmented_example, plot_freq=False, plot_time=True, overlap=False)


In [25]:
ipd.Audio(transformed_example["speech"], rate=sr)

In [27]:
synthesised_speech, sr = torchaudio.load("../../TTS/synthesised/synsample_520_opiskelija_109_teht_27.wav")

In [16]:
ipd.Audio(synthesised_speech[:,12000:12000+16000], rate=sr)

In [26]:
# torchaudio.save("synthesised.wav", synthesised_speech[:,12000:12000+16000], sample_rate=16000)

In [15]:
import librosa
import numpy as np

def categorize_voice(file_path):
    y, sr = librosa.load(file_path)
    
    # Extract pitch using librosa's piptrack method
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    
    # Get the most prominent pitch for each frame (where magnitude is highest)
    # Also, ensure that the pitch isn't zero (which means it wasn't detected)
    pitches = pitches[magnitudes.argmax(axis=0)]
    valid_pitches = pitches[pitches > 0]
    
    # Compute the median (which is less sensitive to outliers than the mean)
    median_pitch = np.median(valid_pitches)
    
    # These thresholds are just general values and might need adjustment based on your data
    print(median_pitch)
    if median_pitch < 160:
        return "low"
    elif median_pitch < 260:
        return "medium"
    else:
        return "high"

file_path = random_path
voice_category = categorize_voice(file_path)
print(f"The voice is categorized as: {voice_category}")

246.91959
The voice is categorized as: medium


In [24]:
import glob
from WavAugment import augment

In [43]:
def noise_generator():
    noise_path = glob.glob(augment_args.additive_noise.noise_dir + "/*.wav")[10]
    noise, sr = torchaudio.load(noise_path)
    speech_len = random_speech.size(1)

    noise_len = noise.size(1)

    if speech_len > noise_len: 
        repeat_factor = speech_len//noise_len + 1
        noise = noise.repeat(1, repeat_factor)
    noise = noise[:, :speech_len]
    return noise 

In [57]:
noise = noise_generator()
snr = 10
snr_linear = 10**(snr/10)
signal_ratio = snr_linear / (snr_linear+1)
speech_power = (random_speech**2).mean()
noise_power = (noise**2).mean()
noise_factor = np.sqrt(speech_power/(noise_power*snr_linear))
noised1 = random_speech + noise_factor * noise

In [58]:
noised2 = augment.EffectChain().additive_noise(noise_generator, 10).apply(random_speech, {'rate':sr})

In [59]:
ipd.Audio(noised1, rate=sr)

In [47]:
ipd.Audio(random_speech, rate=sr)

In [48]:
ipd.Audio(noised2, rate=sr)

In [60]:
noised1.max()

tensor(0.9102)

In [61]:
noised2.max()

tensor(0.8263)

In [111]:
def freq2mel(f): return 2595*np.log10(1 + (f/700))
mel_max = freq2mel(16000/2) - 27
mel_max

2813.023046708319

In [112]:
start_mel = np.random.uniform(0, mel_max, 2)
start_mel

array([2107.48094055,  660.09781757])

In [113]:
def mel2freq(m): return 700*(10**(m/2595) - 1)
start_f = mel2freq(start_mel)
start_f

array([3841.81482599,  557.39287403])

In [114]:
freq2mel(332)

437.46940063364815

In [115]:
mel2freq(437.5+20)

350.5061954756869