In [4]:
from dataloader import dataloader, SignalDataset
from utils import prepare_data
from param import dataset_path, sample_universe_size
from torch.utils.data import ConcatDataset
from MTL_w_cascade_info import MtlCascadeModel
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ExponentialLR
import torch

# from torch.nn.utils import weight_norm
from torch import nn


hp = {
    "n_layers": 1,
    "sp_hidden_nodes": 20,
    "n_sp_hidden_lyrs": 1,
    "mu_hidden_nodes": 20,
    "n_mu_hidden_lyrs": 1,
    "smr_hidden_nodes": 20,
    "n_smr_hidden_lyrs": 1,
    "n_epochs": 100,
    "batch_size": 20,
    "train_ratio": 0.8,
}
combined_dataset = torch.load('dataset.pth')
print("data_loader")
train_loader, test_loader = dataloader(
    datasets=combined_dataset,
    train_ratio=hp["train_ratio"],
    train_batch_size=hp["batch_size"],
    test_batch_size=1,
)

def train(
    train_loader,
    model,
    epoch,
    out_dict,
    loss_sp_fn,
    loss_mu_fn,
    loss_smr_fn,
    optimizer,
):
    correct = 0
    for data in train_loader:
        feature, label = data
        y = [out_dict[x] for x in label]
        out_sp, out_mu, out_smr = model(feature)

        sp_list = [inner_list[0] for inner_list in y]
        mu_list = [inner_list[1] for inner_list in y]
        smr_list = [inner_list[2:] for inner_list in y]
        y_sp = torch.Tensor(sp_list).unsqueeze(1)
        y_mu = torch.Tensor(mu_list).unsqueeze(1)
        y_smr = torch.Tensor(smr_list).unsqueeze(1)

        loss_sp = loss_sp_fn(out_sp, y_sp)
        loss_mu = loss_mu_fn(out_mu, y_mu)
        loss_smr = loss_smr_fn(out_smr, y_smr)

        total_loss = loss_sp + loss_mu + loss_smr
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        pred_y = torch.cat((out_sp, out_mu, out_smr), dim=1)
        result = (pred_y > 0.5).float()
        target = torch.tensor(y).float()
        for i in range(result.size(0)):
            if torch.all(torch.eq(result[i], target[i])):
                correct += 1
    accuracy = correct / len(train_loader.dataset)
    print(
        f"Epoch: {epoch}, Loss_sp: {loss_sp}, Loss_mu: {loss_mu}, Loss_smr: {loss_smr}, Accuracy: {accuracy}"
    )


data_loader


In [6]:
torch.autograd.set_detect_anomaly(True)

print("model_init")
model = MtlCascadeModel(hp)
# print(model)
loss_sp = nn.BCEWithLogitsLoss()
loss_mu = nn.BCEWithLogitsLoss()
loss_smr = nn.MSELoss()

out_dict = {"speech": [1, 0, 0, 0], "music": [0, 1, 0, 0], "mixture": [0, 0, 1, 1]}

# Optimizer and learning rate scheduler
optimizer = Adam(model.parameters(), lr=0.002)
# optimizer = SGD(model.parameters(), lr=0.002, momentum=0.9)
scheduler = ExponentialLR(optimizer, gamma=0.1)

print("start_training")
for epoch in range(1,hp["n_epochs"]+1):
    train(
        train_loader, model, epoch, out_dict, loss_sp, loss_mu, loss_smr, optimizer
    )
# train(
#     train_loader, model, hp["n_epochs"], out_dict, loss_sp, loss_mu, loss_smr, optimizer
# )

model_init
start_training


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1, Loss_sp: 0.5497570037841797, Loss_mu: 1.669691562652588, Loss_smr: 0.08134792745113373, Accuracy: 0.22821576763485477
Epoch: 2, Loss_sp: 0.12731775641441345, Loss_mu: 0.3549463748931885, Loss_smr: 0.1975173056125641, Accuracy: 0.36751630112625966
Epoch: 3, Loss_sp: 0.13728009164333344, Loss_mu: 0.4615171551704407, Loss_smr: 0.19479364156723022, Accuracy: 0.4013040901007706
Epoch: 4, Loss_sp: 0.18401402235031128, Loss_mu: 0.39979904890060425, Loss_smr: 0.3128724694252014, Accuracy: 0.41567871962062836
Epoch: 5, Loss_sp: 0.07096236944198608, Loss_mu: 0.4647606313228607, Loss_smr: 0.23608353734016418, Accuracy: 0.40915826911677533
Epoch: 6, Loss_sp: 0.35188257694244385, Loss_mu: 0.545046865940094, Loss_smr: 0.19359494745731354, Accuracy: 0.43375815056312983
Epoch: 7, Loss_sp: 0.19236329197883606, Loss_mu: 0.5008758902549744, Loss_smr: 0.2694648802280426, Accuracy: 0.4350918790752816
Epoch: 8, Loss_sp: 1.4852579832077026, Loss_mu: 0.21463799476623535, Loss_smr: 0.1586016565561294

Testing the classical fourier trasformation

In [4]:
import librosa

# Load audio file
y, sr = librosa.load('audio_file.wav')

# Perform HPSS
harmonic, percussive = librosa.effects.hpss(y)

# Save the harmonic and percussive components
librosa.output.write_wav('harmonic.wav', harmonic, sr)
librosa.output.write_wav('percussive.wav', percussive, sr)

In [26]:
import librosa
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Function to compute skewness
def compute_skewness(matrix):
    return np.mean(((matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0)) ** 3, axis=0)
def hpss_classification (speech,music,mixture,sr):
    # Perform HPSS
    harmonic_speech, percussive_speech = librosa.effects.hpss(speech)
    harmonic_music, percussive_music = librosa.effects.hpss(music)
    harmonic_mixture, percussive_mixture = librosa.effects.hpss(mixture)

    # Compute skewness vectors
    skewness_speech = compute_skewness(harmonic_speech) + compute_skewness(percussive_speech)
    skewness_music = compute_skewness(harmonic_music) + compute_skewness(percussive_music)
    skewness_mixture = compute_skewness(harmonic_mixture) + compute_skewness(percussive_mixture)

    # Concatenate skewness vectors for t-SNE
    all_skewness = np.vstack((skewness_speech, skewness_music, skewness_mixture))

    # Apply t-SNE for dimensionality reduction
    # tsne = TSNE(n_components=2,perplexity=2, random_state=0)
    # reduced_skewness = tsne.fit_transform(all_skewness)
    reduced_skewness = all_skewness
    # Plot the results
    print( reduced_skewness.shape)
    plt.scatter(reduced_skewness[:1, 0], [0], label='Speech')
    plt.scatter(reduced_skewness[1:2, 0], [0], label='Music')
    plt.scatter(reduced_skewness[2:, 0], [0]*len(reduced_skewness[2:, 0]), label='Mixture')
    plt.legend()
    plt.title('t-SNE Visualization of Class Separability')
    plt.show()

In [5]:
from utils import prepare_data
import pandas as pd
from param import dataset_path, sample_universe_size
combination_paths = prepare_data(f".{dataset_path}")
sampled_df = combination_paths.sample(
    frac=sample_universe_size, random_state=42, ignore_index=True
    )

In [6]:
sampled_df.head()

Unnamed: 0,music,speech
0,../data/musan/music/jamendo/music-jamendo-0169...,../data/musan/speech/librivox/speech-librivox-...
1,../data/musan/music/fma/music-fma-0042.wav,../data/musan/speech/librivox/speech-librivox-...
2,../data/musan/music/jamendo/music-jamendo-0146...,../data/musan/speech/us-gov/speech-us-gov-0023...
3,../data/musan/music/fma/music-fma-0023.wav,../data/musan/speech/us-gov/speech-us-gov-0018...
4,../data/musan/music/jamendo/music-jamendo-0212...,../data/musan/speech/librivox/speech-librivox-...


In [73]:
sampled_df.shape

(281, 2)

In [85]:
from utils import load_audio,load_music,mix_signals
from param import sampling_rate
import pandas as pd
df = pd.DataFrame(columns=['Iteration', 'Category','data_type', 'Min', 'Max', 'Mean', 'Median', 'STD'])
list_data =[]
for i in range(281):
    # print(sampled_df.iloc[i]["speech"])
    # print(sampled_df.iloc[i]["music"])

    speech_wave = load_music(sampled_df.iloc[i]["speech"])
    music_wave = load_music(sampled_df.iloc[i]["music"])
    mixed_wave = mix_signals(sampled_df.iloc[i]["speech"],sampled_df.iloc[0]["music"])
    #hpss_classification(speech_wave,music_wave,mixed_wave,sampling_rate)
    harmonic_speech, percussive_speech = librosa.effects.hpss(speech_wave)
    harmonic_music, percussive_music = librosa.effects.hpss(music_wave)
    harmonic_mixture, percussive_mixture = librosa.effects.hpss(mixed_wave)
    # plot_fig(harmonic_speech,percussive_speech,"speech")
    # plot_fig(harmonic_music,percussive_music,"music")
    # plot_fig(harmonic_mixture,percussive_mixture,"mixture")
    for type_data in ["harmonic", "skewness"]:
        if type_data == "harmonic":
            data_speech = harmonic_speech
            data_music = harmonic_music
            data_mixture = harmonic_mixture
        else:
            data_speech = compute_skewness(harmonic_speech)
            data_music = compute_skewness(harmonic_music)
            data_mixture = compute_skewness(harmonic_mixture)

    
    # data_speech = np.abs(harmonic_speech- percussive_speech)
    # data_music = np.abs(harmonic_music- percussive_music)
    # data_mixture = np.abs(harmonic_mixture- percussive_mixture)
    # # Append the data for each category to the DataFrame
        list_data.append({
            'Iteration': i,
            'Category': 'Speech',
            'data_type': type_data,
            'Min': round(np.min(data_speech),2),
            'Max': round(np.max(data_speech),2),
            'Mean': round(np.mean(data_speech),2),
            'Median': round(np.median(data_speech),2),
            'STD': round(np.std(data_speech),2),
        })
        
        list_data.append({
            'Iteration': i,
            'Category': 'Music',
            'data_type': type_data,
            'Min': round(np.min(data_music),2),
            'Max': round(np.max(data_music),2),
            'Mean': round(np.mean(data_music),2),
            'Median': round(np.median(data_music),2),
            'STD': round(np.std(data_music),2),
        })
        
        list_data.append({
            'Iteration': i,
            'Category': 'Mixture',
            'data_type': type_data,
            'Min': round(np.min(data_mixture),2),
            'Max': round(np.max(data_mixture),2),
            'Mean': round(np.mean(data_mixture),2),
            'Median': round(np.median(data_mixture),2),
            'STD': round(np.std(data_mixture),2),
        })

# Print the DataFrame
df = pd.DataFrame(list_data)
print(df)



    # speech_classification = classify_waveform(speech_wave, sampling_rate)
    # music_classification = classify_waveform(music_wave, sampling_rate)
    # mixed_classification = classify_waveform(mixed_wave, sampling_rate)
    # print(f"Ground Truth: Speech, Predicted: {speech_classification}")
    # print(f"Ground Truth: Music, Predicted: {music_classification}")
    # print(f"Ground Truth: Mixed, Predicted: {mixed_classification}")
    # print("------------------------------------------------------------------------------")


      Iteration Category data_type   Min   Max  Mean  Median   STD
0             0   Speech  harmonic -0.57  0.55 -0.00    0.00  0.10
1             0    Music  harmonic -0.62  0.60 -0.00    0.00  0.14
2             0  Mixture  harmonic -0.34  0.34 -0.00   -0.00  0.07
3             0   Speech  skewness -0.07 -0.07 -0.07   -0.07  0.00
4             0    Music  skewness -0.00 -0.00 -0.00   -0.00  0.00
...         ...      ...       ...   ...   ...   ...     ...   ...
1681        280    Music  harmonic -0.50  0.44 -0.00    0.00  0.09
1682        280  Mixture  harmonic -0.25  0.24 -0.00    0.00  0.06
1683        280   Speech  skewness  0.41  0.41  0.41    0.41  0.00
1684        280    Music  skewness -0.08 -0.08 -0.08   -0.08  0.00
1685        280  Mixture  skewness -0.02 -0.02 -0.02   -0.02  0.00

[1686 rows x 8 columns]


In [86]:
df.sort_values(by='Category', ascending=True, inplace=True)


In [88]:
df.to_csv('hpss_classification_sk.csv', index=False)

In [71]:
print(df)

    Iteration Category   data_type   Min   Max  Mean  Median   STD
14          2  Mixture    harmonic -0.34  0.36   0.0     0.0  0.07
26          4  Mixture    harmonic -0.31  0.31  -0.0     0.0  0.07
23          3  Mixture  percussive -0.23  0.35   0.0     0.0  0.04
20          3  Mixture    harmonic -0.33  0.37   0.0     0.0  0.07
17          2  Mixture  percussive -0.41  0.43  -0.0     0.0  0.08
11          1  Mixture  percussive -0.39  0.41   0.0     0.0  0.08
8           1  Mixture    harmonic -0.29  0.24  -0.0     0.0  0.06
29          4  Mixture  percussive -0.44  0.40  -0.0    -0.0  0.07
5           0  Mixture  percussive -0.42  0.44   0.0     0.0  0.10
2           0  Mixture    harmonic -0.27  0.27  -0.0    -0.0  0.06
10          1    Music  percussive -0.01  0.01   0.0     0.0  0.00
1           0    Music    harmonic -0.41  0.40  -0.0     0.0  0.08
13          2    Music    harmonic -0.80  0.82   0.0    -0.0  0.25
28          4    Music  percussive -0.49  0.49   0.0     0.0  

In [44]:
import matplotlib.pyplot as plt

# Assuming harmonic_speech and percussive_speech are numpy arrays
# representing the harmonic and percussive components of your speech signal
def plot_fig(harmonic, persussive, class_name):
    plt.figure(figsize=(12, 6))

    plt.subplot(2, 1, 1)
    plt.plot(harmonic)
    plt.title(f'Harmonic Component of {class_name}')

    plt.subplot(2, 1, 2)
    plt.plot(persussive)
    plt.title(f'Percussive Component of {class_name}')

    plt.tight_layout()
    plt.show()

In [87]:
df_2 = df.groupby(['Category','data_type']).mean().reset_index()

In [None]:
def traditional_classification(signal_wave):
    # Perform HPSS
    harmonic, percussive = librosa.effects.hpss(signal_wave)

    # Compute skewness vectors
    skewness = compute_skewness(harmonic) + compute_skewness(percussive)


In [89]:
df_2.to_csv('hpss_classification_grp_sk.csv', index=False)

In [None]:
def 