# Calculating MCD(Mel cepstral distance)

In [1]:
import librosa
import librosa.display
import numpy as np
import math

In [2]:
def extract_path_cost(D, wp):
    """
    Get the path cost from D(cost matrix), wp (warped path)
    :returns: sum of path cost 
    """
    path_cost = D[wp[:, 0], wp[:, 1]]
    return np.sum(path_cost)

def extract_frame_avg_path_cost(D, wp):
    path_cost = extract_path_cost(D, wp)
    path_length = wp.shape[0]
    frame_avg_path_cost = path_cost / float(path_length)
    return frame_avg_path_cost
 
def log_spec_dB_dist(x, y):
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))

In [3]:
def cal_mcd(gt, synt, cost_function, dtw_type='path_cost'):
    frames = synt.shape[1]
    log_mel_min_cost_tot = 0
    mel_cepstrum_min_cost_tot = 0
    
    # dynamic time warping for log mel distance
    log_mel_min_cost, log_mel_dtw_path = librosa.sequence.dtw(gt, synt, metric=cost_function)
    if dtw_type == 'mean':
        log_mel_min_cost_tot = np.mean(log_mel_min_cost)
    else:
        log_mel_min_cost_tot = extract_frame_avg_path_cost(log_mel_min_cost, log_mel_dtw_path)

    ## Dyanmic time warping for cepstral distance
    gt_mfcc = librosa.feature.mfcc(S=gt, n_mfcc=n_mfcc)
    synt_mfcc = librosa.feature.mfcc(S=synt, n_mfcc=n_mfcc)
    mel_cepstrum_min_cost, mel_cepstrum_dtw_path = librosa.sequence.dtw(gt_mfcc, synt_mfcc, metric=cost_function)
    if dtw_type == 'mean':
        mel_cepstrum_min_cost_tot = np.mean(mel_cepstrum_min_cost)
    else:
        mel_cepstrum_min_cost_tot = extract_frame_avg_path_cost(mel_cepstrum_min_cost, mel_cepstrum_dtw_path)
    
    mean_log_mel_distance = log_mel_min_cost_tot / frames
    mean_mel_cepstrum_mcd = mel_cepstrum_min_cost_tot / frames
    
    return mean_log_mel_distance, mean_mel_cepstrum_mcd, frames

In [4]:
## Mel params
n_fft=2048
hop_length=512
win_length=None
window='hann'
n_mels = 80

## Mfcc params
n_mfcc=34

In [5]:
gt_file = "ground_truth.wav"
synt_file = "generated.wav"

In [6]:
## Load wavs
gt_wav, gt_sr = librosa.load(gt_file)
synt_wav, synt_sr = librosa.load(synt_file)

In [7]:
## Generate spectrograms
gt_mels = librosa.feature.melspectrogram(gt_wav, sr=gt_sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, n_mels=n_mels)
synt_mels = librosa.feature.melspectrogram(synt_wav, sr=gt_sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, n_mels=n_mels)

  gt_mels = librosa.feature.melspectrogram(gt_wav, sr=gt_sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, n_mels=n_mels)
 7.7692930e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  synt_mels = librosa.feature.melspectrogram(synt_wav, sr=gt_sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, n_mels=n_mels)


In [10]:
log_dist, mel_mcd, frames = cal_mcd(gt_mels, synt_mels, log_spec_dB_dist)
print(f"log distance between samples: {log_dist:.2f}")
print(f"mel cepstral distance between samples: {mel_mcd:.2f}")

log distance between samples: 142.87
mel cepstral distance between samples: 87.76
