
# AIML ZG527 – Audio Analysis  
## Assignment: Voice Conversion System (LibriTTS)

**Student:** Roll Number  
**Dataset:** LibriTTS (dev-clean subset)  
**Source Speaker:** 19  
**Target Speaker:** 26  
**Platform:** BITS Pilani Virtual Lab  

---


In [2]:

import numpy as np
import librosa
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.mixture import GaussianMixture
import json
import os

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable from run to run.
np.random.seed(42)

# Root directory of the extracted dataset (see "Dataset" in the header cell).
# NOTE(review): presumably passed to load_speaker_data as data_path - confirm.
DATA_PATH = "/home/user/dev-clean"
# Speaker identifiers, kept as strings.
# NOTE(review): presumably used as the filename prefix that
# load_speaker_data matches ("<speaker_id>_...wav") - confirm at the call site.
SOURCE_SPEAKER = "19"
TARGET_SPEAKER = "26"


ModuleNotFoundError: No module named 'librosa'

## PART A – Audio Preprocessing

In [None]:

def load_speaker_data(speaker_id: str, data_path: str) -> list:
    """Load every ``.wav`` recording of one speaker found under *data_path*.

    The directory tree is walked recursively.  A file is selected when its
    name starts with ``"<speaker_id>_"`` and ends with ``".wav"``.  Each match
    is decoded with ``librosa.load`` at its native sampling rate
    (``sr=None``) and down-mixed to mono.

    Directories and files are visited in sorted order so the returned list
    has the same order on every run; ``os.walk`` on its own gives no ordering
    guarantee.

    Args:
        speaker_id: Speaker identifier used as the filename prefix.
        data_path: Root directory to search.

    Returns:
        A list of ``(audio, sr)`` tuples, one per matching file.  The list is
        empty when nothing matches, including when *data_path* does not exist.
    """
    prefix = speaker_id + "_"
    audio_list = []
    for root, dirs, files in os.walk(data_path):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            if name.endswith(".wav") and name.startswith(prefix):
                audio, sr = librosa.load(os.path.join(root, name), sr=None, mono=True)
                audio_list.append((audio, sr))
    return audio_list


def preprocess_audio(
    audio: np.ndarray,
    sr: int,
    *,
    target_sr: int = 16000,
    pre_emphasis: float = 0.97,
    top_db: float = 20,
) -> tuple:
    """Resample, peak-normalise, pre-emphasise and trim an audio signal.

    Steps, in order:

    1. Resample to *target_sr* when *sr* differs from it.
    2. Divide by the peak absolute amplitude (a small epsilon avoids a
       division by zero on an all-zero signal).
    3. Apply the first-order pre-emphasis filter
       ``y[n] = x[n] - pre_emphasis * x[n-1]`` (the first sample is kept).
    4. Trim leading and trailing silence with ``librosa.effects.trim``.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.
        target_sr: Sampling rate of the returned signal in Hz.
        pre_emphasis: Pre-emphasis filter coefficient.
        top_db: Threshold forwarded to ``librosa.effects.trim``.

    Returns:
        ``(audio, sr)`` where ``sr`` equals *target_sr*.

    Raises:
        ValueError: If *audio* contains no samples.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # Without this guard the failure surfaces later as an opaque
        # reduction or indexing error.
        raise ValueError("preprocess_audio() received an empty audio signal")

    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    audio = audio / (np.max(np.abs(audio)) + 1e-9)
    audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
    audio, _ = librosa.effects.trim(audio, top_db=top_db)
    return audio, sr


def compute_f0_stats(audio: np.ndarray, sr: int) -> dict:
    """Summarise the fundamental frequency (F0) of an audio signal.

    F0 is estimated with ``librosa.pyin`` over the 50-500 Hz range.  Frames
    for which the estimate is NaN are excluded from the statistics.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.

    Returns:
        A dict with the float entries ``"mean_f0"``, ``"std_f0"``,
        ``"min_f0"`` and ``"max_f0"``.  All four are ``0.0`` when no frame
        has a valid F0 estimate.
    """
    f0, _, _ = librosa.pyin(audio, fmin=50, fmax=500, sr=sr)
    voiced_f0 = f0[~np.isnan(f0)]

    if voiced_f0.size == 0:
        # np.min/np.max raise on an empty array and np.mean would give NaN;
        # report zeros instead so the result stays serialisable.
        return {"mean_f0": 0.0, "std_f0": 0.0, "min_f0": 0.0, "max_f0": 0.0}

    return {
        "mean_f0": float(np.mean(voiced_f0)),
        "std_f0": float(np.std(voiced_f0)),
        "min_f0": float(np.min(voiced_f0)),
        "max_f0": float(np.max(voiced_f0)),
    }


def compute_rms_energy(audio: np.ndarray) -> float:
    """Return the root-mean-square amplitude of *audio*.

    The samples are converted to ``float64`` before squaring so that integer
    PCM data (for example ``int16``) cannot overflow.

    Args:
        audio: Audio samples (any numeric dtype or array-like).

    Returns:
        The RMS value as a Python float; ``0.0`` for an empty signal.
    """
    samples = np.asarray(audio, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))


## PART B – Feature Extraction

In [None]:

def extract_f0(audio: np.ndarray, sr: int) -> np.ndarray:
    """Return the frame-wise F0 track of *audio*.

    The track is estimated with ``librosa.pyin`` over 50-500 Hz and passed
    through ``np.nan_to_num``, so NaN entries come back as ``0``.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.

    Returns:
        One F0 value per analysis frame.
    """
    pitch_track, _voiced_flag, _voiced_prob = librosa.pyin(
        audio,
        sr=sr,
        fmax=500,
        fmin=50,
    )
    cleaned_track = np.nan_to_num(pitch_track)
    return cleaned_track


def extract_mfcc(audio: np.ndarray, sr: int, n_mfcc: int = 13) -> np.ndarray:
    """Compute MFCCs of *audio* with ``librosa.feature.mfcc``.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.
        n_mfcc: Number of coefficients to compute.

    Returns:
        Whatever ``librosa.feature.mfcc`` returns for these arguments.
    """
    mfcc_kwargs = {"y": audio, "sr": sr, "n_mfcc": n_mfcc}
    coefficients = librosa.feature.mfcc(**mfcc_kwargs)
    return coefficients


def extract_formants(audio: np.ndarray, sr: int) -> np.ndarray:
    """Estimate the first three formant frequencies of *audio* from LPC poles.

    A 12th-order LPC polynomial is fitted to the whole signal.  Each pole in
    the upper half-plane is converted to a frequency via its angle, and the
    three lowest frequencies are returned.

    Only poles with a strictly positive imaginary part are used.  Real poles
    have an angle of 0 or pi and describe spectral tilt rather than a
    resonance; keeping them would report a spurious 0 Hz "formant" first.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.

    Returns:
        An array of shape ``(3,)`` with ascending frequencies in Hz.  When
        fewer than three complex pole pairs exist, the missing entries are
        ``NaN``.
    """
    n_formants = 3
    lpc_coeffs = librosa.lpc(audio, order=12)
    poles = np.roots(lpc_coeffs)
    # One pole per conjugate pair; real poles (imag == 0) are discarded.
    poles = poles[np.imag(poles) > 0]
    freqs = np.sort(np.angle(poles) * sr / (2 * np.pi))

    # Keep the output shape fixed so results from different signals can
    # always be compared element-wise.
    formants = np.full(n_formants, np.nan)
    found = freqs[:n_formants]
    formants[:found.size] = found
    return formants


def calculate_pitch_shift_ratio(source_f0: np.ndarray, target_f0: np.ndarray) -> float:
    """Return the ratio of mean voiced F0: target over source.

    Only frames with ``f0 > 0`` count as voiced; zeros and NaNs are ignored.

    Args:
        source_f0: F0 track of the source speaker.
        target_f0: F0 track of the target speaker.

    Returns:
        ``mean(target voiced F0) / mean(source voiced F0)``.

    Raises:
        ValueError: If either track has no voiced frame, in which case the
            ratio is undefined (it would otherwise come back as NaN).
    """
    source_f0 = np.asarray(source_f0, dtype=float)
    target_f0 = np.asarray(target_f0, dtype=float)

    source_voiced = source_f0[source_f0 > 0]
    target_voiced = target_f0[target_f0 > 0]
    if source_voiced.size == 0 or target_voiced.size == 0:
        raise ValueError(
            "cannot compute a pitch shift ratio: "
            "source and target must each contain at least one voiced (f0 > 0) frame"
        )

    return float(np.mean(target_voiced) / np.mean(source_voiced))


def align_features_dtw(source_features, target_features):
    """Align two feature sequences with dynamic time warping (DTW).

    Both inputs are transposed first, so each column of an input is treated
    as one frame.  The local cost is the Euclidean distance between frames
    and the allowed steps are vertical, horizontal and diagonal.

    Args:
        source_features: Source features, one column per frame.
        target_features: Target features, one column per frame.

    Returns:
        An integer array of shape ``(path_length, 2)``.  Each row is a
        ``(source_frame, target_frame)`` index pair, ordered from the first
        frames to the last.  The shape is ``(0, 2)`` when either input has no
        frames.
    """
    X = np.asarray(source_features).T
    Y = np.asarray(target_features).T
    n, m = len(X), len(Y)
    if n == 0 or m == 0:
        # Keep the two-column layout so callers can still index path[:, 0].
        return np.empty((0, 2), dtype=int)

    # One row per frame, also for 1-D inputs (scalar features).
    X = X.reshape(n, -1)
    Y = Y.reshape(m, -1)

    # cost[i, j] = cheapest accumulated cost of aligning X[:i] with Y[:j].
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        # Distances from source frame i-1 to every target frame in one call.
        dists = np.linalg.norm(Y - X[i - 1], axis=1)
        for j in range(1, m + 1):
            cost[i, j] = dists[j - 1] + min(
                cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1]
            )

    # Backtrack from the end; ties prefer the vertical, then horizontal step.
    i, j = n, m
    path = []
    while i > 0 and j > 0:
        path.append((i - 1, j - 1))
        step = np.argmin([cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1]])
        if step == 0:
            i -= 1
        elif step == 1:
            j -= 1
        else:
            i -= 1
            j -= 1
    return np.array(path[::-1], dtype=int)


def train_feature_mapping(source_features, target_features, n_components=8):
    """Fit a joint Gaussian mixture model on paired source/target frames.

    Both inputs are transposed so each column becomes one frame.  Frame ``t``
    of the source is concatenated with frame ``t`` of the target, and a
    diagonal-covariance ``GaussianMixture`` is fitted on the joint vectors.

    Args:
        source_features: Source features, one column per frame.
        target_features: Target features, one column per frame, already
            time-aligned with *source_features*.
        n_components: Number of mixture components.

    Returns:
        The fitted ``GaussianMixture``.

    Raises:
        ValueError: If the two inputs do not have the same number of frames.
    """
    source_frames = np.asarray(source_features).T
    target_frames = np.asarray(target_features).T
    if source_frames.shape[0] != target_frames.shape[0]:
        raise ValueError(
            "source and target must have the same number of frames, got "
            f"{source_frames.shape[0]} and {target_frames.shape[0]}; "
            "time-align them first (e.g. with DTW)"
        )

    joint = np.hstack([source_frames, target_frames])
    gmm = GaussianMixture(
        n_components=n_components, covariance_type='diag', random_state=42
    )
    gmm.fit(joint)
    return gmm


def _joint_covariances(model, n_components, n_features):
    """Expand ``model.covariances_`` to full per-component matrices."""
    cov = np.asarray(model.covariances_, dtype=float)
    kind = model.covariance_type
    if kind == "full":
        return cov
    if kind == "tied":
        return np.broadcast_to(cov, (n_components, n_features, n_features))
    if kind == "diag":
        return np.stack([np.diag(row) for row in cov])
    if kind == "spherical":
        return np.stack([np.eye(n_features) * var for var in cov])
    raise ValueError(f"unsupported covariance_type: {kind!r}")


def convert_features(model, source_features):
    """Map source features to target features with a joint GMM.

    For every source frame ``x`` the function returns the conditional
    expectation ``E[y | x]`` under *model*, a Gaussian mixture fitted on
    concatenated ``[source, target]`` vectors::

        E[y | x] = sum_k p(k | x) * (mu_y_k + S_yx_k S_xx_k^-1 (x - mu_x_k))

    The result is deterministic and depends on the values in
    *source_features*, not only on its shape.

    Args:
        model: Fitted mixture exposing ``weights_``, ``means_``,
            ``covariances_`` and ``covariance_type`` (as scikit-learn's
            ``GaussianMixture`` does).  The first ``source_features.shape[0]``
            joint dimensions are the source part, the rest the target part.
        source_features: Source features, one column per frame.

    Returns:
        Converted features of shape ``(target_dim, n_frames)``.

    Raises:
        ValueError: If the model has no target dimensions left after the
            source dimensions, or uses an unknown covariance type.
    """
    source = np.asarray(source_features, dtype=float)
    src_dim = source.shape[0]
    frames = source.T  # (n_frames, src_dim)
    n_frames = frames.shape[0]

    means = np.asarray(model.means_, dtype=float)
    weights = np.asarray(model.weights_, dtype=float)
    n_components, joint_dim = means.shape
    tgt_dim = joint_dim - src_dim
    if tgt_dim <= 0:
        raise ValueError(
            f"model has {joint_dim} joint dimensions but the source features "
            f"already use {src_dim}; no target dimensions remain"
        )
    if n_frames == 0:
        return np.empty((tgt_dim, 0))

    covariances = _joint_covariances(model, n_components, joint_dim)

    log_resp = np.empty((n_frames, n_components))
    cond_means = np.empty((n_components, n_frames, tgt_dim))
    for k in range(n_components):
        mu_x, mu_y = means[k, :src_dim], means[k, src_dim:]
        s_xx = covariances[k, :src_dim, :src_dim]
        s_yx = covariances[k, src_dim:, :src_dim]

        centered = frames - mu_x
        solved = np.linalg.solve(s_xx, centered.T)  # S_xx^-1 (x - mu_x)
        mahalanobis = np.sum(centered.T * solved, axis=0)
        _, log_det = np.linalg.slogdet(s_xx)

        with np.errstate(divide="ignore"):  # a zero weight gives log(0) = -inf
            log_weight = np.log(weights[k])
        log_resp[:, k] = log_weight - 0.5 * (
            mahalanobis + log_det + src_dim * np.log(2.0 * np.pi)
        )
        cond_means[k] = mu_y + (s_yx @ solved).T

    # Normalise the responsibilities in the log domain for stability.
    log_resp -= log_resp.max(axis=1, keepdims=True)
    resp = np.exp(log_resp)
    resp /= resp.sum(axis=1, keepdims=True)

    converted = np.einsum("tk,ktd->td", resp, cond_means)
    return converted.T


## PART C – Voice Conversion

In [None]:

def shift_pitch(audio: np.ndarray, sr: int, pitch_ratio: float) -> np.ndarray:
    """Shift the pitch of *audio* by a frequency ratio.

    The ratio is converted to semitones (``12 * log2(ratio)``) and applied
    with ``librosa.effects.pitch_shift``.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.
        pitch_ratio: Target-to-source frequency ratio; ``2.0`` raises the
            pitch by one octave and ``1.0`` leaves it unchanged.

    Returns:
        The pitch-shifted signal.

    Raises:
        ValueError: If *pitch_ratio* is not a positive finite number.
    """
    if not np.isfinite(pitch_ratio) or pitch_ratio <= 0:
        raise ValueError(
            f"pitch_ratio must be a positive finite number, got {pitch_ratio!r}"
        )
    n_steps = 12 * np.log2(pitch_ratio)
    # sr and n_steps are keyword-only in current librosa releases; passing
    # them by name also works on older versions.
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)


def convert_spectral_envelope(audio, sr, mapping_model):
    """Extract MFCCs from *audio* and pass them through the mapping model.

    Args:
        audio: One-dimensional audio signal.
        sr: Sampling rate of *audio* in Hz.
        mapping_model: Model handed to ``convert_features``.

    Returns:
        The result of ``convert_features`` for the MFCCs of *audio*.
    """
    return convert_features(mapping_model, extract_mfcc(audio, sr))


def voice_conversion_pipeline(source_audio, sr, mapping_model, pitch_ratio):
    """Pitch-shift the source audio and resynthesise it with Griffin-Lim.

    The signal is pitch-shifted by *pitch_ratio*, its STFT magnitude is
    taken, and a waveform is reconstructed from that magnitude with
    ``librosa.griffinlim``.

    NOTE(review): *mapping_model* is accepted but not used here, so no
    spectral-envelope conversion is applied to the output.  Confirm whether
    the converted features were meant to be fed into the synthesis step.

    Args:
        source_audio: One-dimensional source signal.
        sr: Sampling rate of *source_audio* in Hz.
        mapping_model: Feature mapping model (currently unused, see note).
        pitch_ratio: Target-to-source pitch ratio passed to ``shift_pitch``.

    Returns:
        The reconstructed waveform, with the same number of samples as the
        pitch-shifted signal.
    """
    pitched = shift_pitch(source_audio, sr, pitch_ratio)
    magnitude = np.abs(librosa.stft(pitched))
    # Without an explicit length the inverse STFT returns a whole number of
    # hops, which can drop trailing samples of the input.
    return librosa.griffinlim(magnitude, length=len(pitched))


## PART D – Evaluation

In [None]:

def calculate_mcd(converted_mfcc, target_mfcc, include_c0=True):
    """Compute the mel-cepstral distortion (MCD) between two MFCC sequences.

    Both sequences are truncated to the shorter one's frame count.  The
    per-frame distortion
    ``(10 / ln 10) * sqrt(2 * sum_d (c_d - c'_d) ** 2)`` is then averaged
    over frames.

    Args:
        converted_mfcc: Array-like of shape ``(n_coefficients, n_frames)``.
        target_mfcc: Array-like of shape ``(n_coefficients, n_frames)``.
        include_c0: When ``False`` the first coefficient (row 0) is left out
            of the distance.  The default keeps every coefficient.

    Returns:
        The mean MCD as a float; ``nan`` when there is no overlapping frame.
    """
    converted = np.asarray(converted_mfcc, dtype=float)
    target = np.asarray(target_mfcc, dtype=float)

    n_frames = min(converted.shape[1], target.shape[1])
    if n_frames == 0:
        # Undefined without frames; return NaN explicitly instead of relying
        # on np.mean of an empty array (which also emits a RuntimeWarning).
        return float("nan")

    first_row = 0 if include_c0 else 1
    diff = converted[first_row:, :n_frames] - target[first_row:, :n_frames]
    per_frame = np.sqrt(2.0 * np.sum(diff ** 2, axis=0))
    return float((10.0 / np.log(10.0)) * np.mean(per_frame))


def calculate_f0_correlation(converted_f0, target_f0):
    """Return the Pearson correlation of two F0 tracks over voiced frames.

    The tracks are truncated to the shorter length.  Only frames where both
    tracks are voiced (``f0 > 0``) enter the correlation.

    Args:
        converted_f0: F0 track of the converted speech.
        target_f0: F0 track of the target speech.

    Returns:
        The correlation coefficient as a float.  ``0.0`` is returned when
        the correlation is undefined: fewer than two jointly voiced frames,
        or one of the voiced tracks is constant.
    """
    converted = np.asarray(converted_f0, dtype=float).ravel()
    target = np.asarray(target_f0, dtype=float).ravel()

    # Frame counts rarely match exactly; compare the overlapping part only.
    n_frames = min(converted.size, target.size)
    converted, target = converted[:n_frames], target[:n_frames]

    voiced = (converted > 0) & (target > 0)
    if np.count_nonzero(voiced) < 2:
        return 0.0

    converted_voiced = converted[voiced]
    target_voiced = target[voiced]
    # A constant track has zero variance, for which np.corrcoef yields NaN.
    if (
        converted_voiced.max() == converted_voiced.min()
        or target_voiced.max() == target_voiced.min()
    ):
        return 0.0

    return float(np.corrcoef(converted_voiced, target_voiced)[0, 1])


def calculate_formant_rmse(converted_formants, target_formants):
    """Return the root-mean-square error between two sets of formants.

    Args:
        converted_formants: Formant frequencies of the converted speech
            (array or plain sequence).
        target_formants: Formant frequencies of the target speech
            (array or plain sequence).

    Returns:
        ``sqrt(mean((converted - target) ** 2))`` as a float.
    """
    # Converting first lets plain lists/tuples be subtracted element-wise.
    converted = np.asarray(converted_formants, dtype=float)
    target = np.asarray(target_formants, dtype=float)
    squared_error = np.square(converted - target)
    return float(np.sqrt(np.mean(squared_error)))



## PART E – Report: Analysis and Insights

Because of storage constraints, only the dev-clean subset of LibriTTS was used. Two speakers
with a sufficient number of utterances were selected: speaker 19 as the source and speaker 26
as the target. The combined pitch-shifting and spectral-mapping approach produced intelligible
converted speech with a noticeable change in speaker identity.
