<a href="https://colab.research.google.com/github/mhuckvale/voice/blob/main/Globe_PCA_Demonstration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demonstrate FreeVC with Deep-Speaker embedding trained on Globe sample dataset


# Configuration

In [1]:
# Start from the Colab content root so the FreeVC checkout lands at /content/FreeVC,
# the absolute path that later cells refer to.
%cd /content
# Install the PyTorch packages and webrtcvad.
!pip3 install torch torchaudio torchvision torchtext torchdata webrtcvad
# Remove any earlier checkout so that re-running this cell starts from a clean clone.
!rm -rf FreeVC
!git clone https://github.com/OlaWod/FreeVC.git
# Later cells use paths relative to the FreeVC checkout (e.g. configs/freevc.json).
%cd FreeVC
!pwd

/content
Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torchdata
  Downloading torchdata-0.11.0-py3-none-any.whl.metadata (6.3 kB)
Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9

# 1. Set up Python environment

In [2]:
import math
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import HBox, VBox, Layout
from IPython.display import Audio
%matplotlib inline


# 2. Load the FreeVC model and weights trained on Globe

In [3]:
!rm -rf voice
!git clone https://github.com/mhuckvale/voice.git

%cd /content/FreeVC
!cp /content/FreeVC/voice/FreeVC/freevc.py .
!mkdir checkpoints
!cp /content/FreeVC/voice/FreeVC/checkpoints/*.pt* checkpoints

# download the voice conversion model
!wget -O checkpoints/Globe10k_200000.pth https://avatartherapy.co.uk/download/Globe10k_200000.pth

# download wavlm
!wget -O wavlm/WavLM-Large.pt https://avatartherapy.co.uk/download/WavLM-Large.pt

# use FreeVC to apply speaker embedding to an audio file
import os
from types import SimpleNamespace
from freevc import FreeVC

args = SimpleNamespace()
args.hpfile="configs/freevc.json"
args.ptfile="checkpoints/Globe10k_200000.pth"
args.spfile='checkpoints/pretrained_bak_5805000.pt'
args.outdir="output"
print(args)

os.makedirs(args.outdir, exist_ok=True)
freevc=FreeVC()
freevc.load(args)

Cloning into 'voice'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 36 (delta 8), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (36/36), 16.18 MiB | 7.63 MiB/s, done.
Resolving deltas: 100% (8/8), done.
/content/FreeVC
checkpoints/Globe10k_200000.pth: Not a directory
--2025-05-01 14:03:47--  https://avatartherapy.co.uk/download/WavLM-Large.pt
Resolving avatartherapy.co.uk (avatartherapy.co.uk)... 185.151.30.205, 2a07:7800::205
Connecting to avatartherapy.co.uk (avatartherapy.co.uk)|185.151.30.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1261965425 (1.2G)
Saving to: ‘wavlm/WavLM-Large.pt’

wavlm/WavLM-Large.p  23%[===>                ] 287.53M  31.3MB/s    in 10s     

2025-05-01 14:03:58 (28.4 MB/s) - Read error at byte 301495752/1261965425 (Connection reset by peer). Retrying.

--2025-05-01 14:03:59--  (try: 2)  https

  WeightNorm.apply(module, name, dim)


Loading checkpoint...


AssertionError: 

# 3. Load scaling factors and PCA loadings for VQ parameters

In [None]:
# Copy the scaling table and the PCA loadings from the cloned voice repository
# into the working directory so they can be read by relative name below.
!cp /content/FreeVC/voice/FreeVC/globe-params-scale.txt .
!cp /content/FreeVC/voice/FreeVC/globe-params-pca.txt .


# VQ parameter scaling
# (later cells read the PARAMETER, SHIFT and SCALE columns of this table)
norm=pd.read_csv("globe-params-scale.txt")
print(norm)
# VQ parameter PCA
# (calculate_vqparams indexes this table positionally as components.iloc[component, parameter])
components=pd.read_csv("globe-params-pca.txt")
print(components)



# 4. Calculate the VQ parameters from principal components

In [None]:
def calculate_vqparams(pca):
    """Project principal-component values back onto the VQ parameters.

    pca is a sequence of component values. Entry k of the returned list is
    the sum over components of pca[c] * components.iloc[c, k], with one
    entry per row of the norm table.
    """
    n_params = len(norm.index)
    totals = [0] * n_params
    for row, weight in enumerate(pca):
        # add this component's contribution to every parameter
        for col in range(n_params):
            totals[col] += weight * components.iloc[row, col]
    return totals

# 5. Set up sliders for each VQ parameter

In [None]:
# One slider per row of the scaling table: centred on SHIFT, reaching three
# SCALE units to either side, and moving in steps of a tenth of SCALE.
params=[]
param_sliders=[]
for index, row in norm.iterrows():
    name=row['PARAMETER']
    mean=row['SHIFT']
    sd=row['SCALE']
    params.append(name)
    param_sliders.append(
        widgets.FloatSlider(
            value=mean,
            min=mean-3*sd,
            max=mean+3*sd,
            step=sd/10,
            description=name+':',
            readout_format='.2f',
        )
    )

# 6. Function to update VQ parameter sliders given PCA

In [None]:
def update_params(pca):
    """Move the raw-parameter sliders to the values implied by pca.

    The principal-component values are converted with calculate_vqparams and
    each result is mapped onto its slider as SHIFT + SCALE * value.
    """
    shifts = norm['SHIFT']
    scales = norm['SCALE']
    # calculate_vqparams returns one entry per row of norm, so j walks
    # exactly the rows of the scaling table
    for j, offset in enumerate(calculate_vqparams(pca)):
        param_sliders[j].value = shifts[j] + scales[j] * offset

# 7. Use MLP to predict speaker embedding from VQ parameters

In [None]:
# use trained MLP to convert vq embedding to FreeVC speaker embedding
# Copy the MLP implementation and its saved model file from the cloned voice
# repository into the working directory, where the import and load below look for them.
!cp /content/FreeVC/voice/FreeVC/mlp.py .
!cp /content/FreeVC/voice/FreeVC/globevqnorm2xvec.mlp .

# mlp.py only becomes importable after the copy above
from mlp import MLP
mlp=MLP()
mlp.load("globevqnorm2xvec.mlp")
#
def predict_speaker(params):
    """Return the speaker embedding the loaded MLP produces for params."""
    return mlp.forward(params)



# 8. Build sliders for first 6 principal components of VQ params

In [None]:
# observer called whenever a principal-component slider changes value
def value_change(change):
    """Show which slider moved and refresh the raw-parameter sliders.

    change is the notification dict passed by observe; its 'owner' is the
    slider that changed and 'new' is the new value.
    """
    source = change['owner']
    pcatext.value = source.description + '=' + str(change['new'])
    # recompute from the current position of every component slider
    update_params([s.value for s in pca_sliders])

# status line written to by the slider observer and by the conversion routine
pcatext=widgets.Text(value='',placeholder='',description='Status:',disabled=False)

# six principal-component sliders, PCA1..PCA6, each reporting changes to value_change
pca_sliders=[]
for i in range(6):
    slider=widgets.FloatSlider(
        value=0.,
        min=-5.0,
        max=5.0,
        step=0.5,
        description='PCA'+str(i+1),
        readout_format='.2f',
    )
    slider.observe(value_change,names='value')
    pca_sliders.append(slider)

# reset button
reset_button = widgets.Button(description='Reset')
def on_reset(b):
    """Return every principal-component slider to zero.

    b is the argument supplied by on_click; it is not used.
    """
    for slider in pca_sliders:
        # NOTE(review): the throw-away 0.1 presumably forces a value change, and so a
        # value_change callback, even when the slider is already at 0 - confirm.
        slider.value=0.1
        slider.value=0
reset_button.on_click(on_reset)

# PCA Synthesis button
def on_pcago(b):
    """Click handler: run the conversion from the principal-component sliders."""
    runconversion(isvq=0)

pcago_button = widgets.Button(description='Go PCA')
pcago_button.on_click(on_pcago)

# VQ Synthesis button
def on_vqgo(b):
    """Click handler: run the conversion from the raw-parameter sliders."""
    runconversion(isvq=1)

vqgo_button = widgets.Button(description='Go VQ')
vqgo_button.on_click(on_vqgo)

# 9. Run voice conversion from VQ or PCA values

In [None]:
!cp /content/FreeVC/voice/FreeVC/*.wav .

# select audio
import glob
wavlist=glob.glob("*.wav")
sndlist=[ s.replace(".wav","") for s in wavlist]
options=list(zip(sndlist,wavlist))
wavselect=widgets.Dropdown(options=options,value='whitelight.wav',description="Audio",disabled=False)

from IPython.display import Audio, clear_output
# bordered output area; runconversion runs inside it and displays the audio player there
output4 = widgets.Output(layout={'border': '1px solid black'})

def runconversion(isvq=0):
    """Convert the selected recording and play the result inside output4.

    With a truthy isvq the normalised VQ parameters are read back from the
    raw-parameter sliders as (value - SHIFT) / SCALE; otherwise they are
    computed from the principal-component sliders with calculate_vqparams.
    Each stage is announced through the pcatext status box.
    """
    with output4:
        if not isvq:
            # principal-component route: slider values -> VQ parameters
            component_values=[s.value for s in pca_sliders]
            pcatext.value="calculate vq params"
            vqparams=calculate_vqparams(component_values)
        else:
            # raw-parameter route: undo the SHIFT/SCALE mapping of each slider
            vqparams=[
                (s.value-norm['SHIFT'][k])/norm['SCALE'][k]
                for k,s in enumerate(param_sliders)
            ]
        # VQ parameters -> speaker embedding via the MLP
        pcatext.value="calculate speaker embedding"
        speaker_embedding=predict_speaker(vqparams)
        # apply the embedding to the chosen recording
        pcatext.value="voice conversion started"
        freevc.convert(wavselect.value,speaker_embedding,'out.wav')
        # NOTE(review): convert is given 'out.wav' while playback reads 'output/out.wav';
        # presumably convert writes beneath the configured output directory - confirm.
        pcatext.value="replaying"
        clear_output(wait=True)
        display(Audio('output/out.wav',autoplay=True))

# last expression of the cell: show the output area that runconversion writes into
output4

# 10. Create the user interface

In [None]:
# layout: two bordered columns side by side, each half the available width
box_layout = Layout(display='flex', flex_flow='column', align_items='center',border='solid', width='50%')
title1=widgets.HTML("<h2>Raw Voice Parameters</h2>")
title2=widgets.HTML("<h2>Principal Components</h2>")

# left column: principal-component sliders, audio selector, Reset / Go PCA
pca_panel = VBox(
    [title2, *pca_sliders, wavselect, HBox([reset_button, pcago_button])],
    layout=box_layout,
)
# right column: raw-parameter sliders, Reset / Go VQ (the same Reset button is shown in both columns)
vq_panel = VBox(
    [title1, *param_sliders, HBox([reset_button, vqgo_button])],
    layout=box_layout,
)

HBox(children=[pca_panel, vq_panel])
