<a href="https://colab.research.google.com/github/mhuckvale/voice/blob/main/Embedding_PCA_Demonstration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demonstrate FreeVC with Principal Components of a Deep-Speaker embedding trained on the Globe sample dataset


# Configuration

In [1]:
%cd /content
!pip3 install torch torchaudio torchvision torchtext torchdata webrtcvad
!rm -rf FreeVC
!git clone https://github.com/OlaWod/FreeVC.git
%cd FreeVC
!pwd

/content
Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torchdata
  Downloading torchdata-0.11.0-py3-none-any.whl.metadata (6.3 kB)
Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9

# 1. Set up Python environment

In [2]:
import math
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import HBox, VBox, Layout
from IPython.display import Audio
%matplotlib inline


# 2. Load the FreeVC model and weights trained on Globe

In [4]:
!rm -rf voice
!git clone https://github.com/mhuckvale/voice.git

%cd /content/FreeVC
!cp /content/FreeVC/voice/FreeVC/freevc.py .
!mkdir checkpoints
!cp /content/FreeVC/voice/FreeVC/checkpoints/*.pt* checkpoints

# download the voice conversion model
!wget -O checkpoints/G_30000.pth https://avatartherapy.co.uk/download/G_30000.pth

# download wavlm
!wget -O wavlm/WavLM-Large.pt https://avatartherapy.co.uk/download/WavLM-Large.pt

# use FreeVC to apply speaker embedding to an audio file
import os
from types import SimpleNamespace
from freevc import FreeVC

args = SimpleNamespace()
args.hpfile="configs/freevc.json"
args.ptfile="checkpoints/G_30000.pth"
args.spfile='checkpoints/pretrained_bak_5805000.pt'
args.outdir="output"
print(args)

os.makedirs(args.outdir, exist_ok=True)
freevc=FreeVC()
freevc.load(args)

Cloning into 'voice'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 73 (delta 27), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (73/73), 16.36 MiB | 7.63 MiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/FreeVC
--2025-05-09 14:03:49--  https://avatartherapy.co.uk/download/G_30000.pth
Resolving avatartherapy.co.uk (avatartherapy.co.uk)... 185.151.30.205, 2a07:7800::205
Connecting to avatartherapy.co.uk (avatartherapy.co.uk)|185.151.30.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 472747530 (451M)
Saving to: ‘checkpoints/G_30000.pth’


2025-05-09 14:04:13 (19.2 MB/s) - ‘checkpoints/G_30000.pth’ saved [472747530/472747530]

--2025-05-09 14:04:13--  https://avatartherapy.co.uk/download/WavLM-Large.pt
Resolving avatartherapy.co.uk (avatartherapy.co.uk)... 185.151.30.205, 2a07:7800::205
Connecting to avatartherapy.

  WeightNorm.apply(module, name, dim)


Loading checkpoint...
Loading WavLM for content...
Loading speaker encoder...
Loaded the voice encoder model on cuda in 0.09 seconds.


# 3. Load PCA loadings for the speaker embedding

In [19]:
!cp /content/FreeVC/voice/FreeVC/globe-embed256-pcacomp.txt .


# VQ parameter PCA
components=np.loadtxt("globe-embed256-pcacomp.txt")
print(components)



[[ 0.04894757  0.01457478 -0.00033301 ... -0.05568676  0.0935198
  -0.0265537 ]
 [-0.00524807 -0.033582    0.03436786 ...  0.06803081 -0.06672643
   0.00078232]
 [-0.01579067  0.02275959 -0.08456072 ... -0.17109183 -0.00400106
  -0.03697687]
 ...
 [-0.04258716  0.06388158  0.08685379 ... -0.10348468  0.09588664
   0.06172095]
 [ 0.05119214 -0.03083188 -0.04197362 ... -0.01063311 -0.03944961
  -0.0287426 ]
 [-0.08024214  0.04484433  0.03005735 ... -0.08178106 -0.08418895
   0.03610608]]


# 4. Calculate the speaker embedding from principal components

In [32]:
def calculate_embedding(pca, loadings=None):
    """Turn principal-component scores into an embedding vector.

    The result is the weighted sum of loading rows,
    ``sum_i pca[i] * loadings[i, :]``, computed as one matrix product
    instead of an element-by-element Python loop.

    Parameters
    ----------
    pca : sequence of float
        Scores of the leading principal components, in order. Any number of
        scores up to the number of rows in ``loadings`` is accepted.
    loadings : 2-D array-like, optional
        One row of loadings per principal component. When omitted, the
        notebook-level ``components`` array is used.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array with one value per column of ``loadings``.

    Raises
    ------
    ValueError
        If ``loadings`` is not 2-D, or more scores are supplied than there
        are rows in ``loadings``.
    """
    if loadings is None:
        loadings = components
    loadings = np.asarray(loadings, dtype=np.float64)
    scores = np.asarray(pca, dtype=np.float64).reshape(-1)
    if loadings.ndim != 2:
        raise ValueError("loadings must be a 2-D array (components x dimensions)")
    if scores.size > loadings.shape[0]:
        raise ValueError(
            f"got {scores.size} PCA scores but only {loadings.shape[0]} components are available"
        )
    # accumulate in float64 and round once, then hand back float32 as before
    pvalues = scores @ loadings[:scores.size]
    return pvalues.astype(np.float32)

# 5. Build sliders for first 8 principal components of embedding

In [36]:
# build sliders
pca_sliders=[]
for i in range(8):
    slider=widgets.FloatSlider(value=0.,min=-5.0,max=5.0,step=0.5,description='PCA'+str(i+1),readout_format='.2f')
    pca_sliders.append(slider)
pcatext=widgets.Text(value='',placeholder='',description='Status:',disabled=False)

# reset button
reset_button = widgets.Button(description='Reset')
def on_reset(b):
    for slider in pca_sliders:
        slider.value=0.1
        slider.value=0
reset_button.on_click(on_reset)

# PCA Synthesis button
pcago_button = widgets.Button(description='Go PCA')
def on_pcago(b):
    runconversion()
pcago_button.on_click(on_pcago)


# 9. Run voice conversion from PCA values

In [37]:
!cp /content/FreeVC/voice/FreeVC/*.wav .

# select audio
import glob
wavlist=glob.glob("*.wav")
sndlist=[ s.replace(".wav","") for s in wavlist]
options=list(zip(sndlist,wavlist))
wavselect=widgets.Dropdown(options=options,value='whitelight.wav',description="Audio",disabled=False)

from IPython.display import Audio, clear_output
output4 = widgets.Output(layout={'border': '1px solid black'})

def runconversion():
    """Convert the selected audio using the current PCA slider settings.

    Reads the slider values, builds a speaker embedding from them, runs
    ``freevc.convert`` on the file chosen in ``wavselect`` and shows an
    auto-playing audio player inside ``output4``. Progress is reported
    through ``pcatext``; if any step fails, the status shows the error
    instead of staying stuck on the last progress message, and the exception
    is re-raised so it still surfaces in the output area.
    """
    with output4:
        try:
            # get PCA slider values
            pcavalues=[slider.value for slider in pca_sliders]
            # create the embedding from the PCA
            pcatext.value="calculate embedding"
            speaker_embedding=calculate_embedding(pcavalues)
            # perform conversion
            pcatext.value="voice conversion started"
            freevc.convert(wavselect.value,speaker_embedding,'out.wav')
            # replay audio
            pcatext.value="replaying"
            # wait=True keeps the previous player until the new one is ready
            clear_output(wait=True)
            # NOTE(review): presumably freevc.convert writes 'out.wav' into the
            # 'output' directory -- confirm against freevc.py.
            player = Audio('output/out.wav',autoplay=True)
            display(player)
        except Exception as err:
            pcatext.value="failed: "+str(err)
            raise

output4

Output(layout=Layout(border='1px solid black'))

# 10. Create the user interface

In [38]:
# layout
title=widgets.HTML("<h2>Principal Components</h2>")
box_layout = Layout(display='flex', flex_flow='column', align_items='center',border='solid', width='50%')

HBox(children=[
    VBox([title,*pca_sliders,wavselect,HBox([reset_button,pcago_button])],layout=box_layout),
])


HBox(children=(VBox(children=(HTML(value='<h2>Principal Components</h2>'), FloatSlider(value=0.0, description=…