# Set-up

In [3]:
# Colab-only: mount Google Drive at /content/drive so that the project folder
# under MyDrive (used by the path cells below) is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Project root on the mounted Drive. Change this to point at your own copy.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells in this notebook read it.
dir = '/content/drive/MyDrive/cse256'

# Folder holding the pickled wav2vec2 feature files, derived from the root.
folder = f"{dir}/wav2vec2_features"
folder

'/content/drive/MyDrive/cse256/wav2vec2_features'

In [3]:
# Shell escape: unpack the feature archive stored in the Drive project folder.
# NOTE(review): no `-d` target is given, so unzip extracts into the current
# working directory, while the feature glob later in this notebook reads from
# the Drive folder -- confirm which copy is meant to be used.
! unzip '/content/drive/MyDrive/cse256/wav2vec2_features.zip'

# Get features from .wav files


Tutorial: [Speech Recognition with Wav2Vec2](https://pytorch.org/audio/stable/tutorials/speech_recognition_pipeline_tutorial.html)

In [5]:
import IPython
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset

In [6]:
import glob
# speech_files = glob.glob("output/*.wav")

In [6]:
import torch
import torchaudio

# Report the installed library versions alongside the results.
for _lib in (torch, torchaudio):
  print(_lib.__version__)

# Seed torch's global random number generator.
torch.random.manual_seed(0)

# All tensors in this notebook are placed on the CPU; the CUDA auto-detect
# variant is kept below for reference.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)

2.0.1+cu118
2.0.2+cu118
cpu


In [7]:
# Model setup (currently disabled). The commented lines below load the
# pretrained WAV2VEC2_ASR_BASE_960H pipeline and move its model to `device`.
# NOTE: `file_to_features` in this notebook reads the globals `bundle` and
# `model` that would be defined here, so this cell must be re-enabled and run
# before features can be extracted from audio again.

# bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

# print("Sample Rate:", bundle.sample_rate)

# print("Labels:", bundle.get_labels())

# model = bundle.get_model().to(device)

# print(model.__class__)

Sample Rate: 16000
Labels: ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
<class 'torchaudio.models.wav2vec2.model.Wav2Vec2Model'>


In [8]:
def file_to_features(file_path):
  """Extract wav2vec2 features from a single audio file.

  Loads the file with ``torchaudio.load``, mixes multi-channel audio down to
  mono, resamples to ``bundle.sample_rate`` when the file's rate differs, and
  runs ``model.extract_features`` with autograd disabled.

  Relies on the notebook globals ``device``, ``bundle`` and ``model``; they
  must be defined before this function is called.

  Args:
    file_path: Path to an audio file readable by ``torchaudio.load``.

  Returns:
    The first element of the tuple returned by ``model.extract_features``
    (per the torchaudio docs, a list with one feature tensor per layer).
  """
  waveform, sample_rate = torchaudio.load(file_path)

  # torchaudio.load yields (channels, frames) while the model interprets the
  # first dimension as the batch. Without a mono downmix, a stereo file would
  # be processed as two separate utterances.
  if waveform.dim() == 2 and waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

  waveform = waveform.to(device)

  # The pretrained model expects audio at the bundle's sample rate.
  if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

  # Inference only: no gradients are needed for feature extraction.
  with torch.inference_mode():
    features, _ = model.extract_features(waveform)

  return features

In [7]:
import pickle
import os
import numpy as np

# GET FEATURES FROM WAV FILES

# for i, f in enumerate(speech_files):
#   name = f.split(".")[0].split('/')[1]
#   file_name = dir + '/wav2vec2_features/' + name + '.pickle'
#   print('Audio ', i, ': ', file_name)

#   if not os.path.exists(file_name):
#     feat = file_to_features(f)

#     feat = [t.reshape((t.shape[1], t.shape[2])) for t in feat]
#     stacked_feat = torch.stack(feat)
#     features_np = stacked_feat.numpy()

#     average_features = np.mean( np.array(np.split(features_np, 12, axis=0)), axis=0)
#     average_features = average_features.reshape((average_features.shape[1], average_features.shape[2]))

#     with open(file_name, 'wb') as handle:
#       pickle.dump(average_features, handle)

# Features DF

In [8]:
import pandas as pd

# Speaker metadata table. The path is built from the project root `dir`
# (like the other paths in this notebook) instead of a path relative to the
# current working directory, so the cell does not depend on where the kernel
# was started.
meta = pd.read_csv(dir + '/speakers_all.csv')
display(meta)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2167,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,766,nigeria,False,,,
2168,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,851,nigeria,False,,,
2169,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,2023,nigeria,False,,,
2170,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,571,usa,False,,,


# Load in audio features

In [9]:
# Paths of all pickled feature files in the feature folder. glob returns
# matches in arbitrary (filesystem) order, so the list is sorted to make any
# order-dependent selection downstream (e.g. a per-language speaker cap)
# reproducible between runs.
wav2vec_list = sorted(glob.glob(folder + "/*.pickle"))

In [None]:
import pickle

# Languages compared in this experiment; a file belongs to a language when the
# language name occurs in its file name.
lang_list = ['mandarin', 'italian']

# name (file stem) -> feature array, and name -> language label.
speaker_to_embeddings = {}
speaker_labels = {}
for f in wav2vec_list:
  # File stem: last path component up to its first '.'.
  name = f.split('/')[-1].split('.')[0]
  for lang in lang_list:
    if lang in name:
      # NOTE(security): pickle.load can execute arbitrary code; only load
      # feature files that this project produced itself.
      # The context manager closes the file handle after each load.
      with open(f, 'rb') as handle:
        speaker_features = pickle.load(handle)
      speaker_to_embeddings[name] = speaker_features
      speaker_labels[name] = lang
      print(lang)
      break  # first matching language wins

In [9]:
# PAD FEATURES TO MAKE SAME LENGTH
import numpy as np

# Longest first-axis length over all loaded speakers. Stored as `max_len` so
# the builtin max() is not overwritten for the rest of the kernel session, and
# computed with an explicit loop so the cell does not depend on the builtin
# name `max` still being intact in the kernel.
max_len = 0
for emb in speaker_to_embeddings.values():
  if emb.shape[0] > max_len:
    max_len = emb.shape[0]

print("Max embedding length: ", max_len)

# name -> 1-D vector: zero rows are prepended along axis 0 until every array
# has `max_len` rows, then the 2-D array is flattened.
speaker_to_padded = {}
flattened = None
for s, emb in speaker_to_embeddings.items():
  diff = max_len - emb.shape[0]
  padded = np.pad(emb, ((diff, 0), (0, 0)), 'constant')
  flattened = padded.flatten()
  speaker_to_padded[s] = flattened

# Guarded so an empty input reports clearly instead of raising NameError or
# printing a stale shape left over from an earlier run.
if flattened is not None:
  print("Padded shape: ", flattened.shape)
else:
  print("No speaker embeddings loaded; nothing to pad.")

Max embedding length:  2779
Padded shape:  (2134272,)


# Clustering

In [10]:
from sklearn.decomposition import PCA

# Parallel lists: padded feature vector, speaker name and language label.
values = list(speaker_to_padded.values())
names = list(speaker_to_padded.keys())
langs = [speaker_labels[n] for n in names]

# Project every speaker onto the first three principal components.
# random_state is fixed because, with the default svd_solver='auto',
# scikit-learn may select the randomized solver for a matrix this wide, which
# would otherwise make the projection vary from run to run.
pca = PCA(n_components=3, random_state=0)
components_3 = pca.fit_transform(values)

In [11]:
import plotly.express as px

# 3-D scatter of the three principal components, coloured by language.
# Array input gets the column names '0', '1', '2'; `labels` maps them (and the
# colour legend) to readable names so the figure can stand on its own.
fig = px.scatter_3d(
    components_3, x=0, y=1, z=2, color=langs,
    title='PCA of wav2vec2 features: ' + ', '.join(sorted(set(langs))),
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Language'},
)
fig.show()

In [12]:
# Pairwise scatter plots of the three principal components, coloured by
# language. `labels` must be a dict mapping column names to display labels;
# passing the list of language names there had no useful effect, so proper
# axis and legend labels are supplied instead.
fig = px.scatter_matrix(
    components_3,
    dimensions=range(3),
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Language'},
    color=langs
)
# The diagonal panels would plot each component against itself; hide them.
fig.update_traces(diagonal_visible=False)
fig.show()

In [13]:
# Collect, per language, the vectors of every speaker whose name contains the
# language string.
# NOTE(review): the vectors come from `values` (the padded, flattened
# features), not from the PCA output, even though the names say "comps" --
# confirm which space the centroid distance is meant to be measured in.
lang_to_comps = {}
for lang in lang_list:
  bucket = lang_to_comps.setdefault(lang, [])
  bucket.extend(comp for name, comp in zip(names, values) if lang in name)

# Centroid of a language = element-wise average of its speakers' vectors.
lang_to_centroid = {
    lang: np.average(np.asarray(vectors), axis=0)
    for lang, vectors in lang_to_comps.items()
}

# Euclidean distance between the two language centroids.
centroid_gap = lang_to_centroid['italian'] - lang_to_centroid['mandarin']
mandarin_italian_dist = np.linalg.norm(centroid_gap)
print("Mandarin-Italian Centroid Distance: ", mandarin_italian_dist)

Mandarin-Italian Centroid Distance:  56.86651


# Top N Languages


In [None]:
import pickle

# Languages compared in this experiment; a file belongs to a language when the
# language name occurs in its file name.
lang_list = ['english', 'mandarin', 'french', 'korean', 'russian', 'turkish']

# Upper bound on the number of speakers loaded per language.
MAX_SPEAKERS_PER_LANG = 50

# name (file stem) -> feature array, name -> language label, and the number of
# speakers loaded so far for each language.
speaker_to_embeddings = {}
speaker_labels = {}
speaker_to_count = {lang: 0 for lang in lang_list}

for f in wav2vec_list:
  # File stem: last path component up to its first '.'.
  name = f.split('/')[-1].split('.')[0]
  for lang in lang_list:
    # Strict '<' keeps each language at MAX_SPEAKERS_PER_LANG speakers; the
    # previous '<= 50' check admitted a 51st speaker.
    if (lang in name) and (speaker_to_count[lang] < MAX_SPEAKERS_PER_LANG):
      # NOTE(security): pickle.load can execute arbitrary code; only load
      # feature files that this project produced itself.
      # The context manager closes the file handle after each load.
      with open(f, 'rb') as handle:
        speaker_features = pickle.load(handle)
      speaker_to_embeddings[name] = speaker_features
      speaker_labels[name] = lang
      speaker_to_count[lang] += 1
      print(lang)
      break  # first matching language with remaining quota wins

In [11]:
# PAD FEATURES TO MAKE SAME LENGTH

import numpy as np

# Longest first-axis length over all loaded speakers. Stored as `max_len` so
# the builtin max() is not overwritten for the rest of the kernel session, and
# computed with an explicit loop so the cell does not depend on the builtin
# name `max` still being intact in the kernel.
max_len = 0
for emb in speaker_to_embeddings.values():
  if emb.shape[0] > max_len:
    max_len = emb.shape[0]

print("Max embedding length: ", max_len)

# name -> 1-D vector: zero rows are prepended along axis 0 until every array
# has `max_len` rows, then the 2-D array is flattened.
speaker_to_padded = {}
flattened = None
for s, emb in speaker_to_embeddings.items():
  diff = max_len - emb.shape[0]
  padded = np.pad(emb, ((diff, 0), (0, 0)), 'constant')
  flattened = padded.flatten()
  speaker_to_padded[s] = flattened

# Guarded so an empty input reports clearly instead of raising NameError or
# printing a stale shape left over from an earlier run.
if flattened is not None:
  print("Padded shape: ", flattened.shape)
else:
  print("No speaker embeddings loaded; nothing to pad.")

Max embedding length:  3342
Padded shape:  (2566656,)


In [None]:
from sklearn.decomposition import PCA

# Parallel lists: padded feature vector, speaker name and language label.
values = list(speaker_to_padded.values())
names = list(speaker_to_padded.keys())
langs = [speaker_labels[n] for n in names]

# Project every speaker onto the first three principal components.
# random_state is fixed because, with the default svd_solver='auto',
# scikit-learn may select the randomized solver for a matrix this wide, which
# would otherwise make the projection vary from run to run.
pca = PCA(n_components=3, random_state=0)
components = pca.fit_transform(values)

In [None]:
import plotly.express as px

# 3-D scatter of the three principal components, coloured by language.
# Array input gets the column names '0', '1', '2'; `labels` maps them (and the
# colour legend) to readable names so the figure can stand on its own.
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=langs,
    title='PCA of wav2vec2 features: ' + ', '.join(sorted(set(langs))),
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Language'},
)
fig.show()

In [None]:
# Pairwise scatter plots of the three principal components, coloured by
# language. `labels` must be a dict mapping column names to display labels;
# passing the list of language names there had no useful effect, so proper
# axis and legend labels are supplied instead.
fig = px.scatter_matrix(
    components,
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Language'},
    dimensions=range(3),
    color=langs
)
# The diagonal panels would plot each component against itself; hide them.
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# Collect, per language, the vectors of every speaker whose name contains the
# language string.
# NOTE(review): the vectors come from `values` (the padded, flattened
# features), not from the PCA output, even though the names say "comps" --
# confirm which space the centroid distances are meant to be measured in.
lang_to_comps = {}
for lang in lang_list:
  bucket = lang_to_comps.setdefault(lang, [])
  bucket.extend(comp for name, comp in zip(names, values) if lang in name)

# Centroid of a language = element-wise average of its speakers' vectors.
lang_to_centroid = {
    lang: np.average(np.asarray(vectors), axis=0)
    for lang, vectors in lang_to_comps.items()
}

# Report each non-English centroid's Euclidean distance to the English one.
english_centroid = lang_to_centroid['english']
for lang, centroid in lang_to_centroid.items():
  if lang == 'english':
    continue
  distance = np.linalg.norm(centroid - english_centroid)
  print(lang.upper(), ' distance: ', distance)