# Set-Up

In [2]:
!pip install allosaurus

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting allosaurus
  Downloading allosaurus-1.0.2-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting resampy (from allosaurus)
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting panphon (from allosaurus)
  Downloading panphon-0.20.0-py2.py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting unicodecsv (from panphon->allosaurus)
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munkres (from panphon->allosaurus)
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Building wheels for

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# SET TO PROJECT FOLDER
dir = '/content/drive/MyDrive/cse256'

# Features DF

In [5]:
import pandas as pd

meta = pd.read_csv('drive/MyDrive/cse256/speakers_all.csv')
display(meta)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2167,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,766,nigeria,False,,,
2168,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,851,nigeria,False,,,
2169,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,2023,nigeria,False,,,
2170,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,571,usa,False,,,


# Load in Speaker-to-Phonemes dictionary

In [7]:
import pickle

speaker_to_phonemes = pickle.load(open(dir + '/speakers_to_phonemes_dict.pickle', 'rb'))

# Get list of unique IPA phoneme vocab

In [None]:
all_phonemes = [item for sublist in speaker_to_phonemes.values() for item in sublist]
unique_phonemes = list(set(all_phonemes))

phone_code = {}
num = 1
for phon in unique_phonemes:
  phone_code[phon] = num
  num += 1

phone_code

# Convert phonemes into BoW feature vector

In [27]:
import glob, pickle

lang_list = ['mandarin', 'italian']

speakers_to_bow = {}
speaker_labels = {}
for speaker in speaker_to_phonemes.keys():
  for lang in lang_list:
    if lang in speaker:
      phonemes = speaker_to_phonemes[speaker]

      phone_bow = []
      for phon in phone_code.keys():
        count = phonemes.count(phon)
        phone_bow.append(count)

      speakers_to_bow[speaker] = phone_bow
      speaker_labels[speaker] = lang

In [28]:
from sklearn.decomposition import PCA

values = list(speakers_to_bow.values())
names = list(speakers_to_bow.keys())
langs = [speaker_labels[n] for n in names]

pca = PCA(n_components=3)
components = pca.fit_transform(values)

In [29]:
import plotly.express as px

fig = px.scatter_3d(components, x=0, y=1, z=2, color=langs)
fig.show()

In [31]:
fig = px.scatter_matrix(
    components,
    labels=langs,
    dimensions=range(3),
    color=langs
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [32]:
import numpy as np

lang_to_bows = {}
for lang in lang_list:

  if lang not in lang_to_bows:
    lang_to_bows[lang] = []

  for s in speakers_to_bow:
    if lang in s:
      lang_to_bows[lang].append(speakers_to_bow[s])
      break


lang_to_centroid = {}
for lang in lang_to_bows:
  centroid = np.mean(np.asarray(lang_to_bows[lang]))
  print(lang, " centroid: ", centroid)

mandarin  centroid:  5.9125
italian  centroid:  5.85


# Top N Languages

In [33]:
lang_list = ['english', 'spanish', 'mandarin', 'french', 'korean', 'portuguese', 'russian', 'dutch', 'turkish']

speakers_to_bow = {}
speaker_labels = {}
for speaker in speaker_to_phonemes.keys():
  for lang in lang_list:
    if lang in speaker:
      phonemes = speaker_to_phonemes[speaker]

      phone_bow = []
      for phon in phone_code.keys():
        count = phonemes.count(phon)
        phone_bow.append(count)

      speakers_to_bow[speaker] = phone_bow
      speaker_labels[speaker] = lang

In [34]:
values = list(speakers_to_bow.values())
names = list(speakers_to_bow.keys())
langs = [speaker_labels[n] for n in names]

pca = PCA(n_components=3)
components = pca.fit_transform(values)

In [35]:
fig = px.scatter_3d(components, x=0, y=1, z=2, color=langs)
fig.show()

In [36]:
lang_to_bows = {}
for lang in lang_list:

  if lang not in lang_to_bows:
    lang_to_bows[lang] = []

  for s in speakers_to_bow:
    if lang in s:
      lang_to_bows[lang].append(speakers_to_bow[s])
      break


lang_to_centroid = {}
for lang in lang_to_bows:
  centroid = np.mean(np.asarray(lang_to_bows[lang]))
  print(lang, " centroid: ", centroid)

english  centroid:  5.9625
spanish  centroid:  6.1375
mandarin  centroid:  5.9125
french  centroid:  5.975
korean  centroid:  6.225
portuguese  centroid:  6.225
russian  centroid:  5.875
dutch  centroid:  5.8375
turkish  centroid:  6.425
