# Set-Up

In [None]:
!pip install allosaurus

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting allosaurus
  Downloading allosaurus-1.0.2-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting resampy (from allosaurus)
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting panphon (from allosaurus)
  Downloading panphon-0.20.0-py2.py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting unicodecsv (from panphon->allosaurus)
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munkres (from panphon->allosaurus)
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Building wheels for

In [None]:
# Attach Google Drive to this Colab runtime so the project files can be read
# from the local filesystem.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# SET TO PROJECT FOLDER
# Folder on the mounted Drive that holds the project's data files.
PROJECT_DIR = '/content/drive/MyDrive/cse256'

# Backward-compatible alias: other cells read the folder from `dir`.
# NOTE: this name shadows Python's built-in dir(); prefer PROJECT_DIR in new code.
dir = PROJECT_DIR

# Speaker Metadata DataFrame

In [None]:
import pandas as pd

# Speaker metadata table (one row per speaker recording).
# The path is built from the project folder configured in `dir`, so this cell
# follows that setting instead of a separate hard-coded path that only
# resolves when the working directory happens to be /content.
meta = pd.read_csv(dir + '/speakers_all.csv')
display(meta)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2167,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,766,nigeria,False,,,
2168,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,851,nigeria,False,,,
2169,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,2023,nigeria,False,,,
2170,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,571,usa,False,,,


# Load in Speaker-to-Phonemes dictionary

In [None]:
import pickle

# Load the speaker -> phoneme-transcription mapping saved in the project folder.
# The file is opened in a `with` block so the handle is closed as soon as the
# object has been read (the bare open() call left it open).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open(dir + '/speakers_to_phonemes_dict.pickle', 'rb') as pickle_file:
  speaker_to_phonemes = pickle.load(pickle_file)

# Get list of unique IPA phoneme vocab

In [None]:
# Flatten every speaker's transcription into one long list of symbols.
# NOTE(review): the vocabulary in this cell's saved output contains ' ' and
# stand-alone combining marks, which suggests each dictionary value is a string
# that is iterated character by character - confirm that single characters
# (rather than whole phonemes) are the intended units.
all_phonemes = [item for sublist in speaker_to_phonemes.values() for item in sublist]

# Sort the vocabulary so the symbol -> code assignment is reproducible.
# Iterating a bare set of strings can yield a different order on every kernel
# restart (string hash randomization), which silently reshuffled the codes and
# the column order of every feature vector built from them.
unique_phonemes = sorted(set(all_phonemes))

# Codes start at 1 and follow the sorted vocabulary order.
phone_code = {phon: num for num, phon in enumerate(unique_phonemes, start=1)}

phone_code

{'ɥ': 1,
 'b': 2,
 'ʒ': 3,
 'θ': 4,
 'z': 5,
 'u': 6,
 'ɤ': 7,
 'r': 8,
 'f': 9,
 'm': 10,
 'l': 11,
 'æ': 12,
 'ɰ': 13,
 'y': 14,
 'ɕ': 15,
 'ʰ': 16,
 '̥': 17,
 'i': 18,
 '̚': 19,
 'ɣ': 20,
 'ʲ': 21,
 '̆': 22,
 'ʋ': 23,
 'ʏ': 24,
 'ç': 25,
 'ɹ': 26,
 'ɳ': 27,
 'x': 28,
 'o': 29,
 'e': 30,
 'v': 31,
 'ʁ': 32,
 'h': 33,
 '̪': 34,
 'ː': 35,
 'ɒ': 36,
 'œ': 37,
 'k': 38,
 'p': 39,
 'ɴ': 40,
 'β': 41,
 'ʃ': 42,
 ' ': 43,
 '̟': 44,
 'j': 45,
 '̞': 46,
 'ʝ': 47,
 'ʔ': 48,
 'ʂ': 49,
 'ʌ': 50,
 'ʀ': 51,
 'ɯ': 52,
 'n': 53,
 'ɻ': 54,
 'ә': 55,
 't': 56,
 'ɪ': 57,
 'ɔ': 58,
 'ɑ': 59,
 'ɡ': 60,
 'ɾ': 61,
 '̤': 62,
 'ʊ': 63,
 'ø': 64,
 'ə': 65,
 'ɨ': 66,
 'd': 67,
 'ʐ': 68,
 '̩': 69,
 'ð': 70,
 'w': 71,
 'ɟ': 72,
 's': 73,
 'a': 74,
 'ɛ': 75,
 '̃': 76,
 'g': 77,
 'ɲ': 78,
 'ŋ': 79,
 '͡': 80}

# Convert phonemes into bag-of-words (BoW) feature vectors

In [None]:
import glob, pickle

# Languages to compare in this section.
lang_list = ['mandarin', 'italian']

# speakers_to_bow: speaker key -> list of per-symbol counts, ordered like phone_code.
# speaker_labels:  speaker key -> language name found inside that key.
speakers_to_bow = {}
speaker_labels = {}
for speaker, phonemes in speaker_to_phonemes.items():
  # A speaker is selected when a language name occurs as a substring of its key.
  matched_langs = [lang for lang in lang_list if lang in speaker]
  if not matched_langs:
    continue

  # One count per vocabulary symbol, in phone_code's iteration order.
  speakers_to_bow[speaker] = [phonemes.count(phon) for phon in phone_code]
  # If several language names match, the last one in lang_list is the label.
  speaker_labels[speaker] = matched_langs[-1]

In [None]:
from sklearn.decomposition import PCA

# Parallel lists: speaker keys, their count vectors, and their language labels,
# all in the insertion order of speakers_to_bow.
names = list(speakers_to_bow)
values = [speakers_to_bow[name] for name in names]
langs = [speaker_labels[name] for name in names]

# Project the count vectors onto their first three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(values)

In [None]:
import plotly.express as px

# 3-D scatter of the three principal components, one point per speaker,
# coloured by language. A title and axis labels are set so the figure can be
# read on its own; plotly express names array columns '0', '1', '2'.
fig = px.scatter_3d(
    components,
    x=0,
    y=1,
    z=2,
    color=langs,
    title='PCA of phoneme bag-of-words vectors',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Language'},
)
fig.show()

In [None]:
# `labels` must be a dict mapping column names to display labels; the list of
# per-speaker languages that was passed here is not a valid mapping, so the
# axes were left with bare column indices. Label each principal component with
# its share of explained variance instead.
pc_labels = {
    str(i): f"PC {i + 1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

# Pairwise scatter plots of the three principal components, coloured by language.
fig = px.scatter_matrix(
    components,
    labels=pc_labels,
    dimensions=range(3),
    color=langs
)
# The diagonal panels would plot a component against itself, so hide them.
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
import numpy as np

# Group the per-speaker vectors by language (substring match on the speaker key).
# NOTE(review): despite the "comps" naming, the vectors collected here come from
# `values` (the raw count vectors), not from the PCA `components` - confirm
# which space the centroid distance is meant to be measured in.
lang_to_comps = {}
for lang in lang_list:
  lang_to_comps.setdefault(lang, [])
  lang_to_comps[lang].extend(vec for name, vec in zip(names, values) if lang in name)

# Centroid of a language = element-wise mean of its speakers' vectors.
lang_to_centroid = {
    lang: np.average(np.asarray(vectors), axis=0)
    for lang, vectors in lang_to_comps.items()
}

# Euclidean distance between the two language centroids.
mandarin_italian_dist = np.linalg.norm(lang_to_centroid['italian'] - lang_to_centroid['mandarin'])
print("Mandarin-Italian Centroid Distance: ", mandarin_italian_dist)

Mandarin-Italian Centroid Distance:  5.148177741614851


# Top N Languages

In [None]:
# Languages to compare in this section.
lang_list = ['english', 'spanish', 'mandarin', 'french', 'korean', 'portuguese', 'russian', 'dutch', 'turkish']

# speakers_to_bow: speaker key -> list of per-symbol counts, ordered like phone_code.
# speaker_labels:  speaker key -> language name found inside that key.
speakers_to_bow = {}
speaker_labels = {}
for speaker, phonemes in speaker_to_phonemes.items():
  # A speaker is selected when a language name occurs as a substring of its key.
  matched_langs = [lang for lang in lang_list if lang in speaker]
  if not matched_langs:
    continue

  # One count per vocabulary symbol, in phone_code's iteration order.
  speakers_to_bow[speaker] = [phonemes.count(phon) for phon in phone_code]
  # If several language names match, the last one in lang_list is the label.
  speaker_labels[speaker] = matched_langs[-1]

In [None]:
# Parallel lists: speaker keys, their count vectors, and their language labels,
# all in the insertion order of speakers_to_bow.
names = list(speakers_to_bow)
values = [speakers_to_bow[name] for name in names]
langs = [speaker_labels[name] for name in names]

# Project the count vectors onto their first three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(values)

In [None]:
# 3-D scatter of the three principal components, one point per speaker,
# coloured by language. A title and axis labels are set so the figure can be
# read on its own; plotly express names array columns '0', '1', '2'.
fig = px.scatter_3d(
    components,
    x=0,
    y=1,
    z=2,
    color=langs,
    title='PCA of phoneme bag-of-words vectors (top languages)',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Language'},
)
fig.show()

In [None]:
# Group the per-speaker vectors by language (substring match on the speaker key).
# NOTE(review): despite the "comps" naming, the vectors collected here come from
# `values` (the raw count vectors), not from the PCA `components` - confirm
# which space the centroid distances are meant to be measured in.
lang_to_comps = {}
for lang in lang_list:
  lang_to_comps.setdefault(lang, [])
  lang_to_comps[lang].extend(vec for name, vec in zip(names, values) if lang in name)

# Centroid of a language = element-wise mean of its speakers' vectors.
lang_to_centroid = {
    lang: np.average(np.asarray(vectors), axis=0)
    for lang, vectors in lang_to_comps.items()
}

# Report every other language's Euclidean centroid distance from English.
english_centroid = lang_to_centroid['english']
for lang, centroid in lang_to_centroid.items():
  if lang == 'english':
    continue
  distance = np.linalg.norm(centroid - english_centroid)
  print(lang.upper(), ' distance: ', distance)

SPANISH  distance:  6.355981390154438
MANDARIN  distance:  6.446347753645995
FRENCH  distance:  4.601127272496815
KOREAN  distance:  7.867154141731113
PORTUGUESE  distance:  3.273000741863762
RUSSIAN  distance:  6.196142056091757
DUTCH  distance:  4.834054044140781
TURKISH  distance:  10.220450266216522
