In [1]:


# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from pathlib import Path
import os
import re
from collections import defaultdict
from google.colab import drive
from google.colab import files
from tqdm import tqdm

In [2]:
layer_num = 4
model = "Hubert_collapsed_layer3"


# get path to data
# directory =  "/content/drive/MyDrive/Grad project/models_hidden_states/Wav2Vec2/dev-clean"
# directory =  "/content/drive/MyDrive/Grad project/models_hidden_states/Wav2Vec2/dev-clean"

# directory = "/content/drive/MyDrive/Grad project/Oli_files/orig"
directory = "/content/drive/MyDrive/Grad project/models_hidden_states/" + model + "/dev-clean"
# directory = "/content/drive/MyDrive/Grad project/models_hidden_states/Hubert_outputs/Hubert_large_ASR_last_layer/Hubert_outputs/dev-clean"
# alignment_file = "/content/drive/MyDrive/Grad project/Oli_files/train-clean-100.ali"
alignment_file = "/content/drive/MyDrive/Grad project/Oli_files/dev-clean.ali"



dir_list = [x for x in next(os.walk(directory))][1]

print(dir_list)

# for some reason makes later function load files in correct order... create list with bash script?
# dir_list = [1272, 1462, 1673, 174, 1919, 1988, 1993, 2035, 2078, 2086, 2277, 2412, 2428, 251, 2803, 2902, 3000, 3081, 3170, 3536, 3576, 3752, 3853, 422, 5338, 5536, 5694, 5895, 6241, 6295, 6313, 6319, 6345, 652, 777, 7850, 7976, 8297, 84, 8842]

['1988', '2803', '2277', '5895', '6241', '6313', '5694', '7976', '8297', '3000', '7850', '1462', '8842', '3752', '2428', '2902', '6295', '5536', '652', '2078', '3536', '174', '2086', '1272', '1919', '6345', '3081', '3576', '777', '1673', '6319', '5338', '422', '3853', '1993', '2035', '251', '2412', '3170', '84']


In [3]:
apply_proj = False
#apply projection
if apply_proj:
  proj_mat_dir = "/content/drive/MyDrive/Grad project/spk_phone_joint_matrices/" + model + "/layer_" + str(layer_num) # path to speaker matrix
  speaker_arrays = np.load(proj_mat_dir + "/speaker_matrix.npy")
  pca_speaker = PCA(n_components=40)
  principal_components_speaker = pca_speaker.fit_transform(speaker_arrays)

In [4]:
with open(alignment_file, "r") as f:
  lines = [line.split() for line in f.readlines()]

In [5]:
utt_allignments = {}


for line in lines[1:]:
  if line[0] in utt_allignments:
    utt_allignments[line[0]].append(line)
  else:
    utt_allignments[line[0]] = [line]




In [6]:
# for walking through features directory and retrieving files

def GetFiles(path: str):

                                                                # store files in a list
  files = []
  speaker_index = []

  for (root, dirs, file) in os.walk(path):
    if re.match(f"^{path}/\d+$", root):                         # group files by speaker
      speaker_index.append(root[len(f"{path}/"):])              # only append speaker id
      try:
        files.append(sorted(speaker_sublist))                   # order by utterance sample
      except NameError:                                         # speaker_sublist doesn't exist in the first iteration
        pass
      speaker_sublist = []

    for f in file:                                              # append all files of specific speaker
      if f[-3:] == "npy":                                       # remove files with weird suffix after .npy
        speaker_sublist.append(f"{root}/{f}")

  files.append(sorted(speaker_sublist))                         # to capture final speaker at the end of loop



  return files, speaker_index

In [7]:
def OrgFiles(dir_list: list, file_list: list, spk_list: list):
  spk_id_file_list = [(spk_list[i], file_list[i]) for i in range(len(spk_list))]
  sorted_file_list = [nested[1] for nested in sorted(spk_id_file_list)]
  return sorted_file_list

In [8]:
def RemoveDupl(file_list: list):
  for spk_idx,by_spk in enumerate(file_list):
    for array_path in by_spk:
      if array_path[-5] == ")":                          # in google colab duplicates will have this pattern...
        file_list[spk_idx].remove(array_path)
  return file_list

In [9]:
def Averaging(dict_list: list):

  avg_joint = []
  avg_spk = []

  all_phones_dict = {}

  for phones_dict in tqdm(dict_list):
    phones_by_speaker = []
    by_spk = []

    for key in phones_dict.keys():

      ## for joint matrix
      stacked = np.vstack(phones_dict[key])
      avg_phone_spk = np.average(stacked, axis=0)
      phones_by_speaker.append((key, avg_phone_spk))

      ## for speaker matrix
      if key == "SIL" or key == "SPN":
       continue
      by_spk.extend(phones_dict[key])                     # add all phones into one list to average by speaker

      ## for phone matrix
      if key == "SIL" or key == "SPN":
        continue
      elif key not in all_phones_dict:
        all_phones_dict[key] = np.vstack(phones_dict[key])
      else:
        all_phones_dict[key] = np.vstack([all_phones_dict[key], np.vstack(phones_dict[key])])

    ## for joint matrix
    avg_joint.append(phones_by_speaker)

    ## for speaker matrix
    stacked_spk = np.vstack([by_spk])
    avg_by_spk = np.average(stacked_spk, axis=0)
    avg_spk.append(avg_by_spk)

  ## for phone matrix
  avg_phones = [(phone, np.average(all_phones_dict[phone], axis=0)) for phone in all_phones_dict]

  return avg_phones, avg_spk, avg_joint

In [10]:
def AvgArrays(file_list: list):

  avg_by_speaker = []

  #utts_speaker = [np.load(utterance) for speakers in file_list for utterance in speakers]
  for speaker in file_list:
    for index,utterance in enumerate(speaker):
      utterance = np.load(utterance)[layer_num].squeeze()
      # utterance = np.load(utterance)
      if index == 0:
        utt = utterance
      else:
        utt = np.concatenate((utt, utterance))

    avg_speaker = np.average(utt, axis = 0)
    avg_by_speaker.append(avg_speaker)

  return avg_by_speaker

In [11]:
def StackArrs(dir_list: list, joint: list):

  for_all = []
  speaker_length = []

  for nmbr,l in enumerate(joint):
    bs = []                                                  # bs = "by speaker"
    for s in l:
      bs.append(s[0])
      speaker_length.append(dir_list[nmbr])

    for_all.append(bs)

  joint_arrays = np.vstack([s[1] for l in joint for s in l]) # grab averaged vector from tuple element in list
  #print(joint_arrays.shape)
  #print(len(for_all))
  #print(len(speaker_length))
  return joint_arrays, for_all, speaker_length

In [12]:
# dimensions = np.arange(0, 768, dtype=int)                 # name each column in df
files, speaker_id = GetFiles(directory)                   # retrieve files sorted by speaker
sorted_files = OrgFiles(dir_list, files, speaker_id)      # put them in order of appearance of directory (to match alignment file)
sorted_no_dupl_files = RemoveDupl(sorted_files)

In [13]:
from collections import Counter
import linecache
import os
import tracemalloc

def display_top(snapshot, key_type='lineno', limit=3):
    snapshot = snapshot.filter_traces((
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
        tracemalloc.Filter(False, "<unknown>"),
    ))
    top_stats = snapshot.statistics(key_type)

    print("Top %s lines" % limit)
    for index, stat in enumerate(top_stats[:limit], 1):
        frame = stat.traceback[0]
        # replace "/path/to/module/file.py" with "module/file.py"
        filename = os.sep.join(frame.filename.split(os.sep)[-2:])
        print("#%s: %s:%s: %.1f KiB"
              % (index, filename, frame.lineno, stat.size / 1024))
        line = linecache.getline(frame.filename, frame.lineno).strip()
        if line:
            print('    %s' % line)

    other = top_stats[limit:]
    if other:
        size = sum(stat.size for stat in other)
        print("%s other: %.1f KiB" % (len(other), size / 1024))
    total = sum(stat.size for stat in top_stats)
    print("Total allocated size: %.1f KiB" % (total / 1024))

In [None]:



# for Hubert and all 20ms frame rate models

# tracemalloc.start()

list_phones_dicts = []
for speaker in tqdm(sorted_no_dupl_files):
  phones_dict = defaultdict(list)

  for index,utterance_file_path in enumerate(speaker):
    # print("------------------------------")
    # print("utterance:",utterance)
    # print(utterance_file_path)
    utterance_id = utterance_file_path.split("/")[-1].split(".")[0]
    # utterance = np.load(utterance_file_path, mmap_mode="r")[layer_num].squeeze()
    utterance = np.load(utterance_file_path, mmap_mode="r")[layer_num]
    # print(utterance.shape)
    # utterance = np.load(utterance_file_path, allow_pickle=True)[layer_num].squeeze()

    # utterance = np.load(utterance_file_path, mmap_mode="r")
    # print(utterance.shape)
    utt_allignment = utt_allignments[utterance_id]
    num_accum_frames = 0
    num_accum_frames_orig = 0
    taken_frame = []
    taken_frame_second = []
    durations = []
    curr_duration = 0
    right_start = 0
    wrong_start = 0
    num_accum_frames = 0
    carry = 0
    odd_counter = 0
    for id, channel, start_time, phone_duration, phone_id, phone in utt_allignment:
      if phone == "SIL":          # all phones but "SIL" are of type AW_I, here removing _I ending
        key = "SIL"
      else:                                            # slice [5] grabs the whole phone id but here sorting according to the phones in figure 2 of Oli's paper
        key = phone[:-2]          # get phones
        if key[-1].isdigit():                          # go from AW0 and AW1 to phone label AW
          key = key[:-1]


      # print("phone_duration: ",phone_duration)
      # print("number_of_frames: ",int(float(phone_duration) * 50))
      # number_of_frames = int(float(phone_duration) * 100) # with a frame each 10 ms
      number_of_frames_dec = float(phone_duration) * 50 # with a frame each 20 ms
      # if (number_of_frames_dec * 10) % 2 != 0:
      #     odd_counter = 1


      number_of_frames = int(number_of_frames_dec) + odd_counter

      if round((number_of_frames_dec * 10) % 2, 2) != 0:
          odd_counter = 1
      else:
          odd_counter = 0

      durations.append(number_of_frames)
      taken_frame.append("*# start at: " + str(int((curr_duration * 100)/2)) + " counter: " + str(odd_counter) +  " #*")
      taken_frame_second.append("*# start at: " + str(curr_duration) + " counter: " + str(odd_counter) +  " #*")
      prev_duration = int((curr_duration * 100)/2)

      # print(round(float(phone_duration), 2))

      curr_duration += round(float(phone_duration), 2)
      number_of_frames = int((curr_duration * 100)/2) - prev_duration
      # print(round(float(phone_duration), 2) * 100)





      # if number_of_frames_dec.is_integer():
      #   carry = 0
      # else:
      #   carry = 1



      for frame in range(number_of_frames):
          try:
            if apply_proj:
              z = utterance[num_accum_frames + frame]
              for pca in pca_speaker.components_:
                # z = z - (np.transpose(z) * pca) * pca
                z = np.subtract(z, np.multiply(np.matmul(z, pca), pca))
            else:
              z = utterance[num_accum_frames + frame]
            phones_dict[key].append(z)
          except IndexError:
            print("length of utterance: ",utterance.shape[0])
            print("appended frame index:",num_accum_frames + frame)

      # num_accum_frames += number_of_frames + odd_counter
      num_accum_frames += number_of_frames


    # del utterance.f
    # utterance.close()
    # print(taken_frame)
    # print(taken_frame_second)
    # print(durations)
    # print(num_accum_frames)

  list_phones_dicts.append(phones_dict)
  # break

  # snapshot = tracemalloc.take_snapshot()
  # display_top(snapshot)




 88%|████████▊ | 35/40 [31:53<05:00, 60.05s/it]

In [None]:
# # tracemalloc.start()

# list_phones_dicts = []
# for speaker in tqdm(sorted_no_dupl_files):
#   phones_dict = defaultdict(list)

#   for index,utterance_file_path in enumerate(speaker):
#     # print("------------------------------")
#     # print("utterance:",utterance)
#     utterance_id = utterance_file_path.split("/")[-1].split(".")[0]
#     utterance = np.load(utterance_file_path, mmap_mode="r")[layer_num].squeeze()
#     # utterance = np.load(utterance_file_path, mmap_mode="r")

#     # utterance_new = utterance.tolist()
#     # print(utterance.shape)
#     utt_allignment = utt_allignments[utterance_id]
#     # print("num frames:",utterance.shape)
#     # print(utt_allignment)
#     # print("allignments duration:",float(utt_allignment[-1][2]) + float(utt_allignment[-1][3]))
#     num_accum_frames = 0
#     carry = 0
#     for id, channel, start_time, phone_duration, phone_id, phone in utt_allignment:
#       if phone == "SIL":          # all phones but "SIL" are of type AW_I, here removing _I ending
#         key = "SIL"
#       else:                                            # slice [5] grabs the whole phone id but here sorting according to the phones in figure 2 of Oli's paper
#         key = phone[:-2]          # get phones
#         if key[-1].isdigit():                          # go from AW0 and AW1 to phone label AW
#           key = key[:-1]


#       # print("phone_duration: ",phone_duration)
#       # print("number_of_frames: ",int(float(phone_duration) * 50))
#       number_of_frames = int(float(phone_duration) * 100) # with a frame each 10 ms
#       # number_of_frames_dec = float(phone_duration) * 50 # with a frame each 20 ms


#       # number_of_frames = int(number_of_frames_dec)



#       # if number_of_frames_dec.is_integer():
#       #   carry = 0
#       # else:
#       #   carry = 1


#       for frame in range(number_of_frames):
#           try:
#             if apply_proj:
#               z = utterance[num_accum_frames + frame]
#               for pca in pca_speaker.components_:
#                 # z = z - (np.transpose(z) * pca) * pca
#                 z = np.subtract(z, np.multiply(np.matmul(z, pca), pca))
#             else:
#               z = utterance[num_accum_frames + frame]
#             phones_dict[key].append(z)
#           except IndexError:
#             print("length of utterance: ",utterance.shape[0])
#             print("appended frame index:",num_accum_frames + frame)

#       num_accum_frames += number_of_frames


#     # del utterance.f
#     # utterance.close()

#   list_phones_dicts.append(phones_dict)
#   # break

#   # snapshot = tracemalloc.take_snapshot()
#   # display_top(snapshot)




In [None]:
# phones_dataset = []

In [None]:
# for speaker_dict in list_phones_dicts:
#   for key, val in speaker_dict.items():
#     tmp = {"phone":key, "rep":val}
#     phones_dataset.append(tmp)

In [None]:
# df = pd.DataFrame(phones_dataset)

In [None]:
# df.to_csv("phones.csv")

In [None]:
speaker_ids = []
for speaker in sorted_no_dupl_files:
  speaker_ids.append(speaker[0].split("/")[8])

phones_dict = {value:[] for value in list_phones_dicts[0].keys()}

speakers_dict = {value:[] for value in speaker_ids}

In [None]:
for index, speaker_dict in enumerate(list_phones_dicts):
  speaker_id = sorted_no_dupl_files[index][0].split("/")[8]
  for key, val in tqdm(speaker_dict.items()):
    speakers_dict[speaker_id].extend(val)

In [None]:
for speaker_dict_val in list_phones_dicts:
  for key, val in speaker_dict_val.items():
    phones_dict[key].extend(val)


# speaker classification

In [None]:
X_speaker, Y_speaker = [], []
for key, val in speakers_dict.items():
  for rep in val:
    X_speaker.append(rep)
    Y_speaker.append(key)

In [None]:
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Get dataset with only the first two attributes
X, y = X_speaker, Y_speaker
# Split the dataset into a training and a testing set
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=33)
# print (X_train.shape, y_train.shape)
# # Standarize the features
# # Feature Scaling
# #For each feature, calculate the average, subtract the mean
# #value from the feature value, and divide the result by their standard deviation. After
# #scaling, each feature will have a zero average, with a standard deviation of one.
# scaler = StandardScaler().fit(X_train)
# X_train = scaler.transform(X_train)

# X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(verbose=1)
logisticRegr.fit(X_train, y_train)

In [None]:
from sklearn import metrics
y_train_pred = logisticRegr.predict(X_train)
print (metrics.accuracy_score(y_train, y_train_pred))

In [None]:
from sklearn import metrics
#Measure accuracy on the testing set
y_pred = logisticRegr.predict(X_test)
print (metrics.accuracy_score(y_test, y_pred))

In [None]:
print(1 - metrics.accuracy_score(y_test, y_pred))

# phone classifcation

In [None]:
X_phone, Y_phone = [], []
for key, val in phones_dict.items():
  for rep in val:
    X_phone.append(rep)
    Y_phone.append(key)

In [None]:
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Get dataset with only the first two attributes
X, y = X_phone, Y_phone
# Split the dataset into a training and a testing set
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=33)
# print (X_train.shape, y_train.shape)
# # Standarize the features
# # Feature Scaling
# #For each feature, calculate the average, subtract the mean
# #value from the feature value, and divide the result by their standard deviation. After
# #scaling, each feature will have a zero average, with a standard deviation of one.
# scaler = StandardScaler().fit(X_train)
# X_train = scaler.transform(X_train)

# X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(verbose=1)
logisticRegr.fit(X_train, y_train)

In [None]:
from sklearn import metrics
y_train_pred = logisticRegr.predict(X_train)
print (metrics.accuracy_score(y_train, y_train_pred))

In [None]:
from sklearn import metrics
#Measure accuracy on the testing set
y_pred = logisticRegr.predict(X_test)
print (metrics.accuracy_score(y_test, y_pred))

In [None]:
print(1 - metrics.accuracy_score(y_test, y_pred))