In [17]:
import librosa
import math
import numpy as np
import scipy.signal
from scipy.special import logsumexp
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 5, padding=2)
        self.conv3 = nn.Conv2d(64, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 128, (1, 5))
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 48)
        self.sm = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = F.relu(self.conv4(x))
        x = x.view(-1, 128)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.sm(x)
        return x
    
def load_audio_to_melspec_tensor(wavpath, sample_rate=16000):
    window_size = .025
    window_stride = 0.01
    n_dft = 512
    win_length = int(sample_rate * window_size)
    hop_length = int(sample_rate * window_stride)
    y, sr = librosa.load(wavpath, sr=sample_rate)
    y = y - y.mean()
    y = np.append(y[0],y[1:]-.97*y[:-1])
    # compute mel spectrogram
    stft = librosa.stft(y, n_fft=n_dft, hop_length=hop_length,
        win_length=win_length, window=scipy.signal.hamming)
    spec = np.abs(stft)**2
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_dft, n_mels=40, fmin=20)
    melspec = np.dot(mel_basis, spec)
    logspec = librosa.power_to_db(melspec, ref=np.max)
    logspec = np.transpose(logspec)
    logspec_tensor = torch.tensor(logspec)
    return logspec_tensor

def compute_phone_likelihoods(model, logspec):
    likelihood_list = []
    with torch.no_grad():
        for j in range(6, logspec.size(0) - 5):
            inp = logspec[j-5:j+6,:].unsqueeze(0)
            output = model(inp) # output will be log probabilities over classes
            output = output - math.log(1. / 48) # subtract the logprob of the class priors (assumed to be uniform)
            likelihood_list.append(output[0])
    likelihoods = torch.transpose(torch.stack(likelihood_list, dim=1), 0, 1).numpy()
    return likelihoods

model = MyNet()
model.load_state_dict(torch.load('lab3_AM.pt'))

lab3_data = np.load('lab3_phone_labels.npz')
phone_labels = list(lab3_data['phone_labels'])
print ("phones labels: ", phone_labels)

def phones2indices(phones):
    return [phone_labels.index(p) for p in phones]


# fee_HMM = MyHMM(phones2indices(['sil', 'f', 'iy', 'sil']), np.array([0.5, 0.5, 0, 0]), np.array([[.9,.1,0,0],[0,.9,.1,0],[0,0,.9,.1],[0,0,0,1]]))
# pea_HMM = MyHMM(phones2indices(['sil', 'p', 'iy', 'sil']), np.array([0.5, 0.5, 0, 0]), np.array([[.9,.1,0,0],[0,.9,.1,0],[0,0,.9,.1],[0,0,0,1]]))
# rock_HMM = MyHMM(phones2indices(['sil', 'r', 'aa', 'cl', 'k', 'sil']), np.array([0.5,0.5,0,0,0,0]), np.array([[.9,.1,0,0,0,0],[0,.9,.1,0,0,0],[0,0,.9,.1,0,0],[0,0,0,.9,.1,0],[0,0,0,0,.9,.1],[0,0,0,0,0,1]]))
# burt_HMM = MyHMM(phones2indices(['sil', 'b', 'er', 'cl', 't', 'sil']), np.array([0.5,0.5,0,0,0,0]), np.array([[.9,.1,0,0,0,0],[0,.9,.1,0,0,0],[0,0,.9,.1,0,0],[0,0,0,.9,.1,0],[0,0,0,0,.9,.1],[0,0,0,0,0,1]]))
# see_HMM = MyHMM(phones2indices(['sil', 's', 'iy', 'sil']), np.array([0.5, 0.5, 0, 0]), np.array([[.9,.1,0,0],[0,.9,.1,0],[0,0,.9,.1],[0,0,0,1]]))
# she_HMM = MyHMM(phones2indices(['sil', 'sh', 'iy', 'sil']), np.array([0.5, 0.5, 0, 0]), np.array([[.9,.1,0,0],[0,.9,.1,0],[0,0,.9,.1],[0,0,0,1]]))

phones labels:  ['sil', 's', 'ao', 'l', 'r', 'iy', 'vcl', 'd', 'eh', 'cl', 'p', 'ix', 'z', 'ih', 'sh', 'n', 'v', 'aa', 'y', 'uw', 'w', 'ey', 'dx', 'b', 'ay', 'ng', 'k', 'epi', 'ch', 'dh', 'er', 'en', 'g', 'aw', 'hh', 'ae', 'ow', 't', 'ax', 'm', 'zh', 'ah', 'el', 'f', 'jh', 'uh', 'oy', 'th']


In [18]:
# load in audio files
# convert to mel-logspec tensor
# compute phone likelihoods for each

burt_phone_likelihoods = compute_phone_likelihoods(model, load_audio_to_melspec_tensor(wavpath="burt.wav"))
fee_phone_likelihoods = compute_phone_likelihoods(model, load_audio_to_melspec_tensor(wavpath="fee.wav"))
pea_phone_likelihoods = compute_phone_likelihoods(model, load_audio_to_melspec_tensor(wavpath="pea.wav"))
rock_phone_likelihoods = compute_phone_likelihoods(model, load_audio_to_melspec_tensor(wavpath="rock.wav"))
see_phone_likelihoods = compute_phone_likelihoods(model, load_audio_to_melspec_tensor(wavpath="see.wav"))
she_phone_likelihoods = compute_phone_likelihoods(model, load_audio_to_melspec_tensor(wavpath="she.wav"))

print ("burt_phone_likelihoods.shape: ", burt_phone_likelihoods.shape)
print ("fee_phone_likelihoods.shape: ", fee_phone_likelihoods.shape)
print ("pea_phone_likelihoods.shape: ", pea_phone_likelihoods.shape)
print ("rock_phone_likelihoods.shape: ", rock_phone_likelihoods.shape)
print ("see_phone_likelihoods.shape: ", see_phone_likelihoods.shape)
print ("she_phone_likelihoods.shape: ", she_phone_likelihoods.shape)

print ("\nburt_phone_likelihoods; ", burt_phone_likelihoods)

burt_phone_likelihoods.shape:  (100, 48)
fee_phone_likelihoods.shape:  (103, 48)
pea_phone_likelihoods.shape:  (113, 48)
rock_phone_likelihoods.shape:  (81, 48)
see_phone_likelihoods.shape:  (96, 48)
she_phone_likelihoods.shape:  (107, 48)

burt_phone_likelihoods;  [[  3.7091124  -10.21496    -16.231117   ... -12.376249   -13.11356
   -3.5045896 ]
 [  3.6599908  -10.715677   -17.96255    ... -13.799179   -14.720837
   -3.553031  ]
 [  3.6621015  -10.123196   -18.386337   ... -14.938078   -15.339006
   -3.7572546 ]
 ...
 [  2.8377366   -6.8738704   -9.583971   ...  -6.1912694   -8.165667
   -1.3121886 ]
 [  0.26819515  -8.690104   -14.135099   ... -11.422363   -12.194883
   -2.6556416 ]
 [  0.7419872   -8.786886   -14.555197   ... -12.312962   -13.335056
   -3.2742243 ]]


In [19]:
# implement hidden markov model
# A (N x N)= state transition distribution
# pi (N x 1)= initial state distribution

from math import gamma


class MyHMM:
    def __init__(self, state_labels, initial_state_distribution, transition_matrix, eps=1e-200):
        self.eps = eps
        self.total_states = len(state_labels)
        # (the product of probabilities becomes addition in log space)
        self.pi = np.log(initial_state_distribution + eps)
        self.A = np.log(transition_matrix + eps) #A_{ji} is prob of transitioning from state j to state i
        self.labels = state_labels # a list where self.labels[j] is the index of the phone label belonging to the jth state
        #print(self.labels)
        self.N_states = len(self.labels)

    def forward(self, state_likelihoods): # state_likelihoods.shape is assumed to be (N_timesteps, 48)
        # create B array
        N_timesteps = state_likelihoods.shape[0]
        self.B = np.zeros((N_timesteps, self.total_states))
        i = 0
        for state in self.labels:
            self.B[:,i] = state_likelihoods[:,state]
            i += 1
        
        # create alpha matrix (N = N_timesteps, M = total states)
        alpha_matrix = np.zeros((N_timesteps, self.total_states))
        
        # initialization
        t = 0 # time step
        i = 0 # state 
        for i in range (self.total_states):
            alpha_matrix[t, i] = self.B[t, i] + self.pi[i]
        i += 1

        # induction step
        for t in range(N_timesteps):
            for i in range (self.total_states):
                sum = []
                j = 0
                # get sum
                for j in range(self.total_states):
                    sum.append(alpha_matrix[t, j] + self.A[j, i])
                # multiply values and set alpha
                value = logsumexp(sum) + self.B[t, i]
                alpha_matrix[t, i] = value

        # termination
        return alpha_matrix[N_timesteps - 1,:]
    
    def viterbi(self, state_likelihoods): # state_likelihoods.shape is assumed to be (N_timesteps, 48)
        # create B array
        N_timesteps = state_likelihoods.shape[0]
        self.B = np.zeros((N_timesteps, self.total_states))
        i = 0
        for state in self.labels:
            self.B[:,i] = state_likelihoods[:,state]
            i += 1

        # create psi matrix (backtrace)
        psi_matrix = np.zeros((N_timesteps, self.total_states))
        psi_matrix[0, 0] = 0.0
        
        # create gamma array (N = N_timesteps, M = total states)
        gamma_matrix = np.zeros((N_timesteps, self.total_states))

        # initialization
        t = 0 # time step
        i = 0 # state 
        for i in range (self.total_states):
            gamma_matrix[t, i] = self.B[t, i] + self.pi[i]
        i += 1

        # induction step
        for t in range(N_timesteps):
            for i in range (self.total_states):
                ind_array = []
                j = 0
                # get induction values to find max
                for j in range(self.total_states):
                    ind_array.append(gamma_matrix[t - 1, j] + self.A[j, i])
                # find max, multiply values, and set gamma
                value = np.max(ind_array) + self.B[t, i]
                gamma_matrix[t, i] = value
                # set psi backtrace value
                psi_matrix[t, i] = np.argmax(ind_array)

        # termination
        return np.argmax(gamma_matrix[N_timesteps - 1,:])
    
    def viterbi_transition_update(self, state_likelihoods):
        # state_likelihoods.shape is assumed to be (N_timesteps, 48)
        # TODO: fill in
        pass

In [20]:
# HMM inputs: (state_labels, initial_state_distribution (pi), transition_matrix (A), eps = 1e-200)
# HMM.forward inputs: (state_likelihoods (B))
# HMM.viterbi inputs: (state_likelihoods (B))
# HMM.viterbi_transition_update inputs: (state_likelihoods (B))

# remove scientific notation from prints
np.set_printoptions(suppress=True)

burt_HMM = MyHMM(phones2indices(['sil', 'b', 'er', 'cl', 't', 'sil']), # state_labels
                 np.array([0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), # initial_state_distribution (pi)
                 np.array([[0.9, 0.1, 0.0, 0.0, 0.0, 0.0], # transition_matrix (A)
                           [0.0, 0.9, 0.1, 0.0, 0.0, 0.0],
                           [0.0, 0.0, 0.9, 0.1, 0.0, 0.0],
                           [0.0, 0.0, 0.0, 0.9, 0.1, 0.0],
                           [0.0, 0.0, 0.0, 0.0, 0.9, 0.1],
                           [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]))

burt_forward_output = burt_HMM.forward(burt_phone_likelihoods) # state_observation_distribution (B)
print ("burt_HMM.forward_output: ", burt_forward_output)

burt_viterbi_output = burt_HMM.viterbi(burt_phone_likelihoods)
print ("burt_HMM.viterbi_output: ", burt_viterbi_output)

print ("----------------")

fee_HMM = MyHMM(phones2indices(['sil', 'f', 'iy', 'sil']), 
                np.array([0.5, 0.5, 0.0, 0.0]), 
                np.array([[0.9, 0.1, 0.0, 0.0],
                          [0.0, 0.9, 0.1, 0.0],
                          [0.0, 0.0, 0.9, 0.1],
                          [0.0, 0.0, 0.0, 1.0]]))

fee_forward_output = fee_HMM.forward(fee_phone_likelihoods) # state_observation_distribution (B)
print ("fee_HMM.forward_output: ", fee_forward_output)

fee_viterbi_output = fee_HMM.viterbi(fee_phone_likelihoods)
print ("fee_HMM.viterbi_output: ", fee_viterbi_output)

print ("----------------")

pea_HMM = MyHMM(phones2indices(['sil', 'p', 'iy', 'sil']),  
                np.array([0.5, 0.5, 0.0, 0.0]), 
                np.array([[0.9, 0.1, 0.0, 0.0],
                          [0.0, 0.9, 0.1, 0.0],
                          [0.0, 0.0, 0.9, 0.1],
                          [0.0, 0.0, 0.0, 1.0]]))

pea_forward_output = pea_HMM.forward(pea_phone_likelihoods) # state_observation_distribution (B)
print ("pea_HMM.forward_output: ", pea_forward_output)

pea_viterbi_output = pea_HMM.viterbi(pea_phone_likelihoods)
print ("pea_HMM.viterbi_output: ", pea_viterbi_output)

print ("----------------")

rock_HMM = MyHMM(phones2indices(['sil', 'r', 'aa', 'cl', 'k', 'sil']), 
                 np.array([0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), 
                 np.array([[0.9, 0.1, 0.0, 0.0, 0.0, 0.0],
                           [0.0, 0.9, 0.1, 0.0, 0.0, 0.0],
                           [0.0, 0.0, 0.9, 0.1, 0.0, 0.0],
                           [0.0, 0.0, 0.0, 0.9, 0.1, 0.0],
                           [0.0, 0.0, 0.0, 0.0, 0.9, 0.1],
                           [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]))

rock_forward_output = rock_HMM.forward(rock_phone_likelihoods) # state_observation_distribution (B)
print ("rock_HMM.forward_output: ", rock_forward_output)

rock_viterbi_output = rock_HMM.viterbi(rock_phone_likelihoods)
print ("rock_HMM.viterbi_output: ", rock_viterbi_output)

print ("----------------")

see_HMM = MyHMM(phones2indices(['sil', 's', 'iy', 'sil']), 
                np.array([0.5, 0.5, 0.0, 0.0]), 
                np.array([[0.9, 0.1, 0.0, 0.0],
                          [0.0, 0.9, 0.1, 0.0],
                          [0.0, 0.0, 0.9, 0.1],
                          [0.0, 0.0, 0.0, 1.0]]))

see_forward_output = see_HMM.forward(see_phone_likelihoods) # state_observation_distribution (B)
print ("see_HMM.forward_output: ", see_forward_output)

see_viterbi_output = see_HMM.viterbi(see_phone_likelihoods)
print ("see_HMM.viterbi_output: ", see_viterbi_output)

print ("----------------")

she_HMM = MyHMM(phones2indices(['sil', 'sh', 'iy', 'sil']), 
                np.array([0.5, 0.5, 0.0, 0.0]), 
                np.array([[0.9, 0.1, 0.0, 0.0],
                          [0.0, 0.9, 0.1, 0.0],
                          [0.0, 0.0, 0.9, 0.1],
                          [0.0, 0.0, 0.0, 1.0]]))

she_forward_output = she_HMM.forward(she_phone_likelihoods) # state_observation_distribution (B)
print ("she_HMM.forward_output: ", she_forward_output)

she_viterbi_output = she_HMM.viterbi(she_phone_likelihoods)
print ("she_HMM.viterbi_output: ", she_viterbi_output)

burt_HMM.forward_output:  [  0.63662671  -4.8610115  -11.74037629   0.89066927  -9.66055408
   0.7419936 ]
burt_HMM.viterbi_output:  5
----------------
fee_HMM.forward_output:  [ 0.43372368 -4.76673856 -3.36711141  0.54252717]
fee_HMM.viterbi_output:  3
----------------
pea_HMM.forward_output:  [ 3.12280988 -4.73790943 -6.50918221  3.22831935]
pea_HMM.viterbi_output:  3
----------------
rock_HMM.forward_output:  [ -0.53076077 -13.49451952 -16.57392916   0.31579709 -10.36344577
  -0.4253971 ]
rock_HMM.viterbi_output:  5
----------------
see_HMM.forward_output:  [ 2.86987256 -6.29892772 -6.79899011  2.97534456]
see_HMM.viterbi_output:  3
----------------
she_HMM.forward_output:  [ 1.92741775 -4.69655353 -4.93931886  2.03349395]
she_HMM.viterbi_output:  3
