In [4]:
import numpy as np
from collections import defaultdict

In [5]:
# Initialize counts and probabilities
def initialize_probs(ipa_vocab_size, whisper_vocab_size):
    # Initialize uniform probability distribution
    alignment_probs = np.ones((ipa_vocab_size, whisper_vocab_size)) / whisper_vocab_size
    return alignment_probs

In [6]:
# E-Step: Calculate alignment probabilities for each pair of sequences
def expectation_step(whisper_seqs, ipa_seqs, alignment_probs):
    alignments = []
    
    for whisper_seq, ipa_seq in zip(whisper_seqs, ipa_seqs):
        seq_alignment = np.zeros((len(ipa_seq), len(whisper_seq)))
        
        for i, ipa_token in enumerate(ipa_seq):
            for j, whisper_token in enumerate(whisper_seq):
                seq_alignment[i, j] = alignment_probs[ipa_token, whisper_token]
        
        # Normalize alignment to form a probability distribution
        seq_alignment /= seq_alignment.sum(axis=1, keepdims=True)
        alignments.append(seq_alignment)
    
    return alignments

In [7]:
# M-Step: Update the alignment probabilities based on the E-step
def maximization_step(alignments, whisper_seqs, ipa_seqs, ipa_vocab_size, whisper_vocab_size):
    # Initialize count matrix
    new_alignment_probs = np.zeros((ipa_vocab_size, whisper_vocab_size))

    for seq_alignment, whisper_seq, ipa_seq in zip(alignments, whisper_seqs, ipa_seqs):
        for i, ipa_token in enumerate(ipa_seq):
            for j, whisper_token in enumerate(whisper_seq):
                new_alignment_probs[ipa_token, whisper_token] += seq_alignment[i, j]

    # Normalize to get probability distributions
    new_alignment_probs /= new_alignment_probs.sum(axis=1, keepdims=True)
    
    return new_alignment_probs

In [8]:
# EM Algorithm
def train_em(whisper_seqs, ipa_seqs, ipa_vocab_size, whisper_vocab_size, n_iter=10):
    # Step 1: Initialize probabilities
    alignment_probs = initialize_probs(ipa_vocab_size, whisper_vocab_size)

    for iteration in range(n_iter):
        # Step 2: E-Step
        alignments = expectation_step(whisper_seqs, ipa_seqs, alignment_probs)
        
        # Step 3: M-Step
        alignment_probs = maximization_step(alignments, whisper_seqs, ipa_seqs, ipa_vocab_size, whisper_vocab_size)

        print(f"Iteration {iteration + 1}: Updated alignment probabilities")
    
    return alignment_probs

In [20]:
# Example sequences (list of sequences of token indices)
whisper_seqs = [
    # [1, 10, 20, 30],
    # [5, 15, 25],
    # [8, 18, 28, 38, 48]
    [1,2,3],
    [3,2,1],
]
whisper_vocab_size = np.max(np.concatenate(whisper_seqs))+1

ipa_seqs = [
    [1,2,3],
    [3,2,1],
    # [2, 12, 22],
    # [6, 16],
    # [9, 19, 29]
]
ipa_vocab_size = np.max(np.concatenate(ipa_seqs))+1

# Train the EM model
alignment_probs = train_em(whisper_seqs, ipa_seqs, ipa_vocab_size, whisper_vocab_size, n_iter=10)

# alignment_probs[i, j] gives the probability of mapping IPA token i to Whisper token j

Iteration 1: Updated alignment probabilities
Iteration 2: Updated alignment probabilities
Iteration 3: Updated alignment probabilities
Iteration 4: Updated alignment probabilities
Iteration 5: Updated alignment probabilities
Iteration 6: Updated alignment probabilities
Iteration 7: Updated alignment probabilities
Iteration 8: Updated alignment probabilities
Iteration 9: Updated alignment probabilities
Iteration 10: Updated alignment probabilities


  new_alignment_probs /= new_alignment_probs.sum(axis=1, keepdims=True)


In [18]:
alignment_probs.shape

(4, 4)

In [21]:
alignment_probs

array([[       nan,        nan,        nan,        nan],
       [0.        , 0.33333333, 0.33333333, 0.33333333],
       [0.        , 0.33333333, 0.33333333, 0.33333333],
       [0.        , 0.33333333, 0.33333333, 0.33333333]])