In [1]:
from src.Charsiu import charsiu_forced_aligner
from alignment_helper_fns import *
from scipy.io import wavfile
import torch
import soundfile as sf
CHARSIU_MODEL = charsiu_forced_aligner('charsiu/en_w2v2_fc_10ms')

#datafiles
TRANSCRIPT = 'The chairman decided to pave over the shopping center garden'
audio_filepath = '/home/prad/github/charsiu/data/artp_files/chairman_healthy.wav'
# audio_filepath = '/home/prad/github/charsiu/data/artp_files/chariman_healthy.wav'
tg_filepath = '/home/prad/github/charsiu/data/artp_files/chairman_healthy.TextGrids'

#load charsiu mdoel
CHARSIU_MODEL.serve(audio = audio_filepath, text = TRANSCRIPT, save_to = tg_filepath)
PHONE2IND = CHARSIU_MODEL.charsiu_processor.processor.tokenizer.encoder
# 
# aligned_phone_df = textgridpath_to_phonedf(txtgrid_path=tg_filepath, phone_key='phones', replace_silence=True, remove_numbers=True)
# audio_signal, fs = sf.read(audio_filepath)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [2]:
'''
Artp function 

Here are the steps to computing Artp


1. Generate the sequence of phones from the text 
2. Generate the framewise phone labels
3. Generate the probability distribution for each frame
4. For each frame, calculate the expected/maximum probability

--
5. Group by each phoneme




---
My improvements
6. a windowed version of artp


'''



'\nArtp function \n\nHere are the steps to computing Artp\n\n\n1. Generate the sequence of phones from the text \n2. Generate the framewise phone labels\n3. Generate the probability distribution for each frame\n4. For each frame, calculate the expected/maximum probability\n\n--\n5. Group by each phoneme\n\n\n\n\n---\nMy improvements\n6. a windowed version of artp\n\n\n'

In [3]:
''' get logprobas from healthy aligner'''

def get_logprobas(audio_path, charsiu):
    audio_signal, fs = sf.read(audio_path)
    assert fs == 16000
    # charsiu.aligner._get_feat_extract_output_lengths(len(audio_signal))
    audio = charsiu.charsiu_processor.audio_preprocess(audio_signal)
    inputs = torch.tensor(audio).float().unsqueeze(0).to(charsiu.device)
    with torch.no_grad():
        out = charsiu.aligner(inputs)
    logits = out[0]
    logprobas = torch.log_softmax(logits, dim=-1).detach().cpu().numpy().squeeze()
    return logprobas


In [5]:
print(CHARSIU_MODEL.charsiu_processor.processor.tokenizer.encoder)

{'[SIL]': 0, 'NG': 1, 'F': 2, 'M': 3, 'AE': 4, 'R': 5, 'UW': 6, 'N': 7, 'IY': 8, 'AW': 9, 'V': 10, 'UH': 11, 'OW': 12, 'AA': 13, 'ER': 14, 'HH': 15, 'Z': 16, 'K': 17, 'CH': 18, 'W': 19, 'EY': 20, 'ZH': 21, 'T': 22, 'EH': 23, 'Y': 24, 'AH': 25, 'B': 26, 'P': 27, 'TH': 28, 'DH': 29, 'AO': 30, 'G': 31, 'L': 32, 'JH': 33, 'OY': 34, 'SH': 35, 'D': 36, 'AY': 37, 'S': 38, 'IH': 39, '[UNK]': 40, '[PAD]': 41}


In [6]:
''' get aligned phone ids from aligned df'''
phone_df = textgridpath_to_phonedf(txtgrid_path=tg_filepath, phone_key='phones', replace_silence=True, remove_numbers=True)
# transcript_phone_ids = charsiu.charsiu_processor.get_phones_and_words(TRANSCRIPT)


def return_aligned_phns_and_ids(aligned_df, charsiu, seqlen, phone2id_dict=PHONE2IND):
    timesteps = .01 * np.arange(seqlen) + .01
    # timesteps = .01 * np.arange(530)

    aligned_phns = []



    for timestep in timesteps:
        gt_start_indicator = timestep > aligned_df.iloc[:, 0].values
        lt_end_indicator = timestep <= aligned_df.iloc[:, 1].values
        match_indicator = np.logical_and(gt_start_indicator, lt_end_indicator)
        aligned_ind = np.argwhere(match_indicator).ravel()
        # print(aligned_ind)
        # framewise_phns.append()
        if len(aligned_ind)==0:
            _phn = '[SIL]'
        elif len(aligned_ind)>1:
            Exception('Error found more than 1 matching phone index!')
        else:
            _phn = aligned_df.iloc[aligned_ind, 2].values[0]
            _phn = '[SIL]' if _phn=='sil' else _phn
            # print(_phn)
            aligned_phns.append(_phn)
    aligned_phn_idxs = [phone2id_dict[phone] for phone in aligned_phns]
    return np.array(aligned_phns), np.array(aligned_phn_idxs)

In [7]:
def _calc_GOP(logprobas, aligned_phn_idxs, sil_phone='[SIL]', phone_subset=None, ignore_sil=True, phone2id_dict=PHONE2IND):
    
    max_phn_idxs = np.argmax(logprobas, axis=1)
    framewise_gop = np.array([logprobas[ii, align_phn_idx] - logprobas[ii, max_phn_idx] for ii, (align_phn_idx, max_phn_idx) in enumerate(zip(aligned_phn_idxs, max_phn_idxs))])
    sil_phn_idx = phone2id_dict[sil_phone]    
    nosil_idxs = np.argwhere(aligned_phn_idxs!=sil_phn_idx).ravel()
    framewise_gop_nosil = framewise_gop[nosil_idxs]
    output = {'framewise_gop': framewise_gop_nosil, 'gopmean': np.mean(framewise_gop_nosil), 'gop_withsil': np.mean(framewise_gop)}

    if phone_subset is not None:
        framewise_subset_gops = np.array([])
        for phone in phone_subset:
            _phn_idx = phone2id_dict[phone]
            phone_idxs = np.argwhere(aligned_phn_idxs==_phn_idx).ravel()
            framewise_subset_gops = np.concatenate([framewise_subset_gops, framewise_gop[phone_idxs]]) #don't need to use framewise_gops_nosil since we're filtering phones anyways
            output['framewise_gop_subset'] = framewise_subset_gops
            output['gopmean_subset'] = np.mean(framewise_subset_gops)
    return output
    #TODO: implement for phone_subset 

In [51]:
# audio = charsiu.charsiu_processor.audio_preprocess(audio_signal)
# inputs = torch.tensor(audio).float().unsqueeze(0).to(charsiu.device)
audio_signal
phones, words = CHARSIU_MODEL.align(audio_signal, text=TRANSCRIPT)
phndf = pd.DataFrame.from_records(phones, columns = ['start', 'end', 'phone'])
np.unique(phndf['phone'].values)

array(['AA', 'AH', 'AY', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'G', 'IH',
       'M', 'N', 'NG', 'OW', 'P', 'R', 'S', 'SH', 'T', 'UW', 'V', '[SIL]'],
      dtype=object)

In [47]:
#TODO implement for phone subset

def get_aligner_frame_seq_len(audio_filepath, fs, charsiu):
    audio = charsiu.charsiu_processor.audio_preprocess(audio_filepath, sr=fs)
    audio = torch.Tensor(audio).unsqueeze(0).to(charsiu.device)
    print(len(audio))
    return charsiu.aligner._get_feat_extract_output_lengths(audio)
    
def calculate_GOP_e2e(audio_filepath, transcript, charsiu_model, phone_subset=None):
    audio_signal, fs = sf.read(audio_filepath)
    phones, words, logits = charsiu_model.align(audio_signal, text=TRANSCRIPT, return_logits=True)
    aligned_phone_df = pd.DataFrame.from_records(phones, columns = ['start', 'end', 'phone'])
    seqlen = CHARSIU_MODEL.aligner._get_feat_extract_output_lengths(len(audio_signal))
    print(seqlen)
    aligned_phns, aligned_phn_idxs = return_aligned_phns_and_ids(aligned_phone_df, seqlen=seqlen, charsiu=charsiu_model)
    logprobas = torch.log_softmax(logits, dim=-1).numpy()
    gopoutput = _calc_GOP(logprobas, aligned_phn_idxs, phone_subset=phone_subset)
    if phone_subset is not None:
        return gopoutput['gopmean_subset']
    else:
        return gopoutput['gopmean']


In [48]:
audio_filepath

'/home/prad/github/charsiu/data/artp_files/chairman_healthy.wav'

In [50]:
calculate_GOP_e2e(audio_filepath=audio_filepath, transcript=TRANSCRIPT, charsiu_model=CHARSIU_MODEL, phone_subset=[])

tensor(530)


-0.26588184

In [14]:

framewise_gop = [logprobas[ii, align_phn_idx] - logprobas[ii, max_phn_idx] for ii, (align_phn_idx, max_phn_idx) in enumerate(zip(aligned_phn_ids, max_phn_idxs))]

In [16]:
np.mean(framewise_gop)

-0.17508069

In [68]:
phone_ids = charsiu.charsiu_processor.get_phone_ids(aligned_phns)

In [47]:
aligned_phns[2]

'sil'

In [26]:
''' Healthy file'''


''' Healthy file older male'''


''' ALS Speaker'''

5.3203125

In [None]:
''' Run initial GOP compute on Gabi's files'''



In [11]:
phone_df.iloc[-1,1]

5.3