In [32]:
import os, glob
import soundfile as sf
from collections import defaultdict
import numpy as np
import pandas as pd
import h5py
import tqdm
import IPython
import fairseq
import torch
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
from sklearn import svm
from sklearn import metrics
import seaborn as sns
import utils
from fairseq.dataclass.utils import convert_namespace_to_omegaconf

In [2]:
import os
# Select which GPU this kernel may use.
# NOTE(review): these variables only take effect if they are set before CUDA is first initialised in this process.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [3]:
from fairseq.models.wav2vec.wav2vec2 import Wav2Vec2Model

In [43]:
def load_wav2vec2(mode='base', evaluate=True):
    """
    Build the wav2vec 2.0 "small" model and initialise its weights.

    Args:
        mode: one of
            'base'     - load weights from ./pretrained_checkpoints/wav2vec_small.pt;
            'finetune' - start from the base weights and overwrite them with every
                         tensor stored under the 'w2v_encoder.w2v_model.' prefix in
                         ./pretrained_checkpoints/wav2vec_small_960h.pt;
            'random'   - keep the architecture but call utils.reset_all_weights on it
                         (presumably re-initialises every parameter; see utils).
        evaluate: when True the model is switched to eval mode before being returned.

    Returns:
        The Wav2Vec2Model, moved to the current CUDA device.

    Raises:
        ValueError: if `mode` is not one of the three supported values.
    """
    valid_modes = ('base', 'finetune', 'random')
    if mode not in valid_modes:
        # Fail loudly: a typo such as 'finetuned' used to fall back to 'base' silently.
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    # map_location='cpu' makes loading independent of the device the checkpoint was
    # saved from; the model is moved to the GPU explicitly below.
    checkpoint_base = torch.load('./pretrained_checkpoints/wav2vec_small.pt', map_location='cpu')
    wav2vec2 = Wav2Vec2Model.build_model(convert_namespace_to_omegaconf(checkpoint_base['args']).model, task='audio_pretraining')

    if mode == 'random':
        utils.reset_all_weights(wav2vec2)
    else:
        state_dict = checkpoint_base['model']
        if mode == 'finetune':
            prefix = 'w2v_encoder.w2v_model.'
            checkpoint_finetune = torch.load('./pretrained_checkpoints/wav2vec_small_960h.pt', map_location='cpu')
            # Only keys under `prefix` are copied over (with the prefix stripped);
            # all other entries of the fine-tuned checkpoint are ignored.
            for key, tensor in checkpoint_finetune['model'].items():
                if key.startswith(prefix):
                    state_dict[key[len(prefix):]] = tensor
        wav2vec2.load_state_dict(state_dict)

    wav2vec2.cuda()
    if evaluate:
        wav2vec2.eval()
    return wav2vec2

In [119]:
wav2vec2 = load_wav2vec2('random')

In [5]:
def wav2vec2forward(model, source, aggregation=True, hidden_layer=None):
    """
    Inference function of pretrained wav2vec2 to extract intermediate representations
    Ref: https://github.com/pytorch/fairseq/blob/89ec6e7efff867d258947acafc57189b257212d0/fairseq/models/wav2vec/wav2vec2.py

    Args:
        model: a wav2vec2 model exposing feature_extractor, layer_norm, quantizer,
            project_q, post_extract_proj, input_quantizer, project_inp, encoder
            and final_proj.
        source: input passed straight to model.feature_extractor
            (assumes a single waveform with a leading batch dimension of 1 -- the
            batch axis is squeezed away below; TODO confirm for other callers).
        aggregation: when True every returned representation is averaged over its
            first remaining axis (dim=0 after the batch squeeze).
        hidden_layer: forwarded to model.encoder as `layer`.

    Returns:
        dict with keys 'cnn_output', 'encoder_output', 'context_vector';
        'vq' and 'projected_vq' are present only when the model has a quantizer;
        'encoder_hiddens' (a list) is present only when the encoder returned
        per-layer results.
    """
    with torch.no_grad():
        cnn_features = model.feature_extractor(source).transpose(1, 2)
        features = model.layer_norm(cnn_features)

        ret = dict()
        ret['cnn_output'] = cnn_features.squeeze(0)

        # The quantizer is optional. Previously a model without one crashed with an
        # unbound-variable error when the 'vq' entries were read; now those keys are
        # simply left out.
        if model.quantizer is not None: # this is not None in pretrained w2v
            q = model.quantizer(features, produce_targets=False)
            quantized_features = q["x"]
            ret['vq'] = quantized_features.squeeze(0)
            ret['projected_vq'] = model.project_q(quantized_features).squeeze(0)

        if model.post_extract_proj is not None: # this is not None in pretrained w2v
            features = model.post_extract_proj(features)

        if model.input_quantizer is not None: # this is None in pretrained w2v
            q = model.input_quantizer(features, produce_targets=False)
            features = model.project_inp(q['x'])

        encoder_outputs, encoder_layers_features = model.encoder(features, padding_mask=None, layer=hidden_layer)

        ret['encoder_output'] = encoder_outputs.squeeze(0)
        ret['context_vector'] = model.final_proj(encoder_outputs).squeeze(0)
        if len(encoder_layers_features) > 0:
            # Each per-layer entry is indexed [0][0] exactly as before
            # (layout depends on the fairseq version -- TODO confirm).
            ret['encoder_hiddens'] = [h[0][0] for h in encoder_layers_features]

        if aggregation:
            aggregated = dict()
            for name, value in ret.items():
                if name == 'encoder_hiddens':
                    aggregated[name] = [torch.mean(h, dim=0) for h in value]
                else:
                    aggregated[name] = torch.mean(value, dim=0)
            ret = aggregated

        return ret

In [81]:
df = pd.read_csv('./data/TIMIT_train.csv')

In [134]:
!rm -rf ./outputs/extracted_features/wav2vec2_small/TIMIT_train_word.h5

In [171]:
hf = h5py.File("./outputs/extracted_features/wav2vec2_small-random_init/TIMIT_test_word.h5", 'w')

In [24]:
hf.close()

In [36]:
## word as single segment
# For every word listed in an utterance's .WRD file, cut the word's samples out of
# the waveform, run them through wav2vec2 in isolation and store the time-averaged
# representations in an HDF5 file (one dataset per utterance/word/feature).

hf_paths = ["./outputs/extracted_features/wav2vec2_small-random_init/TIMIT_train_word.h5", 
            "./outputs/extracted_features/wav2vec2_small-random_init/TIMIT_test_word.h5"]

df_paths = ['./data/TIMIT_train.csv', 
            './data/TIMIT_test.csv']

# Presumably the receptive field (window) and hop (stride) of the wav2vec2
# feature extractor, in samples -- TODO confirm against the model config.
window = 400
stride = 320

feature_names = ['cnn_output', 'vq', 'projected_vq', 'encoder_output', 'context_vector']

for hf_path, df_path in zip(hf_paths, df_paths):
    # Read the CSV first so that a missing/broken CSV does not truncate an existing output file.
    df = pd.read_csv(df_path)

    # The context manager guarantees the HDF5 file is closed even if extraction fails midway.
    with h5py.File(hf_path, 'w') as hf:
        for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
            wav_id = row['wav_id']
            # Drop the 7-character suffix of wav_path and append 'WRD' to get the word-alignment file.
            word_path = row['wav_path'][:-7] + 'WRD'
            with open(word_path) as f:
                words = f.read().strip('\n').split('\n')
            wav, sr = sf.read(row['wav_path'], dtype='float32')
            for j, word in enumerate(words):
                # Each line is "<start_sample> <end_sample> <word>"; the end sample is included.
                word = word.split(' ')
                s = wav[int(word[0]):int(word[1])+1]
                word = word[2]

                if len(s) < window:
                    # Too short for a single frame: zero-pad up to one full window.
                    s = np.concatenate((s, np.zeros(window-len(s), dtype=np.float32)))
                else:
                    # More than 10 leftover samples: zero-pad so they form one extra frame;
                    # a shorter remainder is left as-is.
                    residual = (len(s) - window) % stride
                    if residual > 10:
                        s = np.concatenate((s, np.zeros(stride - residual, dtype=np.float32)))

                output = wav2vec2forward(wav2vec2, torch.tensor(s).unsqueeze(0).cuda(), aggregation=True)

                for feature in feature_names:
                    hf.create_dataset(f"{wav_id}-{word}_{j}-{feature}", data=output[feature].cpu().numpy())

4620it [11:12,  6.87it/s]
1680it [03:59,  7.00it/s]


In [54]:
!rm -rf ./outputs/extracted_features/wav2vec2_small-random_init/

In [56]:
!ls ./outputs/extracted_features/wav2vec2_small-random_init/

LibriSpeech_devclean_averaged.h5  TIMIT_train.h5	   VCTK_averaged.h5
TIMIT_test.h5			  TIMIT_train_averaged.h5
TIMIT_test_word.h5		  TIMIT_train_word.h5


In [47]:
hf.close()

In [60]:
# Phoneme-level extraction: every non-silence phoneme segment is run through wav2vec2
# on its own and stored per feature in TIMIT_<split>_phoneme-<feature>.h5; the phoneme
# vectors belonging to one word are then averaged and stored in the *_combine.h5 file.

hf_paths = ["./outputs/extracted_features/wav2vec2_small-random_init/TIMIT_train_word-phoneme_combine.h5", 
            "./outputs/extracted_features/wav2vec2_small-random_init/TIMIT_test_word-phoneme_combine.h5"]

df_paths = ['./data/TIMIT_train.csv', 
            './data/TIMIT_test.csv']

# Presumably the receptive field (window) and hop (stride) of the wav2vec2
# feature extractor, in samples -- TODO confirm against the model config.
window = 400
stride = 320

feature_names = ['cnn_output', 'vq', 'projected_vq', 'encoder_output', 'context_vector']


def _flush_word_features(word_file, wav_id, word, word_idx, accum):
    """Write the mean of the accumulated phoneme vectors of one word, then empty `accum`."""
    if len(accum['cnn_output']) == 0:
        return
    for feature in feature_names:
        stacked = torch.stack(accum[feature])
        word_file.create_dataset(f"{wav_id}-{word}_{word_idx}-{feature}", data=torch.mean(stacked, dim=0).numpy())
        accum[feature] = []


for hf_path, df_path in zip(hf_paths, df_paths):
    save_dir = os.path.split(hf_path)[0]
    # File names look like "TIMIT_<split>_..."; the second '_' field is the split name.
    split_type = os.path.split(hf_path)[1].split('_')[1]
    print(split_type)

    df = pd.read_csv(df_path)
    word_accum_features = defaultdict(list)
    hf_phonemes = {}
    hf = h5py.File(hf_path, 'w')
    try:
        for feature in feature_names:
            hf_phonemes[feature] = h5py.File(f"{save_dir}/TIMIT_{split_type}_phoneme-{feature}.h5", 'w')

        for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
            wav_id = row['wav_id']
            phoneme_path = row['wav_path'][:-7] + 'PHN'
            word_path = row['wav_path'][:-7] + 'WRD'
            with open(phoneme_path) as f:
                phonemes = f.read().strip('\n').split('\n')
            with open(word_path) as f:
                words = f.read().strip('\n').split('\n')
            wav, sr = sf.read(row['wav_path'], dtype='float32')
            word_idx = -1
            word_end_pos = -1
            current_word = None
            # Sanity report: utterances are expected to start and end with the 'h#' label.
            if phonemes[0].split(' ')[2] != 'h#':
                print(phonemes[0].split(' ')[2])
            if phonemes[-1].split(' ')[2] != 'h#':
                print(phonemes[-1].split(' ')[2])

            for j, p in enumerate(phonemes):
                p = p.split(' ')
                if p[2] == 'h#':
                    continue

                if int(p[0]) >= word_end_pos:
                    # This phoneme starts at/after the end of the current word:
                    # store the finished word and advance to the next one.
                    _flush_word_features(hf, wav_id, current_word, word_idx, word_accum_features)
                    word_idx += 1
                    try:
                        next_word = words[word_idx].split(' ')
                        word_end_pos, current_word = int(next_word[1]), next_word[2]
                    except (IndexError, ValueError):
                        # No (valid) word left for this phoneme, e.g. a phoneme after the
                        # last word of the utterance: skip it, as the original code did.
                        continue

                s = wav[int(p[0]):int(p[1])+1]
                if len(s) < window:
                    # Too short for a single frame: zero-pad up to one full window.
                    s = np.concatenate((s, np.zeros(window-len(s), dtype=np.float32)))
                else:
                    # More than 10 leftover samples: zero-pad so they form one extra frame.
                    residual = (len(s) - window) % stride
                    if residual > 10:
                        s = np.concatenate((s, np.zeros(stride - residual, dtype=np.float32)))
                output = wav2vec2forward(wav2vec2, torch.tensor(s).unsqueeze(0).cuda(), aggregation=True)

                for feature in feature_names:
                    value = output[feature].cpu()
                    word_accum_features[feature].append(value)
                    hf_phonemes[feature].create_dataset(f"{wav_id}-{current_word}_{word_idx}-{p[2]}_{j}-{feature}", data=value.numpy())

            # Flush the last word of this utterance here. Previously it was carried over
            # and written under the NEXT utterance's wav_id with index -1, and the last
            # word of the final utterance was never written at all.
            _flush_word_features(hf, wav_id, current_word, word_idx, word_accum_features)
    finally:
        # Always release every HDF5 handle, even when extraction fails midway.
        for phoneme_file in hf_phonemes.values():
            phoneme_file.close()
        hf.close()

0it [00:00, ?it/s]

train


4620it [37:00,  2.08it/s]
0it [00:00, ?it/s]

test


1680it [13:30,  2.07it/s]


In [114]:
from collections import defaultdict

# Count how often each word label appears in the .WRD files of the utterances in `df`
# (whichever split is currently loaded in the kernel).
words_list = defaultdict(int)
for i, row in df.iterrows():
    wav_id = row['wav_id']
    word_path = f"{row['wav_path'][:-7]}WRD"
    with open(word_path) as f:
        words = f.read().strip('\n').split('\n')
    for w in words:
        # Third space-separated field of each line is the word itself.
        label = w.split(' ')[2]
        words_list[label] = words_list[label] + 1

In [111]:
# Keep only the words that occur at least 7 times in the current `words_list` tally.
valid_word_list = {word for word, count in words_list.items() if count >= 7}

In [120]:
# Same threshold as above, applied to whatever `words_list` currently holds
# (presumably re-computed on the test split before this cell was run -- verify).
valid_word_list_test = {word for word, count in words_list.items() if count >= 7}

In [139]:
valid_words = valid_word_list_test.intersection(valid_word_list)

In [152]:
import pickle
# Persist the `valid_words` set so later runs can load it instead of recomputing it.
with open("data/TIMIT/valid_words.pkl", 'wb') as f:
    pickle.dump(valid_words, f)

In [105]:
dict(sorted(words_list.items(), key=lambda pair: pair[1], reverse=True))

{'the': 1603,
 'to': 1018,
 'in': 947,
 'a': 867,
 'that': 612,
 'she': 572,
 'an': 571,
 'your': 565,
 'all': 545,
 'had': 526,
 'like': 518,
 'me': 517,
 'and': 492,
 "don't": 488,
 'water': 479,
 'dark': 473,
 'year': 473,
 'oily': 470,
 'rag': 470,
 'wash': 469,
 'ask': 464,
 'carry': 463,
 'suit': 462,
 'greasy': 462,
 'of': 455,
 'is': 401,
 'you': 274,
 'are': 238,
 'was': 236,
 'he': 233,
 'for': 216,
 'his': 190,
 'with': 188,
 'be': 176,
 'it': 171,
 'on': 167,
 'we': 154,
 'this': 152,
 'they': 141,
 'by': 130,
 'her': 127,
 'from': 125,
 'as': 125,
 'have': 124,
 'not': 119,
 'but': 103,
 'will': 100,
 'i': 99,
 'do': 93,
 'him': 84,
 'my': 83,
 'or': 78,
 'no': 76,
 'were': 75,
 'at': 74,
 'can': 73,
 'new': 68,
 'up': 68,
 'would': 67,
 'every': 65,
 'now': 60,
 'our': 60,
 'how': 59,
 'each': 59,
 'their': 58,
 'big': 57,
 'so': 57,
 'often': 56,
 'may': 55,
 'has': 55,
 'out': 55,
 'never': 54,
 'many': 51,
 'into': 51,
 'if': 50,
 'saw': 50,
 'get': 50,
 'one': 49,
 'm

In [89]:
!cat ./data/data/TRAIN/DR1/MRAI0/SX72.WRD

2210 9400 spring
9400 16887 street
16887 20386 is
20386 24680 straight
24680 31040 ahead


In [93]:
s, sr = sf.read("./data/data/TRAIN/DR1/MRAI0/SX72.WAV.wav", dtype='float32')
IPython.display.Audio(data=s[31040 :31645    ], rate=sr)

In [43]:
def wav2vec2featurize(model, source, feature_name):
    """
    Inference function of pretrained wav2vec2 to extract intermediate representations
    Ref: https://github.com/pytorch/fairseq/blob/89ec6e7efff867d258947acafc57189b257212d0/fairseq/models/wav2vec/wav2vec2.py

    Runs the model only as far as needed and returns the single representation named
    by `feature_name`, with the leading batch axis squeezed away.

    Note: unlike wav2vec2forward this function does not disable autograd, so the
    result carries a grad_fn; wrap the call in torch.no_grad() for pure inference.

    Args:
        model: wav2vec2 model (same attributes as used by wav2vec2forward).
        source: input passed straight to model.feature_extractor.
        feature_name: one of 'cnn_output', 'vq', 'projected_vq', 'encoder_output',
            'context_vector'.

    Raises:
        AssertionError: if `feature_name` is not one of the names above.
        ValueError: if 'vq' or 'projected_vq' is requested from a model without a
            quantizer (previously this silently returned context vectors instead).
    """
    assert feature_name in ['cnn_output', 'vq', 'projected_vq', 'encoder_output', 'context_vector']
    cnn_features = model.feature_extractor(source)

    cnn_features = cnn_features.transpose(1, 2)

    if feature_name == 'cnn_output':
        return cnn_features.squeeze(0)

    features = model.layer_norm(cnn_features)

    if model.quantizer is not None: # this is not None in pretrained w2v
        q = model.quantizer(features, produce_targets=False)
        quantized_features = q["x"]
        if feature_name == 'vq':
            return quantized_features.squeeze(0)
        projected_quantized_features = model.project_q(quantized_features)
        if feature_name == 'projected_vq':
            return projected_quantized_features.squeeze(0)
    elif feature_name in ('vq', 'projected_vq'):
        # Without this guard the request would fall through to the encoder and the
        # caller would receive context vectors under the wrong name.
        raise ValueError(f"feature {feature_name!r} requested but the model has no quantizer")

    if model.post_extract_proj is not None: # this is not None in pretrained w2v
        features = model.post_extract_proj(features)

    if model.input_quantizer is not None: # this is None in pretrained w2v
        q = model.input_quantizer(features, produce_targets=False)
        features = q['x']
        features = model.project_inp(features)

    encoder_outputs, encoder_layers_features = model.encoder(features, padding_mask=None, layer=None)

    if feature_name == 'encoder_output':
        return encoder_outputs.squeeze(0)

    context_vectors = model.final_proj(encoder_outputs)

    return context_vectors.squeeze(0)

In [46]:
wav2vec2featurize(wav2vec2, torch.tensor(s[39561:40313]).unsqueeze(0), 'encoder_output')

tensor([[ 0.0458,  1.0363, -0.3802,  ..., -1.2763,  0.5118, -0.3924],
        [-0.7767, -0.6042, -0.1070,  ..., -0.0194, -0.7107, -1.8827]],
       grad_fn=<SqueezeBackward1>)

In [45]:
s, sr = sf.read("./data/data/TRAIN/DR1/FCJF0/SA1.WAV.wav", dtype='float32')
IPython.display.Audio(data=s[4559:5723], rate=sr)

In [72]:
wav2vec2

Wav2Vec2Model(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
        (3): GELU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (4): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (5

# Word representations in utterance

In [None]:
# stride = 320
# window = 400

# for i, row in tqdm.tqdm(df.iterrows()):
#     wav_id = row['wav_id']
#     word_path = row['wav_path'][:-7] + 'WRD'
#     with open(word_path) as f:
#         words = f.read().strip('\n').split('\n')
#     wav, sr = sf.read(row['wav_path'], dtype='float32')
#     for j, word in enumerate(words):
#         word = word.split(' ')
#         start, end = int(word[0]), int(word[1])
#         start_frame = start // stride
#         end_frame = start_frame + (end - start + 1) // window
#         word = word[2]
#         s = wav[:start]
#         if start % stride != 0:
#             start_frame += 1
#             s = np.concatenate((np.zeros(stride - start % stride, dtype=np.float32), s))
#         s = np.concatenate((s, wav[start:end+1]))
#         if (end - start + 1) % window != 0:
#             s = np.concatenate((s, np.zeros(window - (end - start + 1) % window, dtype=np.float32)))
#             end_frame += 1

#         s = np.concatenate((s, wav[end + 1:]))
        
#         with torch.no_grad():
#             output = wav2vec2forward(wav2vec2, torch.tensor(s).unsqueeze(0).cuda(), aggregation=False)    

#         hf.create_dataset(f"{wav_id}-{word}_{j}-cnn_output", data=torch.mean(output['cnn_output'].cpu()[start_frame:end_frame], dim=0))
#         hf.create_dataset(f"{wav_id}-{word}_{j}-vq", data=torch.mean(output['vq'].cpu()[start_frame:end_frame], dim=0))
#         hf.create_dataset(f"{wav_id}-{word}_{j}-projected_vq", data=torch.mean(output['projected_vq'].cpu()[start_frame:end_frame], dim=0))
#         hf.create_dataset(f"{wav_id}-{word}_{j}-encoder_output", data=torch.mean(output['encoder_output'].cpu()[start_frame:end_frame], dim=0))
#         hf.create_dataset(f"{wav_id}-{word}_{j}-context_vector", data=torch.mean(output['context_vector'].cpu()[start_frame:end_frame], dim=0))

400it [01:10,  6.67it/s]

In [120]:
# Word-in-utterance extraction: the whole utterance is fed to wav2vec2 (with zero
# padding inserted so that the word lines up with frame boundaries) and only the
# output frames attributed to the word are averaged and stored.
# Presumably `window`/`stride` are the receptive field and hop of the feature
# extractor, in samples -- TODO confirm against the model config.
stride = 320
window = 400

hf_paths = ["./outputs/extracted_features/wav2vec2_small-random_init//TIMIT_train_word_in_utterance.h5", 
            "./outputs/extracted_features/wav2vec2_small-random_init//TIMIT_test_word_in_utterance.h5"]

df_paths = ['./data/TIMIT_train.csv', 
            './data/TIMIT_test.csv']

for hf_path, df_path in zip(hf_paths, df_paths):
    # NOTE(review): the file is only closed at the end of the loop body; an exception
    # in between leaves the handle open.
    hf = h5py.File(hf_path, 'w')
    df = pd.read_csv(df_path)
    for i, row in tqdm.tqdm(df.iterrows()):
        wav_id = row['wav_id']
        # Drop the 7-character suffix of wav_path and append 'WRD' to get the word-alignment file.
        word_path = row['wav_path'][:-7] + 'WRD'
        with open(word_path) as f:
            words = f.read().strip('\n').split('\n')
        wav, sr = sf.read(row['wav_path'], dtype='float32')
        for j, word in enumerate(words):
            # Each line is "<start_sample> <end_sample> <word>".
            word = word.split(' ')
            start, end = int(word[0]), int(word[1])
            word = word[2]
            ## align start frame

            # Everything up to and including the word's last sample.
            s = wav[:end + 1]
            # Prepend zeros so that (start - window) becomes a multiple of stride;
            # `start` and `end` are moved by the same amount.
            if start < window:
                shift = window - start
                s = np.concatenate((np.zeros(shift, dtype=np.float32), s))
                start += shift
                end += shift
            else:
                # shift is in 1..stride: an already aligned start is still moved by a full stride.
                shift = stride - (start - window) % stride
                s = np.concatenate((np.zeros(shift, dtype=np.float32), s))
                start += shift
                end += shift

            word_len = (end - start + 1)
            start_frame = (start - window) // stride

            if word_len < window:
                # Word shorter than one window: it is attributed to a single frame.
                end_frame = start_frame
                # NOTE(review): these zeros are prepended at the front, which moves the word
                # further right, but `start`/`start_frame` are not updated -- confirm intended.
                s = np.concatenate((np.zeros(window - word_len, dtype=np.float32), s))
            else:
                end_frame = start_frame + (word_len - window) // stride
                residual = (word_len - window) % stride
                # 10 or more leftover samples count as one extra frame (zeros again go in front).
                if residual >= 10:
                    s = np.concatenate((np.zeros(stride - residual, dtype=np.float32), s))
                    end_frame += 1

            # Append the rest of the utterance after the word.
            # NOTE(review): `end` already includes `shift`, so this slice of the original
            # `wav` starts `shift` samples after the word's real end -- confirm intended.
            s = np.concatenate((s, wav[end + 1:]))

            with torch.no_grad():
                output = wav2vec2forward(wav2vec2, torch.tensor(s).unsqueeze(0).cuda(), aggregation=False)    

            # Store the mean over frames start_frame..end_frame (inclusive) for each representation.
            for feature in ['cnn_output', 'vq', 'projected_vq', 'encoder_output', 'context_vector']:
                hf.create_dataset(f"{wav_id}-{word}_{j}-{feature}", data=torch.mean(output[feature].cpu()[start_frame:end_frame+1], dim=0))
    hf.close()

4620it [11:58,  6.43it/s]
1680it [04:24,  6.34it/s]


In [114]:
!rm -rf ./outputs/extracted_features/wav2vec2_small-random_init//TIMIT_train_word_in_utterance.h5

In [108]:
# Scratch check of the frame-alignment arithmetic for one hand-picked word span.
# Relies on `s`, `window` and `stride` already being defined in the kernel.
# NOTE(review): this differs from the extraction loop in two places: start_frame has
# an extra "+ 1" here, and the residual test is "> 10" here versus ">= 10" in the loop.
start = 400
end = 830

# Prepend zeros so that (start - window) becomes a multiple of stride.
if start < window:
    shift = window - start
    s = np.concatenate((np.zeros(shift, dtype=np.float32), s))
    start += shift
    end += shift
else:
    shift = stride - (start - window) % stride
    s = np.concatenate((np.zeros(shift, dtype=np.float32), s))
    start += shift
    end += shift
    
print(start, end)
    
word_len = (end - start + 1)
start_frame = (start - window) // stride + 1

if word_len < window:
    end_frame = start_frame
else:
    end_frame = start_frame + (word_len - window) // stride
    residual = (word_len - window) % stride
    # More than 10 leftover samples count as one extra frame.
    if residual > 10:
        end_frame += 1
        
print(start_frame)
print(end_frame)

720 1150
2
3


In [77]:
print(start_frame)
print(end_frame)

3
3


In [76]:
end_frame

3

In [65]:
!mv ./outputs/extracted_features/wav2vec2_small_960h/TIMIT_test_word_in_utterance.h5 ./outputs/extracted_features/wav2vec2_small_960h/TIMIT_test_word_in_utterance.h5.old

In [51]:
start

33770

In [60]:
len(s)

36566

In [62]:
# s, sr = sf.read(row['wav_path'], dtype='float32')
IPython.display.Audio(data=s, rate=sr)

In [41]:
word

'tips'

In [190]:
torch.mean(output['cnn_output'].cpu()[start_frame:end_frame], dim=0)

tensor([ 1.6804e-03, -1.5472e-03,  3.7947e-03,  2.7861e-03,  4.9227e-03,
         6.3696e-03, -1.0799e-03,  8.0385e-03,  9.7318e-03, -1.5368e-02,
         7.6208e-03, -3.2176e-03,  5.7563e-03, -5.8533e-03,  9.3264e-03,
         4.5652e-03,  1.0227e-02,  2.9289e-03, -1.1028e-02,  2.6695e-03,
        -1.5716e-02, -1.8326e-02, -1.2468e-02,  8.4183e-03, -1.1944e-02,
         2.3955e-04, -1.4522e-03,  6.8531e-03,  1.5572e-02, -3.0073e-03,
        -1.1049e-02,  1.0676e-02, -7.1369e-03,  1.3206e-02,  9.1090e-03,
        -4.0687e-03,  1.3036e-03, -5.4596e-03,  1.9076e-03, -1.0222e-02,
        -6.3178e-03, -1.1119e-02,  1.5601e-04,  5.7615e-04, -1.2921e-02,
         6.6013e-03, -2.0192e-03,  9.9866e-03, -5.2884e-03,  1.3517e-02,
         1.4157e-03, -1.1710e-02, -9.6261e-03, -1.2148e-02,  1.5904e-02,
        -2.3539e-03, -6.9669e-03,  1.9262e-03,  3.8302e-03,  8.0981e-03,
        -1.1971e-03,  1.7010e-03, -1.7203e-03,  3.0630e-02, -5.0514e-04,
        -1.6890e-02,  4.7542e-03, -1.0574e-02,  7.6

In [82]:
hf = h5py.File("./outputs/extracted_features/wav2vec2_small_960h/TIMIT_train_word_in_utterance.h5", 'w')

In [33]:
hf = h5py.File("./outputs/extracted_features/wav2vec2_small/TIMIT_train_word_in_utterance.h5", 'r')

In [67]:
!rm -rf ./outputs/extracted_features/wav2vec2_small_960h//TIMIT_train_word_in_utterance.h5

In [34]:
# Report the first dataset in the open HDF5 file that contains a NaN, then stop.
for key in hf.keys():
    values = hf[key][:]
    if not np.isnan(values).any():
        continue
    print(key, values)
    break

DR1_FDML0_SI1149-of_2-cnn_output [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan n

In [89]:
# Print the row index of every utterance that has a word starting at sample 0
# (once per such word).
df = pd.read_csv(df_paths[0])
for i, row in tqdm.tqdm(df.iterrows()):
    wav_id = row['wav_id']
    word_path = f"{row['wav_path'][:-7]}WRD"
    with open(word_path) as f:
        words = f.read().strip('\n').split('\n')

    for j, word in enumerate(words):
        word = word.split(' ')
        if int(word[0]) != 0:
            continue
        print(i)

4620it [00:10, 448.65it/s]


In [122]:
a =  torch.ones((2,1,3))

In [124]:
a

tensor([[[1., 1., 1.]],

        [[1., 1., 1.]]])