# tira_asr_noise
Measure correlation between various nosie measures on Tira ASR dataset
Sample 5 rows from each 10 bins of each measure and set aside for hand eval,
then visualize correlation of hand eval with objective measures.

In [1]:
from datasets import load_from_disk
from scipy.stats import pearsonr
from scipy.stats.mstats import mquantiles
import numpy as np
import torchaudio
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds_path = r'C:\projects\malachor5\data\dataset_clips\tira-asr\unprocessed_audio_ds'
hand_eval_dir = r"C:\projects\malachor5\data\dataset_clips\tira-asr\hand-eval"
os.makedirs(hand_eval_dir, exist_ok=True)

In [3]:
ds = load_from_disk(ds_path)
ds

Dataset({
    features: ['audio', 'clip_name', 'index', 'vad_chunks', 'speech_embed', 'text_embed', 'embed_cos_sim', 'wada_snr', 'nist_snr', 'duration', 'vad_duration', 'vad_pct'],
    num_rows: 23261
})

In [4]:
quality_measures = ['vad_duration', 'vad_pct', 'embed_cos_sim', 'wada_snr', 'nist_snr']
for i, measure_1 in enumerate(quality_measures[:-1]):
    for j, measure_2 in enumerate(quality_measures[i+1:]):
        corr = pearsonr(ds[measure_1], ds[measure_2])
        print(measure_1, measure_2, corr)

vad_duration vad_pct PearsonRResult(statistic=np.float64(-0.058665296830645614), pvalue=np.float64(3.409606635791288e-19))
vad_duration embed_cos_sim PearsonRResult(statistic=np.float64(-0.047300970979111956), pvalue=np.float64(5.284043368062316e-13))
vad_duration wada_snr PearsonRResult(statistic=np.float64(0.08268366055430638), pvalue=np.float64(1.4155284764961584e-36))
vad_duration nist_snr PearsonRResult(statistic=np.float64(0.14086636331862154), pvalue=np.float64(2.1873407475885328e-103))
vad_pct embed_cos_sim PearsonRResult(statistic=np.float64(0.18004735228096272), pvalue=np.float64(1.0561006451866381e-168))
vad_pct wada_snr PearsonRResult(statistic=np.float64(-0.3433066376292787), pvalue=np.float64(0.0))
vad_pct nist_snr PearsonRResult(statistic=np.float64(-0.29215886904184274), pvalue=np.float64(0.0))
embed_cos_sim wada_snr PearsonRResult(statistic=np.float64(-0.028318656206717613), pvalue=np.float64(1.5626254087926876e-05))
embed_cos_sim nist_snr PearsonRResult(statistic=np.f

In [5]:
rows_added = []
bin_sample_n = 5
for measure in quality_measures:
    print(measure)
    bins = mquantiles(ds[measure], prob=np.linspace(0,1,10))
    binned_measure = np.digitize(ds[measure], bins)
    for bin in range(1, bins.shape[0]+1):
        # print(bin, binned_measure[binned_measure==bin].shape)
        bin_idcs = np.argwhere(binned_measure==bin).squeeze()
        for _ in range(bin_sample_n):
            new_idx = np.random.choice(bin_idcs)
            while new_idx in rows_added:
                new_idx = np.random.choice(bin_idcs)
            rows_added.append(new_idx)
rows_added[:5], len(rows_added)

vad_duration
vad_pct
embed_cos_sim
wada_snr
nist_snr


([np.int64(5039),
  np.int64(15529),
  np.int64(17738),
  np.int64(6114),
  np.int64(2741)],
 250)

In [6]:
hand_eval_ds = ds.select(rows_added)
hand_eval_ds

Dataset({
    features: ['audio', 'clip_name', 'index', 'vad_chunks', 'speech_embed', 'text_embed', 'embed_cos_sim', 'wada_snr', 'nist_snr', 'duration', 'vad_duration', 'vad_pct'],
    num_rows: 250
})

In [7]:
wav_files = []
def row_to_wav(row, outdir: str) -> None:
    """
    Reads samples from a Dataset Audio row (in the form of a dictionary)
    and save as a wav in the specified `outdir`.
    """
    basepath = os.path.basename(row['clip_name'])
    path = os.path.join(outdir, basepath)
    wav_files.append(path)
    wav_array = row['audio']['array']
    sr = row['audio']['sampling_rate']
    # cast to torch tensor and make 2D
    wav_tensor = torch.Tensor(wav_array).unsqueeze(0)
    torchaudio.save(path, wav_tensor, sr)
    return {"filename": os.path.basename(path)}

df = hand_eval_ds.map(lambda row: row_to_wav(row, hand_eval_dir), remove_columns=hand_eval_ds.column_names).to_pandas()
likert_cols = [
    "Audio quality",
    "Crosstalk",
    "Only Tira spoken?",
    "Disfluencies?",
    "Tira transcription accuracy",
]
for col in likert_cols:
    df[col]=''
df.to_excel(os.path.join(hand_eval_dir, 'eval.xlsx'))


Map: 100%|██████████| 250/250 [00:03<00:00, 65.70 examples/s] 
