In [14]:
import json
from collections import defaultdict
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# glob() yields paths in arbitrary filesystem order; sorting makes the order in
# which files are read (and therefore the order of the accumulated samples)
# reproducible across runs and machines.
filtered = sorted(glob('tts-filtered/*.json'))
len(filtered)

1645455

In [12]:
# Per-speaker accumulators: every JSON file contributes its 'averaged_pitch'
# and 'distances' lists to the entry of the speaker named in that file.
pitches, distances = defaultdict(list), defaultdict(list)

for f in tqdm(filtered):
    with open(f) as fopen:
        d = json.load(fopen)
    # The file is fully parsed at this point, so the handle can already be closed.
    speaker = d['speaker']
    pitches[speaker].extend(d['averaged_pitch'])
    distances[speaker].extend(d['distances'])

100%|██████████| 1645455/1645455 [00:59<00:00, 27722.52it/s]


In [80]:
# Quantile bin edges of the 'distances' values, computed separately per speaker.
speaker_rate_bins = {}

for k, speaker_distances in distances.items():
    # Only values strictly inside (0, 1) take part in the quantile estimate.
    rates = [r for r in speaker_distances if r > 0 and r < 1]
    # q=5 requests five quantile bins; retbins=True additionally returns the
    # bin edges, which are the only part that is stored.
    _, bin_edges = pd.qcut(rates, q=5, retbins=True)
    speaker_rate_bins[k] = bin_edges.tolist()

In [81]:
# Display the per-speaker speaking-rate bin edges.
speaker_rate_bins

{'husein': [0.001, 0.008, 0.012, 0.02, 0.036, 0.98],
 'idayu': [0.001, 0.009, 0.012, 0.02, 0.03, 0.98]}

In [82]:
# Quantile bin edges of the 'averaged_pitch' values, computed separately per speaker.
pitch_bins = {}

for k, rates in pitches.items():
    # Despite its name, `rates` holds the speaker's pitch values in this loop.
    # q=5 requests five quantile bins; retbins=True also returns their edges.
    _, bin_edges = pd.qcut(rates, q=5, retbins=True)
    pitch_bins[k] = bin_edges.tolist()

pitch_bins

{'husein': [74.98, 106.661, 116.258, 128.275, 151.404, 599.721],
 'idayu': [74.994, 194.709, 208.025, 222.457, 243.88, 599.999]}

In [83]:
def binarize_speaking_rate(samples, bin_edges):
    """Map each sample to the integer index of the bin it falls into.

    Samples below the smallest edge or above the largest edge are clamped to
    that edge first, so out-of-range values land in the first or last bin
    instead of being left unlabelled.

    Parameters
    ----------
    samples : array-like of numbers
        Values to discretise. The input is never modified.
    bin_edges : array-like of numbers
        Bin edges as accepted by ``pandas.cut``; ``len(bin_edges) - 1`` bins
        are produced.

    Returns
    -------
    list
        One label per sample, in ``range(len(bin_edges) - 1)``.
    """
    bin_edges = np.asarray(bin_edges)
    lowest, highest = bin_edges.min(), bin_edges.max()
    # np.clip returns a new array: the caller's data is left untouched, and an
    # integer-typed input is promoted rather than having the float edge
    # truncated into it (which could push the value back outside the bins).
    clipped = np.clip(samples, lowest, highest)
    labels = list(range(bin_edges.shape[0] - 1))
    # include_lowest=True makes the first bin closed on the left so that values
    # equal to the smallest edge still receive label 0.
    return pd.cut(clipped, bins=bin_edges, labels=labels, include_lowest=True).tolist()

In [84]:
# Sanity-check the binning on the first ten pitch values of one speaker.
# The samples are looked up explicitly rather than through a loop variable left
# over from an earlier cell, so the check does not depend on which speaker
# happened to be iterated last.
binarize_speaking_rate(np.array(pitches['idayu'][:10]), np.array(pitch_bins['idayu']))

[4, 3, 2, 4, 2, 3, 2, 1, 2, 1]

In [85]:
# Persist both sets of per-speaker bin edges in a single JSON document.
with open('bins.json', 'w') as fopen:
    payload = {
        'speaking_rate': speaker_rate_bins,
        'pitch': pitch_bins,
    }
    json.dump(payload, fopen)