/
data_proc.py
107 lines (86 loc) · 3.81 KB
/
data_proc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Philip Anastassiou (pja2114)
COMS 6998: Fundamentals of Speech Recognition
Professor Homayoon Beigi
Columbia University
Due: December 19th, 2021
Citation for original authors of this file:
@misc{sammutbonnici2021timbre,
title={Timbre Transfer with Variational Auto Encoding and Cycle-Consistent Adversarial Networks},
author={Russell Sammut Bonnici and Charalampos Saitis and Martin Benning},
year={2021},
eprint={2109.02096},
archivePrefix={arXiv},
primaryClass={cs.SD}
}
@inproceedings{AlBadawy2020,
author={Ehab A. AlBadawy and Siwei Lyu},
title={{Voice Conversion Using Speech-to-Speech Neuro-Style Transfer}},
year=2020,
booktitle={Proc. Interspeech 2020},
pages={4726--4730},
doi={10.21437/Interspeech.2020-3056},
url={http://dx.doi.org/10.21437/Interspeech.2020-3056}
}
Re-implemented in TensorFlow 2.0 by Philip Anastassiou (Github: @phiana)
This is now a self-contained class that prepares a dictionary of random
samples of log-scaled mel-spectrograms of both speakers. Previously, this
class object was passed into a 'torch.utils.data.DataLoader' object. Being a
TensorFlow implementation, I opted to remove the PyTorch infrastructure and
re-implemented this class to contain all necessary functionality from the
DataLoader object. In the future, 'tf.data' should be used instead.
"""
import numpy as np
import pickle
import random
import tensorflow as tf
from params import num_samples
class DataProc():
def __init__(self, args, split):
self.args = args
self.data_dict = pickle.load(open('%s/data_%s.pickle'%(args.dataset, split),'rb'))
self.n_spkrs = len(self.data_dict.keys()) # pja2114: Added class attribute
def __len__(self):
total_len = 0
for i in range(len(self.data_dict.keys())):
tmp = np.sum([j.shape[1] for j in self.data_dict[i]])
total_len = max(total_len,tmp / 128)
return int(total_len)
def __getitem__(self):
rslt = []
for i in range(0, self.n_spkrs):
# Chose random item based on prop distribution (length of each sample)
tmp_lens = [j.shape[1] for j in self.data_dict[i]]
item = np.random.choice(len(tmp_lens), p=tmp_lens / np.sum(tmp_lens))
rslt.append(self.random_sample(i, item))
# Prepares a random sample per speaker
samples = {}
for i in range(0, self.n_spkrs):
samples[i] = rslt[i] # pja2114: Updated syntax; previous formatting: "samples[i] = np.array(rslt)[i, :]"
return samples
# pja2114: Modified to work with TensorFlow Tensor objects (channels-last formatting needed)
def random_sample(self, i, item):
n_samples = num_samples # 128
data = self.data_dict[i][item]
assert data.shape[1] >= n_samples
rand_i = random.randint(0,data.shape[1]-n_samples)
data = np.array(data[:, rand_i:rand_i+n_samples])
data = data.reshape((data.shape[0], data.shape[1], 1)).astype('float32')
return data
# pja2114: A new method I wrote to prepare a batch of random samples for training
def prepare_batch(self, batch_size):
batch = {}
for i in range(batch_size):
result = self.__getitem__()
for spkr in range(0, self.n_spkrs):
if i == 0:
batch[spkr] = []
batch[spkr].append(result[spkr])
for spkr in range(0, self.n_spkrs):
batch[spkr] = np.array(batch[spkr])
batch[spkr] = tf.convert_to_tensor(batch[spkr], dtype=tf.float32)
return batch
# Not currently used, but feel free to add for additional robustness in final model
def augment(self,data,sample_rate=16000, pitch_shift=0.5):
if pitch_shift == 0 : return data
return librosa.effects.pitch_shift(data, sample_rate, pitch_shift)