Skip to content

Commit

Permalink
sinhala changes
Browse files Browse the repository at this point in the history
  • Loading branch information
pathnirvana committed Jan 29, 2021
1 parent 54139f6 commit a42f9d2
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,6 @@ TODO.txt
data/*
notebooks/data/*
TTS/tts/layers/glow_tts/monotonic_align/core.c

.vscode
temp_build
2 changes: 1 addition & 1 deletion TTS/bin/compute_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def main():
description="Compute mean and variance of spectrogtram features.")
parser.add_argument("--config_path", type=str, required=True,
help="TTS config file path to define audio processin parameters.")
parser.add_argument("--out_path", default=None, type=str,
parser.add_argument("--out_path", default=None, type=str, required=True,
help="directory to save the output file.")
args = parser.parse_args()

Expand Down
8 changes: 8 additions & 0 deletions TTS/setup_datasets.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Create the model output and phoneme cache directories that
# TTS/tts/configs/config.json points at.
mkdir -p /home/models/sinhala /home/models/phoneme_cache

# Download and unpack the Sinhala dataset. The steps are chained with '&&' so
# that a failed mkdir/cd never leaves the archive in the wrong directory and a
# failed download is never passed on to unzip.
mkdir -p /home/datasets/sinhala &&
cd /home/datasets/sinhala &&
wget https://github.com/pathnirvana/tacotron2/releases/download/1/sinhala.zip &&
unzip sinhala.zip
# Always finish in /home, as the original script did.
cd /home
22 changes: 13 additions & 9 deletions TTS/tts/configs/config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"model": "Tacotron2",
"run_name": "ljspeech-ddc",
"run_name": "sinhala-tacotron2-ddc",
"run_description": "tacotron2 with DDC and differential spectral loss.",

// AUDIO PARAMETERS
Expand Down Expand Up @@ -37,7 +37,8 @@
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
// to generate the stats file run: python TTS/bin/compute_statistics.py --config_path TTS/tts/configs/config.json --out_path=/home/datasets/sinhala/scale_stats.npy
"stats_path": "/home/datasets/sinhala/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
},

// VOCABULARY PARAMETERS
Expand Down Expand Up @@ -135,12 +136,13 @@
"use_noise_augment": true,

// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
"output_path": "/home/models/sinhala/",

// PHONEMES
"phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"phoneme_cache_path": "/home/models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
"phoneme_language": "kn", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// for Sinhala use "kn" and pass language_switch='remove-flags' when phonemize is called - the input text should be in Sinhala letters

// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
Expand All @@ -162,10 +164,12 @@
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
"meta_file_val": null
"name": "sinhala", // "ljspeech",
"path": "/home/datasets/sinhala/", //"/home/erogol/Data/LJSpeech-1.1/",
//"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
//"meta_file_val": null,
"meta_file_train": "train_filelist.txt",
"meta_file_val": "val_filelist.txt"
}
]
}
Expand Down
13 changes: 13 additions & 0 deletions TTS/tts/datasets/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,19 @@ def ljspeech(root_path, meta_file):
return items


def sinhala(root_path, meta_file):
    """Normalizes the Sinhala meta data file to TTS format

    Each non-empty line of the meta file is expected to look like
    ``<wav path>|<text>[|...]``; any further columns are ignored.

    Args:
        root_path (str): directory that contains ``meta_file``.
        meta_file (str): name of the '|' separated file list.

    Returns:
        list: one ``[text, wav_file, speaker_name]`` item per non-empty line.

    Raises:
        IndexError: if a non-empty line has no '|' separated text column.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "sinhala"  # TODO: take the speaker id from the dataset if needed
    # The transcripts are Sinhala (non-ASCII) text, so the encoding is pinned
    # instead of relying on the platform default.
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            # Drop the line terminator so it does not end up inside the text
            # column, and tolerate blank lines (e.g. at the end of the file).
            line = line.strip()
            if not line:
                continue
            cols = line.split('|')
            wav_file = cols[0]  # the path stored in the file list is used as-is
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items


def nancy(root_path, meta_file):
"""Normalizes the Nancy meta data file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
Expand Down
2 changes: 1 addition & 1 deletion TTS/tts/utils/text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def text2phone(text, language):
#try:
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
if version.parse(phonemizer.__version__) < version.parse('2.1'):
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, language_switch='remove-flags')
ph = ph[:-1].strip() # skip the last empty character
# phonemizer does not tackle punctuations. Here we do.
# Replace \n with matching punctuations.
Expand Down

0 comments on commit a42f9d2

Please sign in to comment.