sinhala changes

pathnirvana · Jan 29, 2021 · a42f9d2 · a42f9d2
1 parent 54139f6
commit a42f9d2
Show file tree

Hide file tree

Showing 6 changed files with 39 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -130,3 +130,6 @@ TODO.txt
 data/*
 notebooks/data/*
 TTS/tts/layers/glow_tts/monotonic_align/core.c
+
+.vscode
+temp_build
diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
@@ -19,7 +19,7 @@ def main():
         description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("--config_path", type=str, required=True,
                         help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("--out_path", default=None, type=str,
+    parser.add_argument("--out_path", default=None, type=str, required=True,
                         help="directory to save the output file.")
     args = parser.parse_args()
 

diff --git a/TTS/setup_datasets.sh b/TTS/setup_datasets.sh
@@ -0,0 +1,8 @@
+mkdir -p /home/models/sinhala  /home/models/phoneme_cache
+
+cd /home
+mkdir -p datasets/sinhala
+cd datasets/sinhala
+wget https://github.com/pathnirvana/tacotron2/releases/download/1/sinhala.zip
+unzip sinhala.zip
+cd /home
diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json
@@ -1,6 +1,6 @@
 {
     "model": "Tacotron2",
-    "run_name": "ljspeech-ddc",
+    "run_name": "sinhala-tacotron2-ddc",
     "run_description": "tacotron2 with DDC and differential spectral loss.",
 
     // AUDIO PARAMETERS
@@ -37,7 +37,8 @@
         "symmetric_norm": true, // move normalization to range [-1, 1]
         "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
         "clip_norm": true,      // clip normalized values into the range.
-        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
+        // python TTS/bin/compute_statistics.py --config_path TTS/tts/configs/config.json --out_path=/home/datasets/sinhala/scale_stats.npy
+        "stats_path": "/home/datasets/sinhala/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
     },
 
     // VOCABULARY PARAMETERS
@@ -135,12 +136,13 @@
     "use_noise_augment": true,
 
     // PATHS
-    "output_path": "/home/erogol/Models/LJSpeech/",
+    "output_path": "/home/models/sinhala/",
 
     // PHONEMES
-    "phoneme_cache_path": "/home/erogol/Models/phoneme_cache/",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "phoneme_cache_path": "/home/models/phoneme_cache/",  // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
-    "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
+    "phoneme_language": "kn",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
+    // For Sinhala, use "kn" and pass language_switch='remove-flags' when phonemize is called; the input text should be in Sinhala letters.
 
     // MULTI-SPEAKER and GST
     "use_speaker_embedding": false,      // use speaker embedding to enable multi-speaker learning.
@@ -162,10 +164,12 @@
     "datasets":   // List of datasets. They all merged and they get different speaker_ids.
         [
             {
-                "name": "ljspeech",
-                "path": "/home/erogol/Data/LJSpeech-1.1/",
-                "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
-                "meta_file_val": null
+                "name": "sinhala", // "ljspeech",
+                "path": "/home/datasets/sinhala/", //"/home/erogol/Data/LJSpeech-1.1/",
+                //"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
+                //"meta_file_val": null,
+                "meta_file_train": "train_filelist.txt",
+                "meta_file_val": "val_filelist.txt"
             }
         ]
 }

diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
@@ -166,6 +166,19 @@ def ljspeech(root_path, meta_file):
     return items
 
 
+def sinhala(root_path, meta_file):
+    """Load a `wav|text` filelist into [text, wav_file, speaker_name] items; the wav column is used as-is."""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "sinhala"  # TODO: take the speaker id from the sinhala dataset if needed
+    with open(txt_file, 'r', encoding='utf-8') as ttf:  # Sinhala text is non-ASCII; do not rely on the locale default
+        for line in ttf:
+            cols = line.strip().split('|')  # strip() keeps the trailing newline out of the last column
+            wav_file, text = cols[0], cols[1]
+            items.append([text, wav_file, speaker_name])
+    return items
+
+
 def nancy(root_path, meta_file):
     """Normalizes the Nancy meta data file to TTS format"""
     txt_file = os.path.join(root_path, meta_file)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
@@ -33,7 +33,7 @@ def text2phone(text, language):
     #try:
     punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
     if version.parse(phonemizer.__version__) < version.parse('2.1'):
-        ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
+        ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, language_switch='remove-flags')
         ph = ph[:-1].strip() # skip the last empty character
         # phonemizer does not tackle punctuations. Here we do.
         # Replace \n with matching punctuations.