### Improved `facebook/mms-tts-kir` model: Kyrgyz text → speech

In [5]:
import torch
from transformers import VitsModel, AutoTokenizer
import numpy as np
from pydub import AudioSegment
import os
import uuid
import scipy.io.wavfile
from pydub import AudioSegment
import time

In [6]:

class TextToSpeechSynthesizer:
    """Turn text into a speech audio file using a pretrained VITS checkpoint.

    The model and tokenizer are loaded once in ``__init__``; every call to
    ``synthesize`` runs inference and writes a new, uniquely named audio file
    into ``output_dir``.
    """

    # Container/codec names accepted by ``synthesize``.
    SUPPORTED_FORMATS = ("mp3", "wav")

    def __init__(self, model_name="facebook/mms-tts-kir", output_dir="./media/voices"):
        """Load the model/tokenizer and make sure the output directory exists.

        Args:
            model_name: Identifier passed to ``VitsModel.from_pretrained`` and
                ``AutoTokenizer.from_pretrained``.
            output_dir: Directory that receives the generated audio files. It
                is created if it does not exist yet.
        """
        self.model = VitsModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    @staticmethod
    def _to_pcm16(waveform):
        """Convert a float waveform to a flat array of 16-bit signed PCM samples.

        Samples are clipped to [-1.0, 1.0] before scaling so out-of-range
        values saturate instead of wrapping around.
        # NOTE(review): assumes the model emits floats nominally in [-1, 1] — confirm.
        """
        samples = np.asarray(waveform, dtype=np.float32).reshape(-1)
        clipped = np.clip(samples, -1.0, 1.0)
        # Explicit little-endian int16: the layout both WAV files and pydub's
        # raw-data constructor work with.
        return (clipped * 32767.0).astype(np.dtype("<i2"))

    def synthesize(self, text, audio_format="mp3"):
        """Synthesize ``text`` and save the audio under ``self.output_dir``.

        Args:
            text: The text to speak (a single, non-blank string).
            audio_format: ``"mp3"`` (default) or ``"wav"``; case-insensitive,
                a leading dot is ignored. MP3 encoding goes through pydub,
                which needs ffmpeg/libav available at runtime.

        Returns:
            Path of the written file, ``<output_dir>/<uuid4>.<audio_format>``.

        Raises:
            ValueError: If ``text`` is blank, ``audio_format`` is unsupported,
                or the tokenizer produces no tokens.
            RuntimeError: Re-raised from model inference after being printed.
        """
        # Fail fast on bad arguments, before any tokenization or inference.
        if text is None or (isinstance(text, str) and not text.strip()):
            raise ValueError("Input text is empty. Provide non-blank text to synthesize.")

        fmt = str(audio_format).lower().lstrip(".")
        if fmt not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported audio format {audio_format!r}; expected one of {self.SUPPORTED_FORMATS}."
            )

        inputs = self.tokenizer(text, return_tensors="pt")
        # Cast token ids to int64 before they are passed to the model.
        inputs['input_ids'] = inputs['input_ids'].long()

        if inputs['input_ids'].size(1) == 0:
            raise ValueError("Tokenized input is empty. Check the input text or tokenizer settings.")

        try:
            with torch.no_grad():
                outputs = self.model(**inputs)
        except RuntimeError as e:
            print("Error during model inference:", e)
            raise

        pcm = self._to_pcm16(outputs.waveform.squeeze().cpu().numpy())
        sampling_rate = self.model.config.sampling_rate

        uid = str(uuid.uuid4())
        audio_path = os.path.join(self.output_dir, f"{uid}.{fmt}")

        if fmt == "wav":
            scipy.io.wavfile.write(audio_path, rate=sampling_rate, data=pcm)
        else:
            # Encode a real MP3 so the file content matches its extension.
            # ``AudioSegment`` comes from the file-level ``pydub`` import.
            segment = AudioSegment(
                data=pcm.tobytes(),
                sample_width=pcm.dtype.itemsize,
                frame_rate=sampling_rate,
                channels=1,
            )
            # ``export`` hands back an open file handle; close it right away.
            segment.export(audio_path, format="mp3").close()

        # Both writers are synchronous, so the file is complete on return.
        return audio_path

if __name__ == "__main__":
    # Demo run: synthesize one Kyrgyz sentence and report where it was saved.

    ###  EXAMPLE TEXT
    # Three leading spaces are prepended to the sentence before synthesis.
    example_text = "   " + "Кыргыз элинде руханий тарыхы бар. Ал 3000 жыл."

    # Build the synthesizer after the text is ready; constructing it loads
    # the model and creates the output directory.
    output_directory = "media/voices"
    synthesizer = TextToSpeechSynthesizer(output_dir=output_directory)

    # Best-effort demo: any failure is reported as a one-line message
    # instead of a traceback.
    try:
        mp3_file = synthesizer.synthesize(example_text)
        print(f"Generated speech saved to {mp3_file}")
    except Exception as err:
        print(f"Error: {err}")


Some weights of the model checkpoint at facebook/mms-tts-kir were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.wei

Generated speech saved to media/voices\de22c4ab-da89-45d8-8fbc-1c035dfeb50d.mp3
