In [7]:
from transformers import AutoTokenizer
# Quick sanity check: load the slow (use_fast=False) NLLB tokenizer and display its EOS token id.
tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", use_fast=False)

# Last expression of the cell, so the notebook renders the value.
tok.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2

In [8]:
#!pip install -q librosa soundfile transformers
import os

# Both variables are read by TensorFlow's native runtime, so they must be in the
# environment BEFORE `import tensorflow` runs; setting them afterwards is too late
# to affect GPU discovery or the log messages emitted during import.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"   # restrict CUDA device visibility to device index 1
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"   # fewer extra logs

import numpy as np, tensorflow as tf, librosa, soundfile as sf
from IPython.display import Audio, display
from transformers import AutoTokenizer

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# This stays best-effort, but only the documented failure modes are caught
# (a bare `except:` would also swallow KeyboardInterrupt and real bugs), and the
# failure is reported instead of being silently ignored.
for gpu in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except (RuntimeError, ValueError) as err:
        print(f"⚠️ Could not enable memory growth on {gpu.name}: {err}")


In [9]:
from MyTTSModel import TransformerTTS  # the same file in which you wrote the TTS model

# Parameters must be consistent with the ones used for training
SR          = 22050     # sample rate; used as the default for the mel filterbank and for audio playback
N_FFT       = 1024      # default FFT length for the STFT helpers
HOP         = 256       # default STFT frame step
WIN         = 1024      # default STFT frame (window) length
N_MELS      = 80        # number of mel bins; also passed to the model as n_mels
FMIN        = 0.0       # default lower edge of the mel filterbank
FMAX        = 8000.0    # default upper edge of the mel filterbank
PAD_ID      = 1         # padding token id passed to the model
MAX_SRC_LEN = 256       # passed to build_for_load as max_src_len
MAX_MEL_LEN = 2000  # reasonable upper bound for generation

# NLLB tokenizer (English for now)
tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", use_fast=False)
tok.src_lang = "eng_Latn"

# Build / load the model
# NOTE(review): input_vocab_size comes from tok.vocab_size; presumably this must equal the
# value used at training time for the checkpoint to load, and it may not count tokens added
# on top of the base vocabulary — confirm against the training script.
core = TransformerTTS(
    num_layers=6, d_model=256, num_heads=8, dff=1024,
    input_vocab_size=tok.vocab_size, n_mels=N_MELS,
    dropout_rate=0.1, pad_id=PAD_ID
)

# Presumably creates the model variables so that load_weights has something to fill — TODO confirm.
core.build_for_load(max_src_len=MAX_SRC_LEN, max_tgt_len=MAX_MEL_LEN)
# Your trained weights
os.makedirs("checkpoints", exist_ok=True)
core_path = "checkpoints/tts_core_last.weights.h5"

# NOTE(review): when the checkpoint is missing only a warning is printed and the notebook
# keeps going with whatever weights build_for_load left in place (presumably untrained).
if os.path.exists(core_path):
    core.load_weights(core_path)
    print("✅ Weights loaded.")
else:
    print("⚠️ Checkpoint not found:", core_path)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


✅ Weights loaded.


In [10]:
# ---------- Helpers for converting mel_norm in [-1, 1] back to a waveform ----------
def db_to_power(db):
    """Convert decibel values to linear power, i.e. 10 ** (db / 10)."""
    exponent = db / 10.0
    return tf.pow(10.0, exponent)

def denorm_mel(mel_norm):
    """Undo the mel normalisation and return linear power.

    The input is rescaled linearly so that -1 maps to -100 dB and +1 maps to 0 dB,
    then the dB values are converted to power with `db_to_power`.
    """
    unit_scaled = 0.5 * (mel_norm + 1.0)       # [-1, 1] -> [0, 1]
    decibels = unit_scaled * 100.0 - 100.0     # [0, 1]  -> [-100, 0] dB
    power = db_to_power(decibels)              # dB      -> power
    return power

def mel_to_linear_power(mel_power, sr=SR, n_fft=N_FFT, n_mels=N_MELS, fmin=FMIN, fmax=FMAX):
    """Approximately map a mel power spectrogram back to linear-frequency power.

    The mel filterbank is rebuilt with `tf.signal.linear_to_mel_weight_matrix`, its
    pseudo-inverse is applied to `mel_power`, and the result is floored at 1e-10
    (the pseudo-inverse can produce non-positive values).

    Args:
        mel_power: mel power spectrogram; assumed to be (T, n_mels) — TODO confirm with caller.
        sr: sample rate used to build the filterbank.
        n_fft: FFT length; the filterbank has n_fft // 2 + 1 linear bins.
        n_mels: number of mel bins.
        fmin: lower edge of the filterbank.
        fmax: upper edge of the filterbank.

    Returns:
        Tensor with n_fft // 2 + 1 bins on the last axis, every value >= 1e-10.
    """
    n_linear_bins = n_fft // 2 + 1

    # Forward projection linear -> mel, shape (n_linear_bins, n_mels).
    filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=n_mels,
        num_spectrogram_bins=n_linear_bins,
        sample_rate=float(sr),
        lower_edge_hertz=float(fmin),
        upper_edge_hertz=float(fmax),
        dtype=tf.float32,
    )

    # Pseudo-inverse gives the mel -> linear projection, shape (n_mels, n_linear_bins).
    inverse_filterbank = tf.linalg.pinv(filterbank)
    approx_linear = tf.matmul(mel_power, inverse_filterbank)
    return tf.maximum(approx_linear, 1e-10)

def griffin_lim_from_mag(mag, n_fft=N_FFT, hop=HOP, win=WIN, n_iter=60):
    """Estimate a waveform from a magnitude spectrogram with the Griffin-Lim iteration.

    Starting from a random phase, the signal is repeatedly inverted (ISTFT) and
    re-analysed (STFT); after every round the known magnitude is combined with the
    newly estimated phase.

    Args:
        mag: magnitude spectrogram; its last axis must hold n_fft // 2 + 1 bins so it
            matches what the forward STFT below produces.
        n_fft: FFT length, used for BOTH the forward and the inverse transform.
        hop: frame step in samples.
        win: window (frame) length in samples.
        n_iter: number of phase-refinement rounds.

    Returns:
        float32 tensor holding the waveform, divided by its peak absolute value
        (plus 1e-6), with size-1 dimensions squeezed away.
    """
    mag = tf.cast(mag, tf.float32)
    # Loop-invariant: the complex view of the target magnitude is needed every round.
    mag_complex = tf.cast(mag, tf.complex64)

    # inverse_stft needs the window that is complementary to the analysis window for
    # the given hop; reusing the plain Hann window is only correct up to a gain for
    # particular win/hop ratios.
    synthesis_window = tf.signal.inverse_stft_window_fn(
        hop, forward_window_fn=tf.signal.hann_window
    )

    def _stft(w):
        return tf.signal.stft(w, frame_length=win, frame_step=hop,
                              fft_length=n_fft, window_fn=tf.signal.hann_window)

    def _istft(S_):
        # fft_length is passed explicitly: the default is derived from frame_length and
        # would disagree with the forward STFT whenever n_fft is not that derived value.
        return tf.signal.inverse_stft(S_, frame_length=win, frame_step=hop,
                                      fft_length=n_fft, window_fn=synthesis_window)

    def _unit_phasor(angle):
        # e^{j*angle} built from real tensors (no Python complex literal involved).
        return tf.complex(tf.cos(angle), tf.sin(angle))

    # --- random initial phase in [0, 2*pi) ---
    init_phase = tf.random.uniform(tf.shape(mag), 0.0, 2.0 * np.pi, dtype=tf.float32)
    S = mag_complex * _unit_phasor(init_phase)

    for _ in range(n_iter):
        wav = _istft(S)
        S_est = _stft(wav)
        # Keep the estimated phase, re-impose the known magnitude.
        S = mag_complex * _unit_phasor(tf.math.angle(S_est))

    wav = _istft(S)
    wav = wav / (tf.reduce_max(tf.abs(wav)) + 1e-6)                # peak-normalise into [-1, 1]
    return tf.squeeze(wav)


def mel_to_wav_griffinlim(mel_norm, n_iter=60):
    """Turn a normalised mel spectrogram into a waveform.

    Pipeline: `denorm_mel` -> `mel_to_linear_power` -> square root (power to
    magnitude, floored at 1e-10) -> `griffin_lim_from_mag` with `n_iter` rounds.
    """
    power_linear = mel_to_linear_power(denorm_mel(mel_norm))
    magnitude = tf.sqrt(tf.maximum(power_linear, 1e-10))
    return griffin_lim_from_mag(magnitude, n_iter=n_iter)

# ---------- Model and tokenizer ----------
# Assumption: `core` is your TransformerTTS, already built and with its weights loaded.
# core.load_weights("checkpoints/tts_core_best.weights.h5")

# `tok` was already created by the model-setup cell (the one that defines SR, N_FFT, ...,
# which this cell needs anyway), so it is reused here rather than instantiating the same
# NLLB tokenizer yet again. Only the source language is (re)asserted; this is idempotent.
if hasattr(tok, "src_lang"): tok.src_lang = "eng_Latn"

def tokenize_texts(texts, tok, max_len=256, pad_id=1):
    """Encode texts into a right-padded id matrix plus the unpadded lengths.

    Truncation is delegated to the tokenizer (`truncation=True, max_length=max_len`)
    instead of slicing the encoded list afterwards: slicing cuts off whatever special
    tokens the tokenizer appended at the end (e.g. EOS) on over-long inputs, while
    tokenizer-level truncation shortens the text part and keeps the special tokens.
    Inputs that already fit within `max_len` are encoded exactly as before.

    Args:
        texts: iterable of strings.
        tok: a Hugging Face tokenizer exposing `encode`.
        max_len: maximum (and padded) sequence length.
        pad_id: id used to fill positions after the end of each sequence.

    Returns:
        (ids, lens): int32 tensor of shape (len(texts), max_len) and int32 tensor of
        the per-text lengths before padding.
    """
    ids = [
        tok.encode(t, add_special_tokens=True, truncation=True, max_length=max_len)
        for t in texts
    ]
    lens = [len(x) for x in ids]
    ids_pad = tf.keras.preprocessing.sequence.pad_sequences(
        ids, maxlen=max_len, padding="post", value=pad_id
    )
    return tf.constant(ids_pad, tf.int32), tf.constant(lens, tf.int32)



# (Optional) save the WAV to disk
# tf.io.write_file("tts_out.wav", tf.audio.encode_wav(tf.expand_dims(wav, -1), sample_rate=SR))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# ---------- Inference ----------
texts = ["Please use our dedicated channels for questions and discussion"]
# Use the notebook-level constants rather than repeating 256 / 1 here, so the encoder
# input always agrees with the limits the model was built with.
enc_ids, _ = tokenize_texts(texts, tok, max_len=MAX_SRC_LEN, pad_id=PAD_ID)

# Per the original author's note, the generate method returns the PostNet output (mel_hat);
# the second return value holds the stop probabilities.
mel_hat, stop_probs = core.greedy_generate_fast(
    enc_ids,
    max_steps=600,      # for testing; increase later
    min_steps=40,
    stop_threshold=0.55,
    window=6,
    patience=2,
    check_stop_every=10,
    verbose=True
)
mel_one = mel_hat[0].numpy()                     # first (only) item of the batch
wav = mel_to_wav_griffinlim(mel_one, n_iter=32)  # previously 64 or 60
Audio(wav.numpy(), rate=SR)



gen step 50 stop~ 9.24928546e-19
gen step 100 stop~ 9.23079036e-17
gen step 150 stop~ 1.37322961e-15
gen step 200 stop~ 8.11241231e-15
gen step 250 stop~ 3.04225351e-15
gen step 300 stop~ 7.06005324e-11
gen step 350 stop~ 3.74647049e-11
gen step 400 stop~ 3.59980129e-10
gen step 450 stop~ 4.53999893e-10
gen step 500 stop~ 2.69196443e-10
gen step 550 stop~ 4.91966967e-10
gen step 600 stop~ 3.43942569e-10
