In [1]:
%%writefile ./train_lm.yaml

# Language-model architecture options.
lm_conf:
    nlayers: 2    # number of stacked RNN layers
    unit: 650     # embedding / hidden size of each layer

# Tokenization: BPE model and the matching token list.
token_type: bpe
bpemodel: data/bpemodel/bpe.model  # path to BPE model if using BPE
token_list: data/bpemodel/tokens.txt  # token list file that goes with the BPE model

# Optimisation and batching.
optim: sgd        # or adam
batch_type: sorted  # NOTE(review): presumably batches are built from length-sorted utterances via the shape files -- confirm
batch_size: 16    # batch size in LM training
max_epoch: 20     # if the data size is large, we can reduce this
patience: 3       # NOTE(review): presumably early-stopping patience in epochs -- confirm

# Checkpoint selection criterion, given as the triple (valid, loss, min).
best_model_criterion:
-   - valid
    - loss
    - min

keep_nbest_models: 1  # NOTE(review): presumably how many best checkpoints are retained -- confirm
use_matplotlib: false
use_tensorboard: false

Overwriting ./train_lm.yaml


In [2]:
from espnet2.text.build_tokenizer import build_tokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from pathlib import Path

def create_lm_shape_files(
    text_file: str,
    shape_file: str,
    *,
    bpemodel: str = "data/bpemodel/bpe.model",
    tokenizer=None,
):
    """Create a shape file for language model training in ESPnet2.

    Every non-blank line of ``text_file`` is expected to look like
    ``<utt_id> <text>``. For each such line, one line
    ``<utt_id> <n_tokens>`` is written to ``shape_file``, where ``n_tokens``
    is the number of tokens the tokenizer produces for ``<text>``.

    Blank lines are skipped. A line holding only an utterance id is treated
    as an utterance with empty text instead of raising ``ValueError``.

    Args:
        text_file: Input text file (utt_id + text), read as UTF-8.
        shape_file: Output shape file path, written as UTF-8. Missing parent
            directories are created.
        bpemodel: Path to the BPE model used to build the tokenizer when
            ``tokenizer`` is not supplied.
        tokenizer: Optional pre-built tokenizer; any object exposing
            ``text2tokens(text)`` that returns a sized sequence. Passing one
            lets several calls (e.g. train and valid) share a single loaded
            model. When ``None``, a BPE tokenizer is built from ``bpemodel``.
    """
    if tokenizer is None:
        # Build tokenizer (ESPnet2 style)
        tokenizer = build_tokenizer(
            token_type="bpe",
            bpemodel=bpemodel,
            non_linguistic_symbols=None,
            delimiter=None,
        )

    out_path = Path(shape_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 so the result does not depend on the machine's locale.
    with open(text_file, encoding="utf-8") as fin, \
            open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            fields = line.strip().split(maxsplit=1)
            if not fields:
                # Blank line: nothing to describe.
                continue
            utt_id = fields[0]
            # An id with no transcript counts as empty text.
            text = fields[1] if len(fields) > 1 else ""
            tokens = tokenizer.text2tokens(text)
            fout.write(f"{utt_id} {len(tokens)}\n")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [3]:
# (input text, output shape file) for each data split, processed in order.
shape_jobs = (
    ("dump/train/text", "dump/train/text_shape"),
    ("dump/valid/text", "dump/valid/text_shape"),
)

for text_path, shape_path in shape_jobs:
    create_lm_shape_files(text_file=text_path, shape_file=shape_path)

In [1]:
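# Optional cleanup: uncomment the next line to delete earlier LM experiment
# outputs under exp/lm before retraining (this is destructive).
# !rm -r exp/lm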

In [4]:
from espnet2.tasks.lm import LMTask

# Flag/value pairs for the LM training run: the config file, the train and
# valid text data with their shape files, the output directory and GPU count.
lm_train_options = (
    ("--config", "train_lm.yaml"),
    ("--train_data_path_and_name_and_type", "dump/train/text,text,text"),
    ("--valid_data_path_and_name_and_type", "dump/valid/text,text,text"),
    ("--train_shape_file", "dump/train/text_shape"),
    ("--valid_shape_file", "dump/valid/text_shape"),
    ("--output_dir", "exp/lm/cy/rnn"),
    ("--ngpu", "1"),
)

# Flatten the pairs into the flat argv-style list that LMTask.main receives.
lm_train_cmd = [token for pair in lm_train_options for token in pair]

LMTask.main(cmd=lm_train_cmd)

Failed to import Flash Attention, using ESPnet default: No module named 'flash_attn'


/opt/conda/envs/espnet/bin/python /opt/conda/envs/espnet/lib/python3.10/site-packages/ipykernel_launcher.py -f /home/jovyan/.local/share/jupyter/runtime/kernel-6f82f57f-22f0-4d61-9c6c-3aa0064d9d21.json
[jupyter-wpc0385] 2025-07-04 13:01:19,472 (lm:199) INFO: Vocabulary size: 1000
[jupyter-wpc0385] 2025-07-04 13:01:20,069 (abs_task:1383) INFO: pytorch.version=2.5.1, cuda.available=True, cudnn.version=90100, cudnn.benchmark=False, cudnn.deterministic=True
[jupyter-wpc0385] 2025-07-04 13:01:20,070 (abs_task:1384) INFO: Model structure:
ESPnetLanguageModel(
  (lm): SequentialRNNLM(
    (drop): Dropout(p=0.0, inplace=False)
    (encoder): Embedding(1000, 650, padding_idx=0)
    (rnn): LSTM(650, 650, num_layers=2, batch_first=True)
    (decoder): Linear(in_features=650, out_features=1000, bias=True)
  )
)

Model summary:
    Class Name: ESPnetLanguageModel
    Total Number of model parameters: 8.07 M
    Number of trainable parameters: 8.07 M (100.0%)
    Size: 32.29 MB
    Type: torch.float