# 基于FunASR进行推理 

## 可执行命令行：

In [None]:
funasr +model=paraformer-zh +vad_model="fsmn-vad" +punc_model="ct-punc" +input=vad_example.wav
# 注：支持单条音频文件识别，也支持文件列表，列表为kaldi风格wav.scp：wav_id   wav_path

## 实时语音识别 :

In [18]:
import os

import soundfile
from funasr import AutoModel

# Streaming latency configuration passed through to model.generate().
chunk_size = [0, 10, 5]  # [0, 10, 5] -> 600ms, [0, 8, 4] -> 480ms
encoder_chunk_look_back = 4  # number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1  # number of encoder chunks to lookback for decoder cross-attention

model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")

# Read the example clip from the model directory instead of a machine-specific
# absolute path, so the cell runs on any machine that can load the model.
wav_file = os.path.join(model.model_path, "example", "asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)

# Number of samples handed to the model per generate() call.
# One 600ms step would be chunk_size[1] * 960 samples (16 samples per ms, assuming
# 16 kHz audio); this demo feeds chunk_size[1] * 8000 = 80000 samples (5s at 16 kHz).
chunk_stride = chunk_size[1] * 8000

# Streaming state carried across generate() calls.
cache = {}
# Ceiling division: the last chunk may be shorter than chunk_stride, and no trailing
# empty chunk is produced when len(speech) is an exact multiple of chunk_stride.
total_chunk_num = (len(speech) - 1) // chunk_stride + 1
for i in range(total_chunk_num):
    speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    # The last chunk must be flagged so the model flushes its remaining output.
    is_final = i == total_chunk_num - 1
    res = model.generate(
        input=speech_chunk,
        cache=cache,
        is_final=is_final,
        chunk_size=chunk_size,
        encoder_chunk_look_back=encoder_chunk_look_back,
        decoder_chunk_look_back=decoder_chunk_look_back,
    )
    print(res)

funasr version: 1.2.3.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.2.3


rtf_avg: 0.624: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  2.55it/s]                                                                                          


[{'key': 'rand_key_2yW4Acq9GFz6Y', 'text': '欢迎大家来体验达摩院推出的语音识'}]


rtf_avg: 0.829: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  2.22it/s]                                                                                          

[{'key': 'rand_key_1t9EwL56nGisi', 'text': '别模式欢迎大家来体验达摩院推出的语音识别模型'}]





注：chunk_size 为流式延时配置，[0,10,5] 表示上屏实时出字粒度为 10*60=600ms，未来信息为 5*60=300ms。
每次推理输入为 600ms（采样点数为 16000*0.6=9600），
输出为对应文字，最后一个语音片段输入需要设置 is_final=True 来强制输出最后一个字。

## 语音端点检测（实时） 

In [None]:
import soundfile
from funasr import AutoModel

chunk_size = 200  # ms of audio fed to the VAD model per step
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

# Alternative input noted in the original cell: f"{model.model_path}/example/vad_example.wav"
wav_file = "/data/coding/data_download/wav_mp3_datas/VAD_30s_mono.wav"  # mono (single-channel) audio
speech, sample_rate = soundfile.read(wav_file)
# NOTE(review): the audio is passed to the model at its native sample rate with no
# resampling here; fsmn-vad is presumably a 16 kHz model -- confirm sample_rate matches.
chunk_stride = int(chunk_size * sample_rate / 1000)  # samples per step
print(chunk_stride, sample_rate)  # chunk_stride is 9600 when sample_rate is 48000 (3200 at 16000)

# Streaming state carried across generate() calls.
cache = {}
# Ceiling division: the last chunk may be shorter than chunk_stride, and no trailing
# empty chunk is produced when len(speech) is an exact multiple of chunk_stride.
total_chunk_num = (len(speech) - 1) // chunk_stride + 1
for i in range(total_chunk_num):
    speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
    # Only print steps whose result carries a non-empty "value" list.
    if len(res[0]["value"]):
        print(res)

语音端点检测 output 解读：
    [{'key': 'rand_key_wbFOdbpcao8Pu', 'value': [[78530, -1]]}]
    检测到一个语音片段的起始点，起始时间为 78530 毫秒（从音频开头算起的绝对时间），结束点尚未检测到（-1）

## 时间戳预测 

In [4]:
from funasr import AutoModel

# Load the "fa-zh" model at a pinned revision.
model = AutoModel(model="fa-zh", model_revision="v2.0.4")

# Audio clip and its transcript, both read from the model directory's example folder.
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"

# The two inputs are passed as a pair, with a matching pair of data-type tags.
paired_inputs = (wav_file, text_file)
paired_types = ("sound", "text")
res = model.generate(input=paired_inputs, data_type=paired_types)
print(res)

funasr version: 1.2.2.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
New version is available: 1.2.3.
Please use the command "pip install -U funasr" to upgrade.
Downloading Model to directory: /mnt/workspace/.cache/modelscope/iic/speech_timestamp_prediction-v1-16k-offline


2025-01-30 23:29:15,278 - modelscope - INFO - Use user-specified model revision: v2.0.4
rtf_avg: 0.019: 100%|[34m██████████[0m| 1/1 [00:02<00:00,  2.23s/it]                                                                                          

[{'key': 'rand_key_2yW4Acq9GFz6Y', 'text': '欢 迎 大 家 来 到 魔 搭 社 区 进 行 体 验', 'timestamp': [[34670, 34910], [40590, 40830], [44430, 44670], [46890, 47130], [61010, 61250], [65210, 65450], [69390, 69630], [72270, 72510], [76350, 76590], [83390, 83630], [89670, 89910], [92670, 92910], [96390, 96630], [99790, 100030]]}]







更多详细用法（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）


## 微调

详细用法（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）


## 相关论文
https://arxiv.org/abs/2206.08317

论文标题：Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition (2022)

