# Machine Transcription and Translation

AdvancedCI.

For technical assistance contact beining@chineseaci.com .

In [None]:
#@title GPU Check
!nvidia-smi

Sat Feb  3 23:39:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Step 1: Install and Setup

Execute all steps.

In [None]:
#@title 1.1 Install

import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    The parameter mirrors the standard-library signature so callers that pass
    ``do_setlocale`` (for example ``locale.getpreferredencoding(False)``) keep
    working; a zero-argument lambda would raise ``TypeError`` for them.

    :param do_setlocale: Accepted for signature compatibility; ignored.
    :return: Always the string "UTF-8".
    """
    return "UTF-8"


# Force the reported preferred encoding to UTF-8 for the rest of the session.
locale.getpreferredencoding = _utf8_preferred_encoding

# Speech-to-text engine used by the "Load Model" and "Transcribe" cells.
!pip install faster-whisper
# SRT composition, HTTP client, progress bars and the translation helpers used in Step 6.
# Only googletrans is version-pinned here.
!pip install srt requests tqdm googletrans==4.0.0rc1 httpx aiometer
# https://stackoverflow.com/a/77671445
# NOTE(review): presumably supplies the cuBLAS 11 runtime needed for GPU inference
# (see the linked answer) - confirm it is still required on the current Colab image.
!apt install libcublas11

Collecting faster-whisper
  Downloading faster-whisper-0.10.0.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting av==10.* (from faster-whisper)
  Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/31.0 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctranslate2<4,>=3.22 (from faster-whisper)
  Downloading ctranslate2-3.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.8/36.8 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### 1.2 Config

### Whisper
- `device`: `cuda` or `cpu`. Whether to use GPU.
- `model_size`: Name of model. `distil` models are faster with lower quality.
- `compute_type`: `float16` is FP16 by default; `int8_float16` is INT8 on GPU; `int8` is INT8 on CPU
- `beam_size`: Whisper was trained with this - do not change unless you know what you are doing

### Silero VAD
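- `vad_filter`: Whether to use VAD. Recommended to reduce false positives.
- `threshold`: Speech probability threshold; audio chunks scoring above it are treated as speech. Higher = stricter.
- `min_speech_duration_ms`: Minimum duration of a speech chunk, in milliseconds; shorter chunks are discarded.
- `max_speech_duration_s`: Maximum duration of a single speech chunk, in seconds. Reduced from infinite to 12s.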
- `min_silence_duration_ms`: In the end of each speech chunk wait for this before separating it
- `window_size_samples`: Do not change unless you know what you are doing.
- `speech_pad_ms`: Add this to the beginning and end of VAD chunk to reduce false negative.

### SRT Generation

_This setup is very much ACICFG opinionated._

The following combination of settings should achieve:

1. Any single line of subtitle should not become too long to show in a single line per default font and size setup; AND,
2. Any single line of subtitle should be long enough to give viewers enough time to recognize.

- `max_text_len`: Maximum characters per line to avoid out of vision. Best-effort basis. See `max_segment_interval`. Address point 1.
- `max_segment_interval`: Consider the next chunk of sentence if the length of current line is less than this amount of time. Address point 2.


In [None]:
#@title Settings

# All values below are Colab form fields (`#@param`); later cells read them as
# plain module-level variables, so this cell must run before any of them.

# Whisper
# device / model_size / compute_type are passed to WhisperModel(...) in "1.3 Load Model";
# beam_size is passed to model.transcribe(...) in "2.2 Transcribe!".
device = "cuda" #@param ["cuda", "cpu"]
model_size = 'large-v3' #@param ["large-v3", "distil-large-v2", "distil-medium.en"]
compute_type = "float16" #@param ["float16", "int8_float16", "int8"]
beam_size = 5 #@param {type:"integer"}
whisper_debug = True #@param {type: "boolean"}
# Silero VAD
# vad_filter toggles VAD; the remaining values are forwarded as the
# `vad_parameters` dict of model.transcribe(...).
vad_filter = True #@param {type:"boolean"}
threshold = 0.5 #@param {type:"number"}
min_speech_duration_ms = 250 #@param {type:"integer"}
max_speech_duration_s = 12 #@param {type:"number"}
min_silence_duration_ms = 2000  #@param {type:"integer"}
window_size_samples = 1024 #@param [512, 1024, 1536]
speech_pad_ms = 400 #@param {type:"integer"}
# SRT Generation
# max_text_len / max_segment_interval are passed to sentence_segments_merger(...)
# in "2.3 Generate SRT". use_whisper_sentence_segment is only referenced by
# commented-out code in that cell.
use_whisper_sentence_segment = False #@param {type: "boolean"}
max_text_len = 110 #@param {type:"integer"}
max_segment_interval = 1.5 #@param {type:"number"}
# Retired settings, kept for reference only.
# transcription_cutoff_char = 80 #@param {type:"integer"}
# align_extend = 2 #@param {type:"integer"}
# align_from_prev = True #@param {type:"boolean"}




In [36]:
#@title 1.3 Load Model
from faster_whisper import WhisperModel

import logging

# Attach a default handler so library log records show up in the cell output.
logging.basicConfig()
# Honor the `whisper_debug` toggle from the Settings cell: DEBUG prints the
# per-segment progress lines, INFO keeps only the duration / VAD summary.
logging.getLogger("faster_whisper").setLevel(
    logging.DEBUG if whisper_debug else logging.INFO
)

# Build the Whisper model once; the transcription cell reuses this `model` object.
model = WhisperModel(model_size, device=device, compute_type=compute_type)

## Step 2: Transcribe and Alignment

In [38]:
#@title 2.1 Setup filename

# Path of the audio file handed to model.transcribe(...) in cell 2.2.
filename = "output.mp3" #@param {type:"string"}
# Path the generated subtitle file is written to in cell 2.3.
transcribed_srt_name = 'transcribed.srt' #@param {type:"string"}

In [None]:
#@title 2.2 Transcribe! Speed: ~10x

# Transcribe `filename` with word-level timestamps; the Silero VAD options come
# straight from the Settings cell.
segments, info = model.transcribe(
    filename,
    beam_size=beam_size,
    word_timestamps=True,
    vad_filter=vad_filter,
    vad_parameters=dict(
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        max_speech_duration_s=max_speech_duration_s,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
    ),
)
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
# `segments` is a generator: materialise it so the transcription actually runs
# here and later cells can iterate the results more than once.
segments = list(segments)


INFO:faster_whisper:Processing audio with duration 44:35.200
INFO:faster_whisper:VAD filter removed 07:52.320 of audio
DEBUG:faster_whisper:VAD filter kept the following audio segments: [00:02.160 -> 00:12.928], [00:12.928 -> 00:22.736], [00:23.216 -> 00:25.680], [00:26.992 -> 00:37.344], [00:37.344 -> 00:47.648], [00:47.648 -> 00:54.480], [01:20.560 -> 01:27.184], [01:29.648 -> 01:37.680], [01:38.416 -> 01:49.696], [01:49.696 -> 01:58.864], [02:00.112 -> 02:09.744], [02:10.608 -> 02:17.424], [02:18.928 -> 02:26.000], [02:26.160 -> 02:36.480], [02:36.480 -> 02:47.904], [02:47.904 -> 02:59.280], [02:59.696 -> 03:10.592], [03:10.592 -> 03:14.256], [03:17.296 -> 03:26.928], [03:29.968 -> 03:36.976], [03:38.352 -> 03:46.064], [03:47.312 -> 03:57.760], [03:57.760 -> 03:59.312], [04:00.752 -> 04:07.184], [04:08.112 -> 04:17.504], [04:17.504 -> 04:24.912], [04:27.248 -> 04:37.584], [04:39.152 -> 04:50.592], [04:50.592 -> 05:01.776], [05:02.128 -> 05:03.440], [05:05.840 -> 05:15.984], [05:17.3

Detected language 'en' with probability 0.998535


DEBUG:faster_whisper:Processing segment at 00:26.640
DEBUG:faster_whisper:Processing segment at 00:56.640
DEBUG:faster_whisper:Processing segment at 01:26.620
DEBUG:faster_whisper:Processing segment at 01:56.620
DEBUG:faster_whisper:Processing segment at 02:26.620
DEBUG:faster_whisper:Processing segment at 02:56.620
DEBUG:faster_whisper:Processing segment at 03:26.620
DEBUG:faster_whisper:Processing segment at 03:56.620
DEBUG:faster_whisper:Processing segment at 04:26.620
DEBUG:faster_whisper:Processing segment at 04:56.600
DEBUG:faster_whisper:Processing segment at 05:25.560
DEBUG:faster_whisper:Processing segment at 05:55.380
DEBUG:faster_whisper:Processing segment at 06:25.380
DEBUG:faster_whisper:Processing segment at 06:51.540
DEBUG:faster_whisper:Processing segment at 07:18.800
DEBUG:faster_whisper:Processing segment at 07:48.780
DEBUG:faster_whisper:Processing segment at 08:16.780
DEBUG:faster_whisper:Processing segment at 08:45.940
DEBUG:faster_whisper:Processing segment at 09:

In [None]:
#@title 2.3 Generate SRT


import copy
import srt
from datetime import timedelta

def sentence_segments_merger(segments, max_text_len=80, max_segment_interval=2.0):
    """
    Merge consecutive segments into subtitle lines.

    A segment is appended to the line being built when BOTH hold:
    the gap between the line's end and the segment's start is below
    ``max_segment_interval``, and the joined text stays below ``max_text_len``.
    Otherwise the line being built is emitted and the segment starts a new line.
    Segments whose text is empty after stripping are ignored. The input dicts
    are not modified.

    :param segments: [{"text": "Hello,", "start": 1.1, "end": 2.2}, {"text": "World!", "start": 2.3, "end": 4.4}]
    :type segments: list of dicts
    :param max_text_len: Merged text must be strictly shorter than this many characters
    :type max_text_len: int
    :param max_segment_interval: Gap (same unit as "start"/"end") must be strictly below this to merge
    :type max_segment_interval: float
    :return: Segments, but with merged sentences.
    :rtype: list of dicts  [{"text": "Hello, World!", "start": 1.1, "end": 4.4}]
    """
    if not segments:
        return []

    merged_segments = []
    current_segment = None  # line currently being built, or None before the first one

    for segment in segments:
        # remove empty lines
        segment_text = segment["text"].strip()
        if not segment_text:
            continue

        if current_segment is not None:
            candidate_text = current_segment["text"] + " " + segment_text
            close_enough = segment["start"] - current_segment["end"] < max_segment_interval
            if close_enough and len(candidate_text) < max_text_len:
                current_segment["text"] = candidate_text
                current_segment["end"] = segment["end"]
                continue
            # Cannot merge: emit the finished line. The segment that caused the
            # split is NOT discarded - it seeds the next line just below.
            merged_segments.append(current_segment)

        current_segment = {
            "text": segment_text,
            "start": segment["start"],
            "end": segment["end"],
        }

    # Flush the line still being built, otherwise the tail of the transcript is lost.
    if current_segment is not None:
        merged_segments.append(current_segment)

    return merged_segments


# Flatten Whisper's per-segment word lists into one list of word-level dicts,
# skipping words that are empty after stripping. `words` is guarded with
# `or []` in case a segment carries no word list.
segments_lst = [
    {"text": word.word.strip(), "start": word.start, "end": word.end}
    for segment in segments
    for word in (segment.words or [])
    if word.word.strip()
]

# Regroup the words into subtitle-sized lines using the SRT settings.
result_merged = sentence_segments_merger(segments_lst,
                                         max_text_len=max_text_len,
                                         max_segment_interval=max_segment_interval)

result_srt_list = []

# Alternative path (disabled): use Whisper's own sentence segmentation.
# if use_whisper_sentence_segment:
#     for i, v in enumerate(segments):
#         result_srt_list.append(srt.Subtitle(index=i,
#                                         start=timedelta(seconds=v.start),
#                                         end=timedelta(seconds=v.end),
#                                         content=v.text.strip()))
# else:
for i, v in enumerate(result_merged):
    result_srt_list.append(srt.Subtitle(index=i,
                                        start=timedelta(seconds=v['start']),
                                        end=timedelta(seconds=v['end']),
                                        content=v['text'].strip()))

composed_transcription = srt.compose(result_srt_list)

# Write explicitly as UTF-8 so non-ASCII transcripts do not depend on the
# runtime's default text encoding.
with open(transcribed_srt_name, 'w', encoding='utf-8') as f:
    f.write(composed_transcription)

You should now see an SRT file with the name you chose in the file browser: right-click it and download the file.

In [None]:
#@title 2.4 Optional: Peek the SRT file
print(composed_transcription)

## Step 6: Translate

In [None]:
#@title 6.1 Import packages
import requests
import random
from hashlib import md5
from tqdm.notebook import tqdm
from tqdm.contrib.concurrent import process_map  # or thread_map
from googletrans import Translator
from joblib import Parallel, delayed

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)


def translate_via_googletrans(content):
    """Translate ``content`` with the shared googletrans ``Translator``.

    Reads the module-level ``google_translator``, ``source_lang`` and
    ``target_lang`` set in the settings cell; language codes are lower-cased
    before being sent.

    :param content: Text to translate.
    :type content: str
    :return: Translated text, or '' if the request failed (the input and the
             error are printed in that case).
    :rtype: str
    """
    try:
        # The Translator instance itself is not callable; the request goes
        # through its `translate` method.
        resp = google_translator.translate(content, src=source_lang.lower(), dest=target_lang.lower()).text
    except Exception as e:
        # Best effort: report and return an empty translation instead of aborting the batch.
        print(content)
        print(e)
        return ''

    return resp


# def translate_via_gmx(content):
#     try:
#         resp = s.get('https://search.gmx.com/translate', params={"q": content, "source": source_lang.lower(), "target": target_lang.lower(), "lang": 'en', "reload": "true"}).json()
#     except Exception as e:
#         print(content)
#         print(e)
#         return ''

#     return resp['Translation']


def translate_via_deepl_backup(content):
    """Translate ``content`` through the DeepL relay endpoint.

    Uses the module-level ``requests`` session ``s`` and the ``source_lang`` /
    ``target_lang`` settings (sent unchanged).

    :param content: Text to translate.
    :type content: str
    :return: Translated text, or '' when the request fails or the response
             does not contain a translation (details are printed).
    :rtype: str
    """
    resp = None  # stays None if the request or JSON decoding itself fails
    try:
        resp = s.post('https://deepl.cnbeining.com/translate', json={"text": content, "source_lang": source_lang, "target_lang": target_lang}).json()
        # Extract inside the try so a malformed/error payload is handled like
        # any other failure instead of raising out of the worker.
        return resp['result']['texts'][0]['text']
    except Exception as e:
        print(content)
        print(e)
        # Only inspect the payload if one was actually received.
        if isinstance(resp, dict) and resp.get('code', 200) != 200:
            print('Error calling API: ')
            print(resp)
        return ''


def translate_via_baidu(content):
    """Translate ``content`` with the Baidu Fanyi HTTP API.

    Credentials come from the ``baidu_app_id`` / ``baidu_secret_key`` notebook
    settings rather than values baked into this function. Also reads the
    module-level session ``s`` and ``source_lang`` / ``target_lang``
    (lower-cased before sending).

    :param content: Text to translate; may contain several lines.
    :type content: str
    :return: Translated text (one line per returned result entry, joined with
             a newline), or '' on failure (details are printed).
    :rtype: str
    """
    app_id = baidu_app_id
    secret_key = baidu_secret_key
    salt = random.randint(32768, 65536)
    # Request signature: md5(appid + query + salt + secret).
    sign = md5((app_id + content + str(salt) + secret_key).encode('utf-8')).hexdigest()
    payload = {'appid': app_id, 'q': content, 'from': source_lang.lower(), 'to': target_lang.lower(), 'salt': salt, 'sign': sign}
    try:
        resp = s.post('http://api.fanyi.baidu.com/api/trans/vip/translate', params=payload).json()
        if 'trans_result' not in resp:
            # Error payloads carry no translation; show them instead of raising KeyError.
            print(content)
            print('Error calling API: ')
            print(resp)
            return ''
        # Keep every returned entry, not just the first, so multi-line input
        # is not truncated.
        return '\n'.join(item['dst'] for item in resp['trans_result'])
    except Exception as e:
        print(content)
        print(e)
        return ''

translation_function = translate_via_deepl_backup



In [37]:
#@title 6.2 Setup Variables: Thread Number, Source Language, Target Language

# Result accumulators populated by the translation cell (6.3).
result_list_translated = []
result_list_assembled = []
# Shared HTTP session used by the deepl_backup and baidu translators.
s = requests.Session()
# googletrans client used by translate_via_googletrans.
google_translator = Translator()


# chunk_size: number of subtitle lines joined into one translation request.
# thread_num: passed as n_jobs to joblib.Parallel in cell 6.3.
chunk_size = 8 #@param {type:"integer"}
thread_num = 12 #@param {type:"integer"}
# Language codes; the google and baidu translators lower-case them before use,
# the deepl_backup translator sends them unchanged.
source_lang = "RU" #@param ["auto", "BG", "CS", "DA", "DE", "EL", "EN", "EN-GB", "EN-US", "ES", "ET", "FI", "FR", "HU", "ID", "IT", "JA", "LT", "LV", "NL", "PL", "PT", "PT-BR", "PT-PT", "RO", "RU", "SK", "SL", "SV", "TR", "UK", "ZH"]
target_lang = "RU" #@param ["BG", "CS", "DA", "DE", "EL", "EN", "EN-GB", "EN-US", "ES", "ET", "FI", "FR", "HU", "ID", "IT", "JA", "LT", "LV", "NL", "PL", "PT", "PT-BR", "PT-PT", "RO", "RU", "SK", "SL", "SV", "TR", "UK", "ZH"]
# Selects which translate_via_* function cell 6.3 dispatches to.
translation_engine = "deepl_backup" #@param ["py-googletrans", "deepl_backup", "baidu-api"]

# NOTE(review): these look like real credentials committed as form defaults -
# consider rotating them and loading the values from a secret store or
# environment variable instead of shipping them in the notebook.
baidu_app_id = '20221011001385250' #@param {type:"string"}
baidu_secret_key = 'J1qY4VXuCF9QOeumC_R4' #@param {type:"string"}

# When True, cell 6.3 strips/replaces selected CJK punctuation in the composed SRT text.
remove_special_chars_acicfg = True #@param {type:"boolean"}

# Output path, and whether the file holds translation + original (True) or translation only.
translated_result_filename = 'translated.srt' #@param {type:"string"}
is_generate_assembled_srt = True #@param {type:"boolean"}


# Hacking Google results
# NOTE(review): presumably googletrans expects "zh-cn" rather than "ZH" for
# Chinese - confirm against the googletrans language table.
if target_lang == "ZH" and translation_engine == "py-googletrans":
    target_lang = "zh-cn"


- `thread_num`: Number of threads. Setting this too high may cause throttling.
- `source_lang`, `target_lang`: Language code, See [ISO_639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
- `translation_engine`:
  - `deepl_gmx`: Powered by GMX and DeepL. Data governance: Germany
  - `py-googletrans`: Powered by unofficial Google Translate AJAX API. Data governance: US
  - `deepl_backup`: Powered by ACICFG with DeepL. Data governance: Canada and Germany although no log is kept on ACICFG's server.
  - `baidu-api`: Powered by [Baidu Fanyi](http://api.fanyi.baidu.com/). Data governance: Mainland China
- `baidu_app_id` and `baidu_secret_key` are optional - only required when you use `baidu-api`.
- `remove_special_chars_acicfg`: Remove special chars per ACICFG's standard.
- `translated_result_filename`: Filename of SRT to generate.
- `is_generate_assembled_srt`: Generated SRT that has 2 lines - translation then original.

In [None]:
#@title 6.3 Translation: ~1.2 x thread number lines/sec when single threaded

# if translation_engine == "deepl_gmx":
#     translation_function = translate_via_gmx
# Pick the backend chosen in the settings cell. An unrecognised value keeps
# whatever `translation_function` is already bound to.
if translation_engine == "deepl_backup":
    translation_function = translate_via_deepl_backup
elif translation_engine == "py-googletrans":
    translation_function = translate_via_googletrans
elif translation_engine == "baidu-api":
    translation_function = translate_via_baidu


source_texts = [line['text'].strip() for line in result_merged]
# preprocess source texts: group lines into chunks and join each chunk with a
# separator so that one API call translates several subtitle lines.
source_text_chunks = list(chunks(source_texts, int(chunk_size)))
source_text_chunks_merged = ['\n------\n'.join(chunk) for chunk in source_text_chunks]

# Reset BOTH accumulators so re-running this cell does not stack new results
# on top of (and keep serving) the previous run's entries.
result_list_translated = []
result_list_assembled = []
result_api_call = Parallel(n_jobs=thread_num, verbose=10)(delayed(translation_function)(chunk) for chunk in source_text_chunks_merged)

for translated_blob, source_chunk, original_text in zip(result_api_call, source_text_chunks, source_text_chunks_merged):
    translated_lines = [i.strip() for i in translated_blob.split('------')] # in case the translator messes up the line breaks
    # Compare against this chunk's own size: the last chunk is usually shorter
    # than `chunk_size`.
    expected_count = len(source_chunk)
    if len(translated_lines) != expected_count:
        print(translated_lines)
        print(original_text)
        if len(translated_lines) > expected_count:
            # Too many pieces: fold the surplus into the chunk's last line so
            # no text is lost and later chunks stay aligned.
            surplus = ' '.join(piece for piece in translated_lines[expected_count - 1:] if piece)
            translated_lines = translated_lines[:expected_count - 1] + [surplus]
        else:
            # Too few pieces (e.g. a failed request returned ''): pad with
            # empty translations to keep a 1:1 mapping with the source lines.
            translated_lines = translated_lines + [''] * (expected_count - len(translated_lines))
    result_list_translated.extend(translated_lines)

print(f"Translated {len(result_list_translated)} lines.")

# Assemble SRT: translation on the first line, original text on the second.
for i, j in zip(source_texts, result_list_translated):
    result_list_assembled.append(f"{j}\n{i}")

result_srt_list_translated = []

for i, v in enumerate(result_merged):
    result_srt_list_translated.append(srt.Subtitle(index=i, start=timedelta(seconds=v['start']), end=timedelta(seconds=v['end']), content=result_list_translated[i]))

result_srt_list_assembled = []

for i, v in enumerate(result_merged):
    result_srt_list_assembled.append(srt.Subtitle(index=i, start=timedelta(seconds=v['start']), end=timedelta(seconds=v['end']), content=result_list_assembled[i]))

composed_transcription_translated = srt.compose(result_srt_list_translated)
composed_transcription_assembled = srt.compose(result_srt_list_assembled)


# remove special chars
if remove_special_chars_acicfg:
    composed_transcription_translated = composed_transcription_translated.replace("。", "").replace("，", " ").replace("、", " ")
    composed_transcription_assembled = composed_transcription_assembled.replace("。", "").replace("，", " ").replace("、", " ")


# Write SRT (explicit UTF-8: the translated text is routinely non-ASCII)
with open(translated_result_filename, 'w', encoding='utf-8') as f:
    if is_generate_assembled_srt:
        f.write(composed_transcription_assembled)
    else:
        f.write(composed_transcription_translated)

In [None]:
#@title Optional: Execute the cell below to peek at the assembled results.
print(composed_transcription_assembled)

In [None]:
#@title Optional: xterm

import locale
def getpreferredencoding(do_setlocale = True):
    """Replacement for locale.getpreferredencoding that always returns "UTF-8".

    `do_setlocale` is accepted for signature compatibility and ignored.
    """
    return "UTF-8"
# Override the stdlib function for the rest of the session.
# NOTE(review): presumably a workaround so the shell command below runs when
# the runtime reports a non-UTF-8 locale - confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding
!pip install colab-xterm

# Load the colab-xterm extension and open a terminal in the cell output.
%load_ext colabxterm
%xterm

## Recycle Bin

For developer only.

In [None]:
#@title Unused: Single Threaded version


# Unused reference implementation: translate one line per request, sequentially.
# NOTE(review): `result` is not defined in any cell visible in this notebook;
# it must be supplied before this cell can run.
with tqdm(total=len(result['segments'])) as pbar:
    for line in result['segments']:
        content = line['text'].strip()
        resp = None  # stays None when the request itself fails
        try:
            resp = s.post('https://deepl.cnbeining.com/translate', json={"text": content, "source_lang": "auto", "target_lang": "ZH"}).json()
            translated = resp['data']
        except Exception as e:
            print(line)
            print(e)
            # Only inspect the payload if one was actually received.
            if isinstance(resp, dict) and resp.get('code') != 200:
                print('Error calling API: ' + str(resp.get('msg')))
            # Fall back to the untranslated text so both lists stay aligned
            # with the source lines.
            result_list_translated.append(content)
            result_list_assembled.append(content)
        else:
            # Append exactly once per line (translation, then "translation\noriginal").
            result_list_translated.append(translated)
            result_list_assembled.append(f"{translated}\n{content}")
        finally:
            # Advance the bar for failed lines too, so it reaches 100%.
            pbar.update(1)


In [None]:
#@title 2.1 Import helpers


def word_segment_to_sentence(segments):
    """
    Convert word segments to sentences.

    Words are joined with single spaces; a sentence is closed when a word
    (after stripping whitespace) ends with '.', '!' or '?'. Words that are
    empty after stripping are skipped. Words left over after the last
    sentence-ending symbol are returned as a final, unterminated sentence.

    :param segments: [{"text": "Hello,", "start": 1.1, "end": 2.2}, {"text": "World!", "start": 3.3, "end": 4.4}]
    :type segments: list of dicts
    :return: Segments, but with sentences instead of words.
    :rtype: list of dicts  [{"text": "Hello, World!", "start": 1.1, "end": 4.4}]
    """
    end_of_sentence_symbols = ('.', '!', '?')
    sentence_results = []

    words = []          # stripped words of the sentence being built
    sentence_start = 0
    sentence_end = 0

    for segment in segments:
        word = segment["text"].strip()
        if not word:
            # Nothing to add; also avoids indexing into an empty string.
            continue
        if not words:
            sentence_start = segment["start"]
        words.append(word)
        sentence_end = segment["end"]
        # Test the stripped word so trailing whitespace cannot hide the symbol.
        if word.endswith(end_of_sentence_symbols):
            sentence_results.append({"text": ' '.join(words), "start": sentence_start, "end": sentence_end})
            words = []

    # Keep trailing words that never reached a sentence-ending symbol.
    if words:
        sentence_results.append({"text": ' '.join(words), "start": sentence_start, "end": sentence_end})

    return sentence_results


