In [26]:
from faster_whisper import WhisperModel
import os
import csv
import pandas as pd
from googletrans import Translator
from copy import deepcopy
from utils import remove_file_or_dir

### List of whisper models and support languages

In [27]:
whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
source_languages = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ko": "Korean",
    "fr": "French",
    "ja": "Japanese",
    "pt": "Portuguese",
    "tr": "Turkish",
    "pl": "Polish",
    "ca": "Catalan",
    "nl": "Dutch",
    "ar": "Arabic",
    "sv": "Swedish",
    "it": "Italian",
    "id": "Indonesian",
    "hi": "Hindi",
    "fi": "Finnish",
    "vi": "Vietnamese",
    "he": "Hebrew",
    "uk": "Ukrainian",
    "el": "Greek",
    "ms": "Malay",
    "cs": "Czech",
    "ro": "Romanian",
    "da": "Danish",
    "hu": "Hungarian",
    "ta": "Tamil",
    "no": "Norwegian",
    "th": "Thai",
    "ur": "Urdu",
    "hr": "Croatian",
    "bg": "Bulgarian",
    "lt": "Lithuanian",
    "la": "Latin",
    "mi": "Maori",
    "ml": "Malayalam",
    "cy": "Welsh",
    "sk": "Slovak",
    "te": "Telugu",
    "fa": "Persian",
    "lv": "Latvian",
    "bn": "Bengali",
    "sr": "Serbian",
    "az": "Azerbaijani",
    "sl": "Slovenian",
    "kn": "Kannada",
    "et": "Estonian",
    "mk": "Macedonian",
    "br": "Breton",
    "eu": "Basque",
    "is": "Icelandic",
    "hy": "Armenian",
    "ne": "Nepali",
    "mn": "Mongolian",
    "bs": "Bosnian",
    "kk": "Kazakh",
    "sq": "Albanian",
    "sw": "Swahili",
    "gl": "Galician",
    "mr": "Marathi",
    "pa": "Punjabi",
    "si": "Sinhala",
    "km": "Khmer",
    "sn": "Shona",
    "yo": "Yoruba",
    "so": "Somali",
    "af": "Afrikaans",
    "oc": "Occitan",
    "ka": "Georgian",
    "be": "Belarusian",
    "tg": "Tajik",
    "sd": "Sindhi",
    "gu": "Gujarati",
    "am": "Amharic",
    "yi": "Yiddish",
    "lo": "Lao",
    "uz": "Uzbek",
    "fo": "Faroese",
    "ht": "Haitian creole",
    "ps": "Pashto",
    "tk": "Turkmen",
    "nn": "Nynorsk",
    "mt": "Maltese",
    "sa": "Sanskrit",
    "lb": "Luxembourgish",
    "my": "Myanmar",
    "bo": "Tibetan",
    "tl": "Tagalog",
    "mg": "Malagasy",
    "as": "Assamese",
    "tt": "Tatar",
    "haw": "Hawaiian",
    "ln": "Lingala",
    "ha": "Hausa",
    "ba": "Bashkir",
    "jw": "Javanese",
    "su": "Sundanese",
}

### Transcribe audio

In [21]:
def export_file(segments, out_filepath):
    remove_file_or_dir(out_filepath)
    with open(out_filepath, "w+") as f:
        csv_writer = csv.writer(f, delimiter=",")
        for segment in segments:
            csv_writer.writerow([segment.start, segment.end, segment.text])
    return out_filepath


def speech_to_text(audio_file, language="vi", out_dir="data/texts/"):
    # Run on GPU with FP16
    model_size = "medium"
    # model = WhisperModel(model_size, device="cpu", compute_type="int8", download_root="model", local_files_only=True)
    print(f"Using Whisper model version {model_size}")
    # or run on GPU with INT8
    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
    # or run on CPU with INT8
    model = WhisperModel(model_size, device="cpu", compute_type="int8", download_root="models", local_files_only=True)
    # model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        best_of=5,
        language=language,
        # no_speech_threshold=0.5,
        # vad_filter=True,
        # vad_parameters=dict(min_silence_duration_ms=500),
        # word_timestamps=True,
    )

    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
    # for segment in segments:
    #     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
    
    filename, ext = os.path.splitext(os.path.basename(f"{audio_file}"))
    output_file = os.path.join(out_dir, filename + ".csv")
    output_file = export_file(segments, output_file)
    print(f"Output file saved at {output_file}")
    return output_file, info

In [4]:
audio_path = "data/sound/GPT_NEEDED_THIS___DALL-E_3_IS_HERE___YOU_CAN_TRY_IT_FREE.wav"
# _, info = speech_to_text(audio_path, language="en")

### Translate the text with google translator

We can try https://huggingface.co/VietAI/envit5-translation
or https://github.com/VinAIResearch/VinAI_Translate

In [5]:
def translate_text(text, target_lang_code="vi", src_lang_code="en"):
    translator = Translator()
    translated_text = translator.translate(text, dest=target_lang_code).text
    return translated_text

In [6]:
def merge_csv_rows(input_file):
    merged_rows = []
    # Read the input CSV file
    with open(input_file, 'r') as file:
        reader = csv.DictReader(file)
        # Initialize the first row as the current row
        current_row = next(reader)
        for row in reader:
            # Check if the "end" value of the current row is equal to the "end" value of the next row
            if float(current_row['end']) == float(row['end']):
                # Merge the "text" values of the current row and the next row
                current_row['text'] += row['text']
            else:
                # Append the merged row to the list
                merged_rows.append(current_row)
                # Update the current row to the next row
                current_row = row
        
        # Append the last merged row to the list
        merged_rows.append(current_row)
    return merged_rows

In [40]:
def merge_consecutive_segments(timestamp_segments_file, epsilon=0.05, out_dir="data/merged_texts/"):
    timestamp_segments = pd.read_csv(timestamp_segments_file,
                                 names=["start", "end", "text"])
    n_segments = len(timestamp_segments)
    merged_segments = []
    current_row = timestamp_segments.iloc[0].tolist() #loc[[0]].tolist()
    for i in range(0, n_segments -1):
        next_row = timestamp_segments.iloc[i+1].tolist()#loc[[i+1]].tolist()
        # if start time of next row - end time of previous row < epsilon -> merge it
        if float(next_row[0]) - float(current_row[1]) <= epsilon:
            current_row[1] = next_row[1]
            current_row[2] += next_row[2]
        else:
            merged_segments.append(current_row)
            current_row = next_row
    # concat the last row to the list
    merged_segments.append(current_row)

    file_name = os.path.basename(timestamp_segments_file)
    file_path = os.path.join(out_dir, file_name)
    remove_file_or_dir(file_path)
    with open(file_path, "w") as file:
        csv_writer = csv.writer(file, delimiter=",")
        csv_writer.writerows(merged_segments)
    return file_path


In [42]:
text_csv = "/mnt/d/Projects/video-translation/data/texts/GPT_NEEDED_THIS___DALL-E_3_IS_HERE___YOU_CAN_TRY_IT_FREE.csv"
merged_segments = merge_consecutive_segments(text_csv)

In [31]:
merged_segments

'data/merged_texts/GPT_NEEDED_THIS___DALL-E_3_IS_HERE___YOU_CAN_TRY_IT_FREE.csv'

### Translation

In [None]:
# Load text from csv
def create_translated_speech_csv(speech_csv_filepath, out_dir="data/translated_texts"):
    filename = os.path.basename(speech_csv_filepath)
    full_text = ""
    timestamp_segments = pd.read_csv(speech_csv_filepath,
                                 names=["start", "end", "text"])
    print(f"There are {len(timestamp_segments)} segments before translating!")
    # check if there are any empty string
    """
    empty_row_idx = timestamp_text[timestamp_text['text'] == ''].index
    if len(empty_row_idx) > 0:
        print(f"There ")
    """
    # get full text
    full_text = '|'.join(timestamp_segments["text"][:10])
    print(f"There are {len(full_text.split())} words.")
    # translate the text
    trans_full_text = translate_text(full_text)
    print(f"Translated text: {trans_full_text}")
    translated_text_segments = trans_full_text.split("|")
    print(f"There are {len(translated_text_segments)} segments after translating!")
    # timestamp_segments['text'] = translated_text_segments
    # save thte translated timestamp speech textt
    # timestamp_segments.to_csv(os.path.join(out_dir, filename))
    # Writing to CSV file
    return timestamp_segments

In [None]:
text_csv = "/mnt/d/Projects/video-translation/data/texts/GPT_NEEDED_THIS___DALL-E_3_IS_HERE___YOU_CAN_TRY_IT_FREE.csv"
# df_text = create_translated_speech_csv(text_csv)

In [None]:
merged_text = merge_csv_rows(text_csv)
print(merged_text)

KeyError: 'end'