In [1]:
import torch
def m4t_translate_and_save(df, translator, src_lang, tgt_lang, output_file):
    # src_lang, tgt_lang = 'eng' or 'jpn'
    try:
        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                index_row = 'Japanese' if src_lang == 'jpn' else 'English'
                source_text = str(row[index_row])

                # Translate the text using the local translator
                translated_text, _ = translator.predict(
                    input=source_text,
                    task_str="T2TT",
                    tgt_lang=(tgt_lang),
                    src_lang=src_lang,
                    unit_generation_opts=None, 
                )
                
                # Write the translated text to the output file
                file.write(str(translated_text[0]) + '\n\n')
                #print("SRC TEXT:", source_text, "\n", "OUTPUT TEXT:", str(translated_text[0]), "\n") # TODO remove 

        print(f"Translation completed. Translations saved to: {output_file}")

    except IOError as e :
        print(f"Error writing to file: {output_file}: {e}")

    except Exception as e:
        print(f"An error occurred during translation: {str(e)}")

In [2]:
import pandas as pd
import json
def reassemble_test_dataset(json_path, engl_paths, japn_paths):
    """
    Reassembles the dataset from the original files using indices stored in a JSON file.

    Args:
    json_path (str): Path to the JSON file containing the indices.
    engl_paths (list): List of paths to the English files in the order they were originally concatenated.
    japn_paths (list): List of paths to the Japanese files in the order they were originally concatenated.

    Returns:
    pd.DataFrame: The reassembled DataFrame.
    """
    # Read the indices from the JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        indices = json.load(json_file)
    
    # Initialize an empty DataFrame
    df = pd.DataFrame()

    # Extract and concatenate the subsets using the indices
    for engl_path, japn_path, index_key in zip(engl_paths, japn_paths, indices):
        with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
            engl_lines = engl_file.readlines()
            japn_lines = japn_file.readlines()
        
        # Extract the subset of lines using the index
        start_index = indices[index_key]
        engl_subset = engl_lines[start_index:start_index + 300]
        japn_subset = japn_lines[start_index:start_index + 300]

        # Create a new DataFrame with the extracted lines
        new_data = {'English': engl_subset, 'Japanese': japn_subset}
        new_df = pd.DataFrame(new_data)

        # Concatenate the new DataFrame with the existing one
        df = pd.concat([df, new_df], ignore_index=True)
    
    return df



In [4]:
json_path = './model_outputs/test/en_to_jp/index.json'
engl_paths = [
    './datasets/public/kftt-data-1.0/data/tok/kyoto-test.en',
    './datasets/public/pheMT_final/tok.en',
    './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en'
]
japn_paths = [
    './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja',
    './datasets/public/pheMT_final/tok.ja',
    './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja'
]
kftt_phemt_aspec = reassemble_test_dataset(json_path, engl_paths, japn_paths)

In [5]:
from seamless_communication.inference import Translator, SequenceGeneratorOptions
from fairseq2.generation import NGramRepeatBlockProcessor
print(f"PyTorch version: {torch.__version__}")

# Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
print(f"Is MPS available? {torch.backends.mps.is_available()}")

# Set the device      
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

translator_v1 = Translator(
    "seamlessM4T_large",
    "vocoder_36langs",
    device=torch.device("cpu"),
    dtype=torch.float16,
)

translator_v2 = Translator(
    "seamlessM4T_v2_large",
    "vocoder_v2",
    device=torch.device("cpu"),
    dtype=torch.float16,
)

#m4t_translate_and_save(kftt_phemt_aspec, translator_v1, 'eng', 'jpn', 'model_outputs/test/en_to_jp/m4tv1/out.txt') DONE 

PyTorch version: 2.2.2
Is MPS (Metal Performance Shader) built? True
Is MPS available? True
Using device: mps


Using the cached checkpoint of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_36langs. Set `force` to `True` to download again.
Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


Translation completed. Translations saved to: model_outputs/test/en_to_jp/m4tv1/out.txt


KeyboardInterrupt: 

In [7]:
#m4t_translate_and_save(kftt_phemt_aspec, translator_v1, 'jpn', 'eng', 'model_outputs/test/jp_to_en/m4tv1/out.txt') # DONE 
#m4t_translate_and_save(kftt_phemt_aspec, translator_v2, 'eng', 'jpn', 'model_outputs/test/en_to_jp/m4tv2/out.txt') # DONE 
#m4t_translate_and_save(kftt_phemt_aspec, translator_v2, 'jpn', 'eng', 'model_outputs/test/jp_to_en/m4tv2/out.txt') # DONE 

Translation completed. Translations saved to: model_outputs/test/jp_to_en/m4tv1/out.txt
Translation completed. Translations saved to: model_outputs/test/en_to_jp/m4tv2/out.txt
Translation completed. Translations saved to: model_outputs/test/jp_to_en/m4tv2/out.txt
