# Reassemble DataFrame

In [31]:
import pandas as pd
import json

def reassemble_test_dataset(json_path, engl_paths, japn_paths, subset_size=300):
    """
    Reassembles the dataset from the original files using indices stored in a JSON file.

    For each (English file, Japanese file) pair, ``subset_size`` consecutive lines are
    taken starting at the line offset stored in the JSON file. Offsets are matched to
    file pairs by position: the i-th entry of the JSON object (in file order) is used
    for the i-th pair of paths. Lines are kept exactly as read, including their
    trailing newline characters.

    Args:
    json_path (str): Path to the JSON file containing the indices.
    engl_paths (list): List of paths to the English files in the order they were originally concatenated.
    japn_paths (list): List of paths to the Japanese files in the order they were originally concatenated.
    subset_size (int): Number of lines to take from each file pair. Defaults to 300.

    Returns:
    pd.DataFrame: The reassembled DataFrame with 'English' and 'Japanese' columns and a
    fresh 0..n-1 index. An empty DataFrame is returned when no paths are given.

    Raises:
    ValueError: If the two path lists differ in length, if the JSON file holds fewer
    indices than there are file pairs, or if the English and Japanese subsets of a
    pair do not contain the same number of lines.
    """
    # Read the indices from the JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        indices = json.load(json_file)

    # zip() would silently drop trailing corpora on a length mismatch, so fail loudly instead.
    if len(engl_paths) != len(japn_paths):
        raise ValueError(
            f"Got {len(engl_paths)} English paths but {len(japn_paths)} Japanese paths."
        )
    if len(indices) < len(engl_paths):
        raise ValueError(
            f"{json_path} holds {len(indices)} indices but {len(engl_paths)} file pairs were given."
        )

    # Collect one frame per corpus and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    frames = []
    for engl_path, japn_path, index_key in zip(engl_paths, japn_paths, indices):
        with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
            engl_lines = engl_file.readlines()
            japn_lines = japn_file.readlines()

        # Extract the subset of lines using the index
        start_index = indices[index_key]
        stop_index = start_index + subset_size
        engl_subset = engl_lines[start_index:stop_index]
        japn_subset = japn_lines[start_index:stop_index]

        # A parallel corpus must stay line-aligned; report which pair is broken.
        if len(engl_subset) != len(japn_subset):
            raise ValueError(
                f"Misaligned subset for index '{index_key}': {len(engl_subset)} lines from "
                f"{engl_path} vs {len(japn_subset)} lines from {japn_path}."
            )

        frames.append(pd.DataFrame({'English': engl_subset, 'Japanese': japn_subset}))

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame as before.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# JSON file holding the start index used for each corpus.
json_path = './model_outputs/test/en_to_jp/index.json'

# (English file, Japanese file) pairs, listed in the order they were originally concatenated.
_corpus_pairs = (
    (
        './datasets/public/kftt-data-1.0/data/tok/kyoto-test.en',
        './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja',
    ),
    (
        './datasets/public/pheMT_final/tok.en',
        './datasets/public/pheMT_final/tok.ja',
    ),
    (
        './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en',
        './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja',
    ),
)

# Split the pairs back into the two parallel lists the helper expects.
engl_paths = [engl for engl, _japn in _corpus_pairs]
japn_paths = [japn for _engl, japn in _corpus_pairs]

kftt_phemt_aspec = reassemble_test_dataset(json_path, engl_paths, japn_paths)

# M4T Inference


In [3]:
# Install the pinned libsndfile build from the conda-forge channel.
# The explicit %conda magic keeps the cell working even when IPython's automagic
# is disabled or when the cell contains more than one line.
%conda install -c conda-forge libsndfile==1.0.31

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.3.0



## Package Plan ##

  environment location: /Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env

  added / updated specs:
    - libsndfile==1.0.31


The following NEW packages will be INSTALLED:

  gettext            conda-forge/osx-arm64::gettext-0.22.5-h8fbad5d_2 
  gettext-tools      conda-forge/osx-arm64::gettext-tools-0.22.5-h8fbad5d_2 
  libasprintf        conda-forge/osx-arm64::libasprintf-0.22.5-h8fbad5d_2 
  libasprintf-devel  conda-forge/osx-arm64::libasprintf-devel-0.22.5-h8fbad5d_2 
  libflac            conda-forge/osx-arm64::libflac-1.3.4-h07bb92c_0 
  libgettextpo       conda-forge/osx-arm64::libgettextpo-0.22.5-h8fbad5d_2 
  libgettextpo-devel

In [38]:
import torch
from seamless_communication.inference import Translator
print(f"PyTorch version: {torch.__version__}")

# Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
print(f"Is MPS available? {torch.backends.mps.is_available()}")

# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Build the translator on the device selected above rather than a hard-coded
# "mps:0", so the CPU fallback is actually honoured on machines without MPS.
# Half precision is only requested on the accelerator; the CPU path uses float32.
translator_device = torch.device("mps:0") if device == "mps" else torch.device("cpu")
translator_dtype = torch.float16 if device == "mps" else torch.float32

m4tv1_translator = Translator(
    "seamlessM4T_large",
    "vocoder_v2",
    device=translator_device,
    dtype=translator_dtype,
)


PyTorch version: 2.2.2
Is MPS (Metal Performance Shader) built? True
Is MPS available? True
Using device: mps


Using the cached checkpoint of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


In [74]:
import pandas as pd

def m4t_v1_translate_and_save(df, translator, src_lang, tgt_lang, output_file):
    """
    Translates one column of a DataFrame and writes one translation per line to a file.

    Args:
    df (pd.DataFrame): Frame with 'English' and 'Japanese' columns holding the source sentences.
    translator: Object whose ``predict`` method accepts ``input``, ``task_str``, ``tgt_lang``,
        ``src_lang`` and ``unit_generation_opts`` keyword arguments and returns a pair whose
        first element is indexable; element 0 of it is written as the translation.
    src_lang (str): Source language code. 'jpn' reads the 'Japanese' column; any other
        value (e.g. 'eng') reads the 'English' column.
    tgt_lang (str): Target language code passed through to the translator ('eng' or 'jpn').
    output_file (str): Path of the text file to (over)write. Missing parent directories are created.

    Returns:
    None. Failures are reported with ``print`` instead of being raised, so the caller
    can continue with the next translation direction; the output file may then be partial.
    """
    import os  # local import: this cell does not import os at module level

    # The source column depends only on src_lang, so pick it once outside the loop.
    source_column = 'Japanese' if src_lang == 'jpn' else 'English'

    try:
        # Create the output directory up front so a fresh checkout does not fail on open().
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over the source sentences
            for source_text in df[source_column]:
                # Sentences loaded with readlines() still end in a newline; drop it so
                # the translator only sees the sentence itself.
                if isinstance(source_text, str):
                    source_text = source_text.rstrip('\r\n')

                # Translate the text using the local translator
                translated_text, _ = translator.predict(
                    input=source_text,
                    task_str="T2TT",
                    tgt_lang=tgt_lang,
                    src_lang=src_lang,
                    unit_generation_opts=None,
                )

                # Write the translated text to the output file
                file.write(str(translated_text[0]) + '\n')

        print(f"Translation completed. Translations saved to: {output_file}")

    except IOError:
        print(f"Error writing to file: {output_file}")

    except Exception as e:
        print(f"An error occurred during translation: {str(e)}")

In [None]:
# TODO: run both translation directions (eng->jpn, then jpn->eng) on the reassembled test set.
for src_lang, tgt_lang, output_file in (
    ('eng', 'jpn', 'model_outputs/test/en_to_jp/m4tv1/out.txt'),
    ('jpn', 'eng', 'model_outputs/test/jp_to_en/m4tv1/out.txt'),
):
    m4t_v1_translate_and_save(
        df=kftt_phemt_aspec,
        translator=m4tv1_translator,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        output_file=output_file,
    )
