# Reassemble the Test DataFrame

In [4]:
import pandas as pd
import json

def reassemble_test_dataset(json_path, engl_paths, japn_paths, subset_size=300):
    """
    Reassembles the dataset from the original files using indices stored in a JSON file.

    The JSON file is expected to hold one start index per corpus. Its keys are
    paired positionally with ``engl_paths`` / ``japn_paths`` (via ``zip``), so
    the key order in the file must match the order of the path lists; extra
    entries on either side are ignored.

    Args:
    json_path (str): Path to the JSON file containing the indices.
    engl_paths (list): List of paths to the English files in the order they were originally concatenated.
    japn_paths (list): List of paths to the Japanese files in the order they were originally concatenated.
    subset_size (int): Number of consecutive lines taken from each corpus,
        starting at its stored index. Defaults to 300.

    Returns:
    pd.DataFrame: The reassembled DataFrame with columns 'English' and
        'Japanese'. Each cell is a raw line as returned by ``readlines()``,
        i.e. it still carries its trailing newline. An empty DataFrame is
        returned when there is nothing to assemble.
    """
    # Read the indices from the JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        indices = json.load(json_file)

    # Collect one frame per corpus and concatenate once at the end, instead of
    # re-copying the accumulated DataFrame on every iteration.
    frames = []
    for engl_path, japn_path, index_key in zip(engl_paths, japn_paths, indices):
        with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
            engl_lines = engl_file.readlines()
            japn_lines = japn_file.readlines()

        # Extract the aligned window of lines for this corpus
        start_index = indices[index_key]
        end_index = start_index + subset_size
        frames.append(pd.DataFrame({
            'English': engl_lines[start_index:end_index],
            'Japanese': japn_lines[start_index:end_index],
        }))

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Source corpora, listed in the order they were originally concatenated
# (KFTT test, PheMT, ASPEC devtest). The two lists must stay index-aligned.
japn_paths = [
    './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja',
    './datasets/public/pheMT_final/tok.ja',
    './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja',
]
engl_paths = [
    './datasets/public/kftt-data-1.0/data/tok/kyoto-test.en',
    './datasets/public/pheMT_final/tok.en',
    './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en',
]

# JSON file holding the per-corpus start indices used to rebuild the test set.
json_path = './model_outputs/test/en_to_jp/index.json'

kftt_phemt_aspec = reassemble_test_dataset(
    json_path,
    engl_paths,
    japn_paths,
)

# M4T Inference


In [3]:
# Use the explicit %conda magic so the cell does not depend on automagic being enabled.
%conda install -c conda-forge libsndfile==1.0.31

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.3.0



## Package Plan ##

  environment location: /Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env

  added / updated specs:
    - libsndfile==1.0.31


The following NEW packages will be INSTALLED:

  gettext            conda-forge/osx-arm64::gettext-0.22.5-h8fbad5d_2 
  gettext-tools      conda-forge/osx-arm64::gettext-tools-0.22.5-h8fbad5d_2 
  libasprintf        conda-forge/osx-arm64::libasprintf-0.22.5-h8fbad5d_2 
  libasprintf-devel  conda-forge/osx-arm64::libasprintf-devel-0.22.5-h8fbad5d_2 
  libflac            conda-forge/osx-arm64::libflac-1.3.4-h07bb92c_0 
  libgettextpo       conda-forge/osx-arm64::libgettextpo-0.22.5-h8fbad5d_2 
  libgettextpo-devel

In [38]:
import torch
from seamless_communication.inference import Translator
print(f"PyTorch version: {torch.__version__}")

# Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
print(f"Is MPS available? {torch.backends.mps.is_available()}")

# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Build the translator on the device selected above instead of hard-coding
# MPS, so the CPU fallback is actually honoured. Half precision is kept for
# MPS; float32 is used for the CPU fallback.
m4tv1_translator = Translator(
    "seamlessM4T_large",
    "vocoder_v2",
    device=torch.device("mps:0" if device == "mps" else "cpu"),
    dtype=torch.float16 if device == "mps" else torch.float32,
)


PyTorch version: 2.2.2
Is MPS (Metal Performance Shader) built? True
Is MPS available? True
Using device: mps


Using the cached checkpoint of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


In [74]:
import pandas as pd

def m4t_v1_translate_and_save(df, translator, src_lang, tgt_lang, output_file):
    """
    Translates one column of ``df`` with an M4T translator and writes one
    translation per line to ``output_file``.

    Args:
    df (pd.DataFrame): Frame with 'English' and 'Japanese' columns.
    translator: Object exposing ``predict(input=..., task_str=..., tgt_lang=...,
        src_lang=..., unit_generation_opts=...)`` whose first return value is a
        sequence; its first element is written out.
    src_lang (str): 'eng' or 'jpn'. 'jpn' reads the 'Japanese' column; any
        other value reads the 'English' column.
    tgt_lang (str): 'eng' or 'jpn'; forwarded to the translator.
    output_file (str): Path of the text file to (over)write.

    Returns:
    None. Errors are reported with ``print`` rather than raised.
    """
    # The source column depends only on src_lang, so resolve it once.
    source_column = 'Japanese' if src_lang == 'jpn' else 'English'

    try:
        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over the source sentences only
            for source_text in df[source_column]:
                # Lines loaded with readlines() keep their line terminator;
                # drop it so the model only sees the sentence itself.
                if isinstance(source_text, str):
                    source_text = source_text.rstrip('\r\n')

                # Translate the text using the local translator
                translated_text, _ = translator.predict(
                    input=source_text,
                    task_str="T2TT",
                    tgt_lang=tgt_lang,
                    src_lang=src_lang,
                    unit_generation_opts=None,
                )

                # Write the translated text to the output file
                file.write(str(translated_text[0]) + '\n')

        print(f"Translation completed. Translations saved to: {output_file}")

    except IOError:
        print(f"Error writing to file: {output_file}")

    except Exception as e:
        print(f"An error occurred during translation: {str(e)}")

In [76]:
# English -> Japanese run (currently disabled):
# m4t_v1_translate_and_save(df=kftt_phemt_aspec, translator=m4tv1_translator, src_lang='eng', tgt_lang='jpn', output_file='model_outputs/test/en_to_jp/m4tv1/out.txt')

# Japanese -> English run:
m4t_v1_translate_and_save(
    df=kftt_phemt_aspec,
    translator=m4tv1_translator,
    src_lang='jpn',
    tgt_lang='eng',
    output_file='model_outputs/test/jp_to_en/m4tv1/out.txt',
)


KeyboardInterrupt: 

## M4T Benchmarking (TODO: come back to this)

In [8]:
# %pip installs into the environment of the running kernel.
# NOTE(review): no version is pinned here — pin the version actually used once known.
%pip install transformers

In [3]:
# %pip installs into the environment of the running kernel; pinned to the version this notebook was run with.
%pip install sentencepiece==0.2.0

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m31m15.3 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
# %pip installs into the environment of the running kernel; pinned to the version this notebook was run with.
%pip install accelerate==0.29.3

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.29.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
# %pip installs into the environment of the running kernel; pinned to the version this notebook was run with.
%pip install protobuf==5.26.1

Collecting protobuf
  Downloading protobuf-5.26.1-cp37-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Downloading protobuf-5.26.1-cp37-abi3-macosx_10_9_universal2.whl (404 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.0/404.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: protobuf
Successfully installed protobuf-5.26.1
Note: you may need to restart the kernel to use updated packages.


# ALMA Inference 

In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

print(f"PyTorch version: {torch.__version__}")

# Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
print(f"Is MPS available? {torch.backends.mps.is_available()}")

# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load base model and LoRA weights onto the selected device (half precision
# on MPS, float32 for the CPU fallback) rather than hard-coding 'mps'.
model_dtype = torch.float16 if device == "mps" else torch.float32
model = AutoModelForCausalLM.from_pretrained("haoranxu/ALMA-13B-R", torch_dtype=model_dtype).to(device)
tokenizer = AutoTokenizer.from_pretrained("haoranxu/ALMA-13B-R", padding_side='left')

# Add the source sentence into the prompt template
prompt="Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"
# Inputs must live on the same device as the model
input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=40, truncation=True).input_ids.to(device)

# Translation
with torch.no_grad():
    generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(outputs)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


PyTorch version: 2.2.2
Is MPS (Metal Performance Shader) built? True
Is MPS available? True
Using device: mps


  _torch_pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

['Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish: I love machine translation.']


In [10]:
# Quick check on the first English sentence of the test set.
# Requires translate_text, alma_model and alma_tokenizer, which are defined in
# cells further down this notebook — run those cells first.
test_english = kftt_phemt_aspec.loc[0, 'English']
print("Original:", test_english)
alma_translation = translate_text(alma_tokenizer, alma_model, test_english, 'English', 'Japanese')
print(alma_translation)

Original: Jakugen developed a school later , so there were two schools of Shomyo in Ohara .

THE PROMPT IS Translate this from English to Japanese:
English: Jakugen developed a school later , so there were two schools of Shomyo in Ohara .

Japanese:
RAW OUTPUT Translate this from English to Japanese:
English: Jakugen developed a school later , so there were two schools of Shomyo in Ohara .

Japanese: 後に 八玄が学校を設立したため、大
後に 八玄が学校を設立したため、大


### Actually do the inference 

In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Pick the device at load time instead of hard-coding 'mps', so the cell also
# runs where MPS is unavailable. Half precision is kept on MPS; float32 is
# used for the CPU fallback.
alma_device = "mps" if torch.backends.mps.is_available() else "cpu"
alma_dtype = torch.float16 if alma_device == "mps" else torch.float32

alma_model = AutoModelForCausalLM.from_pretrained("haoranxu/ALMA-13B-R", torch_dtype=alma_dtype).to(alma_device)
alma_tokenizer = AutoTokenizer.from_pretrained("haoranxu/ALMA-13B-R", padding_side='left')
print("FINISHED CREATING MODEL AND TOKENIZER")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

FINISHED CREATING MODEL AND TOKENIZER


In [9]:
def translate_text(tokenizer, model, source_text, src_lang, tgt_lang, max_length=200, max_new_tokens=100):
    """
    Translates a single sentence with a causal language model using the
    "Translate this from X to Y" prompt template.

    Args:
    tokenizer: Callable tokenizer returning an object with ``input_ids`` and
        providing ``batch_decode``.
    model: Model exposing ``generate`` and a ``device`` attribute.
    source_text (str): Sentence to translate. Surrounding whitespace
        (including a trailing newline) is removed before it is put in the prompt.
    src_lang (str): Source language name as used in the prompt, e.g. 'English'.
    tgt_lang (str): Target language name as used in the prompt, e.g. 'Japanese'.
    max_length (int): Maximum number of prompt tokens; longer prompts are
        truncated. Defaults to 200.
    max_new_tokens (int): Maximum number of generated tokens. Defaults to 100.

    Returns:
    str: The generated continuation only (the prompt is not included),
        stripped of surrounding whitespace.
    """
    # Create the prompt template based on the source and target languages.
    # The source is stripped so a trailing newline does not insert a blank
    # line between the source sentence and the target-language cue.
    clean_source = str(source_text).strip()
    prompt = f"Translate this from {src_lang} to {tgt_lang}:\n{src_lang}: {clean_source}\n{tgt_lang}:"

    # Place the inputs on whatever device the model lives on
    input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=max_length, truncation=True).input_ids.to(model.device)

    # Translation
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.6, top_p=0.9)

    # Extract the translated text from the outputs: the generated sequence
    # starts with the prompt tokens, so keep only what follows them.
    new_token_ids = generated_ids[:, input_ids.shape[1]:]
    outputs = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)

    translated_text = outputs[0].strip()
    return translated_text

In [7]:
def ALMA_R_translate_and_save(df, model, tokenizer, src_lang, tgt_lang, output_file, max_rows=None):
    """
    Translates one column of ``df`` with ``translate_text`` and writes one
    result per row to ``output_file``.

    Args:
    df (pd.DataFrame): Frame whose columns are named after the languages
        ('English', 'Japanese').
    model: Model forwarded to ``translate_text``.
    tokenizer: Tokenizer forwarded to ``translate_text``.
    src_lang (str): 'English' or 'Japanese'; also the name of the column read.
    tgt_lang (str): 'English' or 'Japanese'.
    output_file (str): Path of the text file to (over)write.
    max_rows (int or None): If given, only the first ``max_rows`` rows are
        translated (useful for a quick smoke test). ``None`` (the default)
        translates every row.

    Returns:
    None. Errors are reported with ``print`` rather than raised.
    """
    try:
        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            print(f"OPENED FILE {output_file}")

            # Select the sentences to translate; the optional cap replaces the
            # former hard-coded debug limit that stopped after two rows.
            source_texts = df[src_lang]
            if max_rows is not None:
                source_texts = source_texts.iloc[:max_rows]

            for source_text in source_texts:
                # Translate the text using the local translator
                translated_text = translate_text(tokenizer, model, source_text, src_lang, tgt_lang)

                # Write the translated text to the output file
                file.write(str(translated_text) + '\n')
        print(f"Translation completed. Translations saved to: {output_file}")

    except IOError:
        print(f"Error writing to file: {output_file}")

    except Exception as e:
        print(f"An error occurred during translation: {str(e)}")

In [None]:
# English -> Japanese run (currently disabled):
# ALMA_R_translate_and_save(df=kftt_phemt_aspec, model=alma_model, tokenizer=alma_tokenizer, src_lang='English', tgt_lang='Japanese', output_file='model_outputs/test/en_to_jp/ALMA-R/out2.txt')

# Japanese -> English run:
ALMA_R_translate_and_save(
    df=kftt_phemt_aspec,
    model=alma_model,
    tokenizer=alma_tokenizer,
    src_lang='Japanese',
    tgt_lang='English',
    output_file='model_outputs/test/jp_to_en/ALMA-R/out.txt',
)

OPENED FILE model_outputs/test/jp_to_en/ALMA-R/out.txt
