In [5]:
import pandas as pd
import json

def reassemble_test_dataset(json_path, engl_paths, japn_paths, subset_size=300):
    """
    Reassemble the parallel test set from the original corpus files.

    A JSON file maps one key per corpus to the line offset at which that
    corpus's test subset starts. For each (English, Japanese) file pair the
    function takes ``subset_size`` consecutive lines beginning at that offset
    and stacks all subsets into a single DataFrame.

    Args:
    json_path (str): Path to the JSON file containing the start indices.
    engl_paths (list): Paths to the English files, in the order they were originally concatenated.
    japn_paths (list): Paths to the Japanese files, in the same order as ``engl_paths``.
    subset_size (int): Number of lines taken from each corpus (default 300).

    Returns:
    pd.DataFrame: Columns 'English' and 'Japanese' with a fresh 0..n-1 index.
        Lines are returned exactly as read, i.e. including their trailing newline.

    Raises:
    ValueError: If ``engl_paths`` and ``japn_paths`` differ in length.
    """
    if len(engl_paths) != len(japn_paths):
        raise ValueError(
            f"engl_paths ({len(engl_paths)}) and japn_paths ({len(japn_paths)}) "
            "must contain the same number of files"
        )

    # Read the indices from the JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        indices = json.load(json_file)

    # Collect one frame per corpus and concatenate once at the end; repeatedly
    # concatenating onto a growing frame re-copies every earlier row each time.
    frames = []

    # Iterating the loaded mapping yields its keys in file order, so the i-th
    # key is paired with the i-th (English, Japanese) file pair.
    for engl_path, japn_path, index_key in zip(engl_paths, japn_paths, indices):
        with open(engl_path, 'r', encoding='utf-8') as engl_file, \
                open(japn_path, 'r', encoding='utf-8') as japn_file:
            engl_lines = engl_file.readlines()
            japn_lines = japn_file.readlines()

        # Extract the aligned window of lines from both sides
        start_index = indices[index_key]
        stop_index = start_index + subset_size
        frames.append(pd.DataFrame({
            'English': engl_lines[start_index:stop_index],
            'Japanese': japn_lines[start_index:stop_index],
        }))

    if not frames:
        # Nothing to read: same result as the original empty accumulator
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# JSON file holding the start index of each corpus's test subset.
json_path = './model_outputs/test/en_to_jp/index.json'

# One (English, Japanese) file pair per corpus; keeping the two sides next to
# each other makes it obvious that both lists stay in the same order.
_corpus_pairs = [
    (
        './datasets/public/kftt-data-1.0/data/tok/kyoto-test.en',
        './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja',
    ),
    (
        './datasets/public/pheMT_final/tok.en',
        './datasets/public/pheMT_final/tok.ja',
    ),
    (
        './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en',
        './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja',
    ),
]
engl_paths = [pair[0] for pair in _corpus_pairs]
japn_paths = [pair[1] for pair in _corpus_pairs]

# Combined test set used by the translation cells further down.
kftt_phemt_aspec = reassemble_test_dataset(
    json_path=json_path,
    engl_paths=engl_paths,
    japn_paths=japn_paths,
)

In [7]:
pip install -r ./seamless_communication/demo/m4tv1/requirements.txt

Collecting git+https://github.com/facebookresearch/seamless_communication (from -r ./seamless_communication/demo/m4tv1/requirements.txt (line 2))
  Cloning https://github.com/facebookresearch/seamless_communication to c:\users\karat\appdata\local\temp\pip-req-build-ojdketrm
  Resolved https://github.com/facebookresearch/seamless_communication to commit 66054d4278e3ac792abc50dd0f27fe84b500d1e3
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting fairseq2
  Using cached fairseq2-0.2.1-py3-none-any.whl (191 kB)
Collecting gradio
  Downloading gradio-4.29.0-py3-none-any.whl (12.3 MB)
     ---------------------------------------- 0.0/12.3 MB ? eta -:--:--
     --------------------------------------- 0.0/12.3

  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/seamless_communication 'C:\Users\karat\AppData\Local\Temp\pip-req-build-ojdketrm'
  Running command git submodule update --init --recursive -q
ERROR: Cannot install -r ./seamless_communication/demo/m4tv1/requirements.txt (line 1) because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts

[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Load the ALMA-13B-R checkpoint in float16 and move it to the GPU, together
# with its tokenizer (left padding, as needed for decoder-only generation).
# NOTE: the recorded run of this cell stopped with a CUDA OutOfMemoryError on a
# GPU reporting 11.99 GiB total capacity.
_alma_checkpoint = "haoranxu/ALMA-13B-R"

alma_model = (
    AutoModelForCausalLM
    .from_pretrained(_alma_checkpoint, torch_dtype=torch.float16)
    .to('cuda')
)
alma_tokenizer = AutoTokenizer.from_pretrained(
    _alma_checkpoint,
    padding_side='left',
)
print("FINISHED CREATING MODEL AND TOKENIZER")

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   6%|6         | 315M/4.98G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Of the allocated memory 18.51 GiB is allocated by PyTorch, and 2.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## OK, so there is not enough VRAM on the 4070 Ti for inference with this model; let's try the smaller version

In [2]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp39-cp39-win_amd64.whl (991 kB)
     ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
     - ----------------------------------- 30.7/991.5 kB 660.6 kB/s eta 0:00:02
     - ----------------------------------- 30.7/991.5 kB 660.6 kB/s eta 0:00:02
     - ----------------------------------- 30.7/991.5 kB 660.6 kB/s eta 0:00:02
     --- --------------------------------- 81.9/991.5 kB 416.7 kB/s eta 0:00:03
     ---- ------------------------------- 112.6/991.5 kB 544.7 kB/s eta 0:00:02
     ---- ------------------------------- 122.9/991.5 kB 423.5 kB/s eta 0:00:03
     ------- ---------------------------- 194.6/991.5 kB 620.6 kB/s eta 0:00:02
     --------- -------------------------- 256.0/991.5 kB 714.4 kB/s eta 0:00:02
     ---------- ------------------------- 286.7/991.5 kB 681.0 kB/s eta 0:00:02
     --------------- -------------------- 440.3/991.5 kB 949.4 kB/s eta 0:00:01
     --------------------- --------


[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Load the smaller ALMA-7B-R checkpoint in float16 and move it to the GPU,
# together with its tokenizer (left padding for decoder-only generation).
_alma_checkpoint = "haoranxu/ALMA-7B-R"

alma_model = (
    AutoModelForCausalLM
    .from_pretrained(_alma_checkpoint, torch_dtype=torch.float16)
    .to('cuda')
)
alma_tokenizer = AutoTokenizer.from_pretrained(
    _alma_checkpoint,
    padding_side='left',
)
print("FINISHED CREATING MODEL AND TOKENIZER")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

FINISHED CREATING MODEL AND TOKENIZER


In [7]:
# Smoke test: push one prompt in the ALMA template through the loaded model.
prompt = "Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"
input_ids = alma_tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    max_length=200,
    truncation=True,
).input_ids.cuda()

# Generation settings: beam search combined with sampling.
_generation_settings = {
    "num_beams": 5,
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 0.6,
    "top_p": 0.9,
}

# Generate without tracking gradients, then decode the full sequence
# (prompt included) back to text.
with torch.no_grad():
    generated_ids = alma_model.generate(input_ids=input_ids, **_generation_settings)
outputs = alma_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(outputs)

['Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish: I love machine translation.']


In [25]:
def translate_text(tokenizer, model, source_text, src_lang, tgt_lang):
    """
    Translate one sentence with a prompt-driven causal language model.

    Args:
    tokenizer: Tokenizer matching ``model``; called on the prompt and used to decode the result.
    model: Causal LM exposing ``.device`` and ``.generate``.
    source_text (str): Sentence to translate; surrounding whitespace is ignored.
    src_lang (str): Source language name as written in the prompt (e.g. 'English').
    tgt_lang (str): Target language name as written in the prompt (e.g. 'Japanese').

    Returns:
    str: The generated translation with surrounding whitespace removed.
    """
    # Sentences read from files with readlines() keep their trailing newline;
    # strip it so the prompt has exactly one line break before the target cue.
    source_text = str(source_text).strip()

    # Create the prompt template based on the source and target languages
    prompt = f"Translate this from {src_lang} to {tgt_lang}:\n{src_lang}: {source_text}\n{tgt_lang}:"

    # NOTE(review): prompts longer than max_length tokens are truncated; if the
    # tokenizer truncates on the right this removes the trailing target-language
    # cue — confirm against the tokenizer's truncation side.
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, max_length=200, truncation=True)

    # Run on whatever device the model lives on instead of assuming CUDA
    device = model.device
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # Translation
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=5,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them: splitting the decoded text on "{tgt_lang}:" would cut
    # off any translation that itself contains that marker.
    new_token_ids = generated_ids[:, input_ids.shape[1]:]
    outputs = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)

    translated_text = outputs[0].strip()
    return translated_text

In [26]:
def ALMA_R_translate_and_save(df, model, tokenizer, src_lang, tgt_lang, output_file):
    """
    Translate one column of a DataFrame and write the results to a text file.

    Every value in ``df[src_lang]`` is passed to ``translate_text`` and the
    result is written to ``output_file`` followed by a blank line. Errors are
    reported with ``print`` rather than raised, so the function always returns
    ``None``.

    Args:
    df (pd.DataFrame): Data containing a column named ``src_lang``.
    model: Translation model forwarded to ``translate_text``.
    tokenizer: Tokenizer forwarded to ``translate_text``.
    src_lang (str): 'English' or 'Japanese'; also the name of the source column.
    tgt_lang (str): 'English' or 'Japanese'.
    output_file (str): Destination path; missing parent directories are created.
    """
    import os  # local import: keeps this cell runnable without touching the import cells

    # Check the column before opening the file: mode 'w' truncates, so a wrong
    # column name would otherwise wipe an existing output file.
    if src_lang not in df.columns:
        print(f"An error occurred during translation: column {src_lang!r} not found in DataFrame")
        return

    try:
        # Create the output directory if needed
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            print(f"OPENED FILE {output_file}")
            # Only the source column is needed, so iterate over it directly
            for source_text in df[src_lang]:
                # Translate the text using the local translator
                translated_text = translate_text(tokenizer, model, source_text, src_lang, tgt_lang)

                # Write the translated text followed by a blank separator line
                file.write(str(translated_text) + '\n\n')
        print(f"Translation completed. Translations saved to: {output_file}")

    except IOError as e:
        # Include the underlying reason (permissions, invalid path, disk full, ...)
        print(f"Error writing to file: {output_file} ({e})")

    except Exception as e:
        # Anything else (e.g. a failure inside generation); the file may be partial
        print(f"An error occurred during translation: {str(e)}")

In [27]:
# English -> Japanese run (currently disabled):
#ALMA_R_translate_and_save(df=kftt_phemt_aspec, model=alma_model, tokenizer=alma_tokenizer, src_lang='English', tgt_lang='Japanese', output_file='model_outputs/test/en_to_jp/ALMA-R/out3.txt')

# Japanese -> English run over the reassembled test set.
ALMA_R_translate_and_save(
    df=kftt_phemt_aspec,
    model=alma_model,
    tokenizer=alma_tokenizer,
    src_lang='Japanese',
    tgt_lang='English',
    output_file='model_outputs/test/jp_to_en/ALMA-R/out3.txt',
)

OPENED FILE model_outputs/test/jp_to_en/ALMA-R/out3.txt
Translation completed. Translations saved to: model_outputs/test/jp_to_en/ALMA-R/out3.txt


## OK, so the English-to-Japanese direction doesn't work on the 7B model, but it does on the 13B model