In [None]:
!pip install accelerate bitsandbytes peft transformers trl==0.12.0 datasets deepspeed
!pip install torch --upgrade

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting trl==0.12.0
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting deepspeed
  Downloading deepspeed-0.16.1.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.

In [None]:
import logging
import sys
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, TrainingArguments)
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Mount Google Drive at /content/drive; the training output_dir and the result
# spreadsheets written later in this notebook all live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Logging: send INFO-and-above records to stdout so they show up in cell output
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# Trainer hyper-parameters, forwarded as keyword arguments to TrainingArguments
training_config = dict(
    bf16=True,
    learning_rate=5.0e-5,
    num_train_epochs=1,
    output_dir="/content/drive/MyDrive/fine_tuned_model",
    per_device_train_batch_size=2,  # kept small for Colab
    per_device_eval_batch_size=2,
    save_steps=100,
    save_total_limit=1,
    logging_steps=60,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,  # 4 accumulation steps x batch of 2 per optimizer step
    warmup_ratio=0.1,
)

# LoRA adapter settings: low rank (r=8) applied to the attention projections
lora_settings = {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["qkv_proj", "o_proj"],
    "lora_dropout": 0.1,
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**lora_settings)

train_args = TrainingArguments(**training_config)

In [None]:
checkpoint = "microsoft/Phi-3-mini-4k-instruct"

# Keyword arguments for loading the base model in bfloat16 with automatic
# device placement.
# NOTE(review): trust_remote_code=True runs modeling code fetched from the Hub
# (the recorded output shows configuration_phi3.py / modeling_phi3.py being
# downloaded and suggests pinning a revision).
model_load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_load_kwargs)

# Tokenizer: reuse the unknown token as the padding token and pad on the right
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [None]:
# Load the English-Indonesian ("en-id") configuration of the opus-100 dataset
dataset = load_dataset("Helsinki-NLP/opus-100", "en-id")

# Size of the training subset and the shuffle seed, named so they can be tuned
# in one place.
TRAIN_SAMPLE_SIZE = 20_000
SHUFFLE_SEED = 42

# Take a seeded random subset of TRAIN_SAMPLE_SIZE training examples (capped at
# the size of the split) and all of the validation data.
train_split = dataset["train"]
train_data = train_split.shuffle(seed=SHUFFLE_SEED).select(
    range(min(TRAIN_SAMPLE_SIZE, len(train_split)))
)
valid_data = dataset["validation"]

print(train_data[0])

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/132k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'translation': {'en': '♪ Ch-ch-ch-ch-changes', 'id': '♪ Ch-ch-ch-ch-changes ♪'}}


In [None]:
# Preprocess function
def preprocess_function(example):
    """
    Format one translation record as a single prompt-plus-answer training string.

    Parameters:
        example (dict): A record with a 'translation' mapping that holds the
            English text under 'en' and the Indonesian text under 'id'.

    Returns:
        dict: {"text": <instruction, English source and Indonesian target>}.
    """
    pair = example['translation']
    source, target = pair['en'], pair['id']

    # Instruction header, then the labelled source line and the labelled target line
    header = "Translate the following text from English to Indonesian:\n\n"
    body = f"English: {source}\nIndonesian: {target}"

    return {"text": header + body}

In [None]:
# Preprocess and map datasets: every record is replaced by the {"text": ...}
# string built by preprocess_function, and the raw "translation" column is
# dropped.
train_dataset = train_data.map(preprocess_function, remove_columns=["translation"])
valid_dataset = valid_data.map(preprocess_function, remove_columns=["translation"])

# Debug: inspect one formatted training example
print("Example from Training Dataset:")
print(train_dataset[1])

Example from Training Dataset:
{'text': 'Translate the following text from English to Indonesian:\n\nEnglish: I know how to use this thing!\nIndonesian: Aku tahu cara menggunakannya !'}


In [None]:
# Build the supervised fine-tuning trainer: base model + LoRA config, trained on
# the "text" column of the formatted datasets.
# NOTE(review): the recorded output warns that SFT-specific arguments passed
# here are deprecated in favour of SFTConfig — confirm which ones against the
# TRL 0.12 documentation before upgrading trl.
trainer = SFTTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    dataset_text_field="text",  # column produced by preprocess_function
    packing=True,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

[2024-12-18 01:41:45,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# Start training
# NOTE(review): in the recorded run this call prompted interactively for a
# wandb API key, so an unattended "Run All" may stall here unless reporting is
# configured beforehand.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
60,1.7255


TrainOutput(global_step=112, training_loss=1.5384480612618583, metrics={'train_runtime': 9934.1867, 'train_samples_per_second': 0.091, 'train_steps_per_second': 0.011, 'total_flos': 2.0518862390820864e+16, 'train_loss': 1.5384480612618583, 'epoch': 0.9933481152993349})

In [None]:
# Persist the fine-tuned weights and the tokenizer to Drive.
# The save goes through `trainer.model` instead of the bare `model` handle:
# because `peft_config` was passed to SFTTrainer, the trained model is the
# PEFT-wrapped one, and saving it writes the LoRA adapter (config + adapter
# weights). Calling save_pretrained on the original base-model object would
# dump a state dict that still contains the injected LoRA layers, which does
# not load back cleanly with AutoModelForCausalLM.from_pretrained.
# To reuse: load the base checkpoint, then attach this adapter with PEFT.
trainer.model.save_pretrained("/content/drive/MyDrive/fine_tuned_model/model_finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tuned_model/phi3_tokenizer")

('/content/drive/MyDrive/fine_tuned_model/phi3_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_model/phi3_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_model/phi3_tokenizer/tokenizer.model',
 '/content/drive/MyDrive/fine_tuned_model/phi3_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/fine_tuned_model/phi3_tokenizer/tokenizer.json')

In [None]:
# Save trained model via the trainer into its own sub-directory of the Drive
# output folder
trainer.save_model("/content/drive/MyDrive/fine_tuned_model/phi3-finetuned")

In [None]:
# Evaluate on the eval_dataset given to the trainer (the formatted validation
# split) and print the returned metrics dict (eval_loss, runtime, throughput).
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 1.4134738445281982, 'eval_runtime': 328.8858, 'eval_samples_per_second': 0.295, 'eval_steps_per_second': 0.149, 'epoch': 0.9933481152993349}


In [None]:
def generate_few_shot_prompt(english_sentences, indonesian_targets, new_sentence):
    """
    Build a few-shot English -> Indonesian translation prompt.

    The prompt lists every example pair first, then the instruction line, and
    ends with the new sentence followed by an open 'Indonesian:' slot for the
    model to complete.

    Parameters:
        english_sentences (list): Example English sentences.
        indonesian_targets (list): Indonesian translations, aligned by position
            with english_sentences.
        new_sentence (str): The English sentence to be translated.

    Returns:
        str: The assembled prompt.

    Raises:
        ValueError: If the two example lists differ in length.
    """
    # Guard: examples must come in aligned pairs
    if len(english_sentences) != len(indonesian_targets):
        raise ValueError("The number of English sentences and Indonesian targets must match.")

    # One "English/Indonesian" stanza per example, each closed by a blank line
    shots = "".join(
        f"English: {source}\nIndonesian: {target}\n\n"
        for source, target in zip(english_sentences, indonesian_targets)
    )

    # Instruction line, then the query with its answer slot left open
    instruction = "Translate the following English sentences to Indonesian:\n"
    query = f"\nEnglish: {new_sentence}\nIndonesian:"

    return shots + instruction + query

In [None]:
# Test Data Examples
# english_sentences =
#     "Can I help you?"
#     "As you can see, I'm jet-lagged and trying to adjust to the time-zone change."
#     "That's what the policeman just said."
#     "Yeah, we've been teaching him, but really, he's been teaching us."
#     "Someone wants to kill him."
#     "Hey Four, that's pretty good right?"
#     "You should've told me, boy."
#     "Your bed is in the corner, over there."
#     "Hey, kid, you've done really good so far, so what we're gonna do right now is we're gonna call your daddy."

# indonesian_targets =
#     "ada yang bisa kubantu?"
#     "Seperti yang kamu lihat , aku alami jet - lag dan mencoba untuk menyesuaikan diri dengan perubahan zona waktu ."
#     "Begitulah kata polisinya tadi."
#     "Ya, kami telah mengajari dia, tapi sungguh, ia telah mengajari kita."
#     "Seseorang ingin membunuhnya."
#     "Four, Tak begitu buruk, kan?"
#     "Kau harusnya katakan padaku, Nak."
#     "Ranjangmu disudut itu."
#     "Hei, nak, sejauh ini kau telah melakukannya dengan baik sekali, jadi apa yang akan kita lakukan sekarang adalah kita akan menelpon ayahmu."



# Training Data Examples
# One-shot setup: only the first pair is active. The commented-out pairs are
# the extra examples that the later few-shot evaluation cell enables.
english_sentences = [
    "I feel sorry for her.",
    # "I am always with you.",
    # "We didn't understand much of what he said, only that he was suffering.",
    # "Hey, good morning!",
    # "That's never happened.",
    # "It's just, the way I asked you, I kind of left out the most important part."
]

# Indonesian translations, aligned by position with english_sentences
indonesian_targets = [
    "Aku merasa kasihan padanya.",
    # "Saya akan selalu bersamamu.",
    # "Kami tidak mengerti apa yang ia katakan, selain bahwa dia menderita.",
    # "Hei, selamat pagi!",
    # "Itu tidak pernah terjadi.",
    # "Hanya saja, caraku menanyakannya padamu, aku agak merasa kelewatan bagian yang paling pentingnya."
]

# The new sentence to translate
new_sentence = "Yeah, we've been teaching him, but really, he's been teaching us."

# Generate the prompt and print it for inspection
prompt = generate_few_shot_prompt(english_sentences, indonesian_targets, new_sentence)
print(prompt)

In [None]:
# Tokenize the prompt and move the token ids to the model's device
inputs = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)

# Generate up to 60 new tokens for the single prompt
output = model.generate(
      input_ids=inputs,
      max_new_tokens=60,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )

# Check for an empty batch *before* indexing into it: previously output[0] was
# decoded first, so the guard could never protect the decode.
if len(output) == 0:
    response = ""
    print("Warning: Model output is empty!")
else:
    # The decoded text contains the prompt followed by the generated continuation
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print(response)

English: I feel sorry for her.
Indonesian: Aku merasa kasihan padanya.

Translate the following English sentences to Indonesian:

English: Yeah, we've been teaching him, but really, he's been teaching us.
Indonesian: Ya, kita telah mengajar dia, tapi sebenarnya dia telah mengajar kita.


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Ensure the nltk 'punkt_tab' tokenizer data is downloaded
# (presumably required by the nltk.word_tokenize calls in the BLEU helper below)
nltk.download('punkt_tab')

def calculate_bleu_single_reference(reference_sentence, candidate_sentence):
    """
    Score one predicted translation against a single reference with sentence-level BLEU.

    Both sentences are split into tokens with nltk.word_tokenize, and the score
    is computed with NLTK's `method1` smoothing.

    Parameters:
        reference_sentence (str): The correct reference translation.
        candidate_sentence (str): The predicted translation.

    Returns:
        float: The BLEU score.
    """
    # sentence_bleu expects a list of tokenized references, hence the wrapping list
    references = [nltk.word_tokenize(reference_sentence)]
    hypothesis = nltk.word_tokenize(candidate_sentence)

    # Smoothing is applied to handle short sentences
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Test Data Examples
# english_sentences =
#     "Can I help you?"
#     "As you can see, I'm jet-lagged and trying to adjust to the time-zone change."
#     "That's what the policeman just said."
#     "Yeah, we've been teaching him, but really, he's been teaching us."
#     "Someone wants to kill him."
#     "Hey Four, that's pretty good right?"
#     "You should've told me, boy."
#     "Your bed is in the corner, over there."
#     "Hey, kid, you've done really good so far, so what we're gonna do right now is we're gonna call your daddy."

# indonesian_targets =
#     "ada yang bisa kubantu?"
#     "Seperti yang kamu lihat , aku alami jet - lag dan mencoba untuk menyesuaikan diri dengan perubahan zona waktu ."
#     "Begitulah kata polisinya tadi."
#     "Ya, kami telah mengajari dia, tapi sungguh, ia telah mengajari kita."
#     "Seseorang ingin membunuhnya."
#     "Four, Tak begitu buruk, kan?"
#     "Kau harusnya katakan padaku, Nak."
#     "Ranjangmu di sudut itu."
#     "Hei, nak, sejauh ini kau telah melakukannya dengan baik sekali, jadi apa yang akan kita lakukan sekarang adalah kita akan menelpon ayahmu."

# Single reference translation for the sentence used in the prompt demo above;
# the candidate is the model translation copied from that cell's recorded output.
reference_sentence = "Ya, kami telah mengajari dia, tapi sungguh, ia telah mengajari kita."
candidate_sentence = "Ya, kita telah mengajar dia, tapi sebenarnya dia telah mengajar kita."

# Calculate BLEU score
bleu_score = calculate_bleu_single_reference(reference_sentence, candidate_sentence)
print(f"BLEU Score: {bleu_score:.4f}")

BLEU Score: 0.1030


In [None]:
import pandas as pd

In [None]:
def extract_translation(output):
    """
    Extract the translation of the query sentence from the model's decoded output.

    The decoded text is the whole prompt followed by the generated
    continuation. The answer is taken from the first line-leading
    'Indonesian:' label that follows the 'Translate ... Indonesian:'
    instruction line, and is cut at the end of its line so that anything the
    model keeps generating afterwards is ignored.

    Counting occurrences of 'Indonesian:' is not reliable: the instruction
    line itself ends with that text, and the number of example pairs before it
    varies between one-shot and few-shot prompts.

    Parameters:
        output (str): The raw output from the model.

    Returns:
        str: The extracted translation ('' when nothing was generated).
    """
    label = "Indonesian:"
    lines = output.splitlines()

    # Locate the first instruction line ("Translate ... Indonesian:"). Later
    # ones, if any, were hallucinated by the model after its answer.
    header_at = next(
        (
            idx
            for idx, line in enumerate(lines)
            if line.strip().startswith("Translate") and line.strip().endswith(label)
        ),
        None,
    )

    candidate = None
    if header_at is not None:
        # The answer slot is the first line after the instruction that starts
        # with the label; keep everything from that label onwards for now.
        tail = lines[header_at + 1:]
        for pos, line in enumerate(tail):
            if line.lstrip().startswith(label):
                candidate = "\n".join(tail[pos:]).split(label, 1)[1]
                break

    if candidate is None:
        # No recognisable instruction/answer slot: fall back to the original
        # occurrence-based lookup (third piece if present, otherwise the last).
        parts = output.split(label)
        candidate = parts[2] if len(parts) > 2 else parts[-1]

    # Keep only the first non-empty line of the answer
    candidate = candidate.strip()
    return candidate.splitlines()[0].strip() if candidate else ""

In [None]:
# Smoke check: an empty model output yields an empty translation, so this
# prints a blank line.
print(extract_translation(""))

In [None]:
def evaluate_model(model, tokenizer, test_data, output_path, english_sentences, indonesian_targets):
    """
    Translate every test example with a few-shot prompt, score it with BLEU,
    and write the per-example results to an Excel file.

    Parameters:
        model: Causal LM used for generation (its `device` and `generate` are used).
        tokenizer: Tokenizer matching the model.
        test_data: Iterable of records shaped like
            {"translation": {"en": ..., "id": ...}}.
        output_path (str): Destination .xlsx path for the results table.
        english_sentences (list): Example English sentences for the prompt.
        indonesian_targets (list): Matching Indonesian example translations.

    Returns:
        pandas.DataFrame: One row per test example with the input, reference,
        prediction and BLEU score.
    """
    rows = []

    for record in test_data:
        # Source sentence and its gold translation
        source_text = record["translation"]["en"]
        gold_text = record["translation"]["id"]

        # Build the prompt and turn it into token ids on the model's device
        few_shot_prompt = generate_few_shot_prompt(english_sentences, indonesian_targets, source_text)
        prompt_ids = tokenizer(few_shot_prompt, return_tensors="pt")["input_ids"].to(model.device)

        # Generate, decode the first returned sequence, and pull out the translation
        generated = model.generate(
            input_ids=prompt_ids,
            max_new_tokens=60,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
        )
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        predicted_text = extract_translation(decoded)

        # Score the prediction against the single reference
        score = calculate_bleu_single_reference(gold_text, predicted_text)

        rows.append({
            "Input (English)": source_text,
            "Reference (Indonesian)": gold_text,
            "Prediction (Indonesian)": predicted_text,
            "BLEU Score": score,
        })

    # Persist the table as an Excel sheet (without the index column) and report where
    results_df = pd.DataFrame(rows)
    results_df.to_excel(output_path, index=False)
    print(f"Results saved to {output_path}")

    return results_df

In [None]:
# Load test data: a seeded shuffle of the test split, keeping the first 100
# examples so the evaluation loop stays short.
test_data = dataset["test"].shuffle(seed=42).select(range(100))

# Show one raw record
test_data[0]

{'translation': {'en': "Pat's under a lot of pressure.",
  'id': 'Pat berada dalam banyak tekanan.'}}

In [None]:
# Path of the Excel file that receives the one-shot results
one_shot_path = "/content/drive/MyDrive/fine_tuned_model/one_shot_results.xlsx"

# Evaluate the model on the test data and save results.
# english_sentences / indonesian_targets are the single-pair lists defined in
# the earlier prompt-demo cell, so that cell must have been run first.
# NOTE(review): the recorded output shows predictions that still begin with
# the prompt's example text — verify the extraction step before trusting the
# BLEU column.
results_df_one_shot = evaluate_model(model, tokenizer, test_data, one_shot_path, english_sentences, indonesian_targets)

results_df_one_shot.head()

Results saved to /content/drive/MyDrive/fine_tuned_model/one_shot_results.xlsx


Unnamed: 0,Input (English),Reference (Indonesian),Prediction (Indonesian),BLEU Score
0,Pat's under a lot of pressure.,Pat berada dalam banyak tekanan.,English: I feel sorry for her.\nIndonesian: Ak...,0.023952
1,Why did You show yourself to only me?,Mengapa Kau menampakkan diri hanya padaku?,English: I feel sorry for her.\nIndonesian: Ak...,0.005919
2,"Gunner, last chance.","Gunner, kesempatan terakhir.",English: I feel sorry for her.\nIndonesian: Ak...,0.01207
3,- It's safe enough.,- Ini aman.,English: I feel sorry for her.\nIndonesian: Ak...,0.011408
4,This information is only available to Last.fm ...,Informasi ini hanya tersedia bagi pengguna Las...,English: I feel sorry for her.\nIndonesian: Ak...,0.089549


In [None]:
# Path of the Excel file that receives the few-shot results
few_shot_path = "/content/drive/MyDrive/fine_tuned_model/few_shot_results.xlsx"

# Training Data Examples
# Six example pairs for the few-shot prompt. This rebinds the english_sentences
# / indonesian_targets names used by the one-shot run, so re-running the
# one-shot cell after this one would use these six pairs instead.
english_sentences = [
    "I feel sorry for her.",
    "I am always with you.",
    "We didn't understand much of what he said, only that he was suffering.",
    "Hey, good morning!",
    "That's never happened.",
    "It's just, the way I asked you, I kind of left out the most important part."
]

# Indonesian translations, aligned by position with english_sentences
indonesian_targets = [
    "Aku merasa kasihan padanya.",
    "Saya akan selalu bersamamu.",
    "Kami tidak mengerti apa yang ia katakan, selain bahwa dia menderita.",
    "Hei, selamat pagi!",
    "Itu tidak pernah terjadi.",
    "Hanya saja, caraku menanyakannya padamu, aku agak merasa kelewatan bagian yang paling pentingnya."
]

# Run the same evaluation loop with the six-example prompt and save the results
results_df_few_shot = evaluate_model(model, tokenizer, test_data, few_shot_path, english_sentences, indonesian_targets)

results_df_few_shot.head()