In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/americannlp-task-2-dataset/maya-test.tsv
/kaggle/input/americannlp-task-2-dataset/bribri-dev.tsv
/kaggle/input/americannlp-task-2-dataset/nahuatl_omitlan-test.tsv
/kaggle/input/americannlp-task-2-dataset/nahuatl_omitlan-dev.tsv
/kaggle/input/americannlp-task-2-dataset/guarani-train.tsv
/kaggle/input/americannlp-task-2-dataset/guarani-dev.tsv
/kaggle/input/americannlp-task-2-dataset/guarani-test.tsv
/kaggle/input/americannlp-task-2-dataset/maya-train.tsv
/kaggle/input/americannlp-task-2-dataset/bribri-test.tsv
/kaggle/input/americannlp-task-2-dataset/maya-dev.tsv
/kaggle/input/americannlp-task-2-dataset/nahuatl_omitlan-train.tsv
/kaggle/input/americannlp-task-2-dataset/bribri-train.tsv


In [2]:
%%capture
%pip install Dataset
%pip install sacrebleu
%pip install transformers
%pip install sentencepiece
%pip install datasets
%pip install huggingface_hub
%pip install bitsandbytes
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [3]:
# Set environment variable to help with memory allocation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HUGGINGFACE_TOKEN")

!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `basic task` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `basic task`


In [5]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer,SFTConfig
from trl import setup_chat_format
from transformers import (
                          AutoTokenizer,
                          AutoModelForCausalLM, AutoModelForSeq2SeqLM,
                          TrainingArguments,
                          BitsAndBytesConfig,XGLMTokenizer, XGLMForCausalLM,
                          pipeline, 
                          Trainer,
                          DataCollatorWithPadding,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu, corpus_chrf

In [6]:
from accelerate import PartialState
device_map={"": PartialState().process_index}

## Nahuati Omitlan Dataset

In [7]:
# Load the data
train_df = pd.read_table('/kaggle/input/americannlp-task-2-dataset/nahuatl_omitlan-train.tsv')
dev_df = pd.read_table('/kaggle/input/americannlp-task-2-dataset/nahuatl_omitlan-dev.tsv')
test_df = pd.read_table('/kaggle/input/americannlp-task-2-dataset/nahuatl_omitlan-test.tsv')

In [8]:
X_train = train_df
X_eval = dev_df
X_test_sub = test_df

## Tags to Instruction

In [9]:
# new type 
def translate_tags_to_instruction(change):
    """
    Translate the tags in the 'Change' field to full-form instructions, combining multiple instructions with 'and.'
    """
    # Split the "Change" field into tags
    instruction_tags = change.split(", ")
    instructions = []

    # Define mapping dictionaries for each category
    type_map = {
        "NEG": "Make the sentence negative",
        "IMP": "Change the sentence to imperative mood",
        "AFF": "Make the sentence affirmative"
    }

    mood_map = {
        "DES": "Express desire or wish to perform the action",
        "EXH": "Change to exhortative mood (encouraging or urging action)",
        "ADVERS": "Express that the action was done despite difficulties",
        "POT": "Express potential or ability to perform the action",
        "COND": "Change to conditional mood",
        "OPT": "Change to optative mood (expressing wish or hope)",
        "INT": "Change to interrogative mood",
        "NA": "Remove mood marking",
        "IMP": "Change the sentence to imperative mood"
    }

    tense_map = {
        "IPFV_HAB": "Change to habitual imperfective aspect",
        "IPFV_REC": "Change to recent imperfective aspect",
        "IPFV_PROG": "Change to progressive imperfective aspect",
        "PRF_PROG": "Change to perfect progressive aspect",
        "PRF_REC": "Change to recent perfect tense",
        "FUT_POT": "Change to potential future tense",
        "FUT_CER": "Change to certain future tense",
        "PAS_PLU": "Change to pluperfect (past perfect) tense",
        "PRE_SIM": "Change to present simple tense",
        "PAS_SIM": "Change to past simple tense",
        "FUT_SIM": "Change to future simple tense"
    }

    aspect_map = {
        "IPFV": "Change to imperfective aspect",
        "PFV": "Change to perfective aspect",
        "INC": "Express the beginning or initiation of the action",
        "DUR": "Express duration of the action"
    }

    voice_map = {
        "MID": "Change to middle voice"
    }

    absnum_map = {
        "PL": "Make the absolutive argument plural",
        "NI": "Remove number marking from the absolutive argument"
    }

    person_map = {
        "1_PL_EXC": "Change subject to first person plural exclusive",
        "1_PL_INC": "Change subject to first person plural inclusive",
        "2_PL": "Change subject to second person plural",
        "3_PL": "Change subject to third person plural",
        "2_SI": "Change subject to second person singular",
        "3_SI": "Change subject to third person singular",
        "1_SI": "Change subject to first person singular",
        "1_PL": "Change subject to first person plural"
    }

    poss_map = {
        "1_SI": "Change possessor to first person singular",
        "1_PL": "Change possessor to first person plural",
        "2_SI": "Change possessor to second person singular",
        "2_PL": "Change possessor to second person plural",
        "3_SI": "Change possessor to third person singular",
        "3_PL": "Change possessor to third person plural"
    }

    obj_map = {
        "1_SI": "Change object to first person singular",
        "1_PL": "Change object to first person plural",
        "2_SI": "Change object to second person singular",
        "2_PL": "Change object to second person plural",
        "3_SI": "Change object to third person singular",
        "3_PL": "Change object to third person plural"
    }

    iobj_map = {
        "1_SI": "Change indirect object to first person singular",
        "1_PL": "Change indirect object to first person plural",
        "2_SI": "Change indirect object to second person singular",
        "2_PL": "Change indirect object to second person plural",
        "3_SI": "Change indirect object to third person singular",
        "3_PL": "Change indirect object to third person plural"
    }

    honorific_map = {
        "HON:1": "Use honorific form",
        "HON:NA": "Do not use honorific form"
    }

    purposive_map = {
        "PURPOSIVE:VEN": "Express purpose or goal",
        "PURPOSIVE:VET": "Express purpose or goal (alternative form)",
        "PURPOSIVE:NA": "No purpose or goal indicated"
    }

    transitivity_map = {
        "TRANSITIV:ITR": "Change to intransitive voice"
    }

    # Translate each tag into a full-form instruction
    for tag in instruction_tags:
        if ":" in tag:
            category, value = tag.split(":")
            if value != "NA":  # Ignore NA values
                if category == "TYPE":
                    instructions.append(type_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "MOOD":
                    instructions.append(mood_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "TENSE":
                    instructions.append(tense_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "ASPECT":
                    instructions.append(aspect_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "VOICE":
                    instructions.append(voice_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "ABSNUM":
                    instructions.append(absnum_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "PERSON[SUBJ]":
                    instructions.append(person_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "PERSON[POSS]":
                    instructions.append(poss_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "PERSON[OBJ]":
                    instructions.append(obj_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "PERSON[IOBJ]":
                    instructions.append(iobj_map.get(value.strip(), f"{category}: {value.strip()}"))
                elif category == "HON":
                    instructions.append(honorific_map.get(tag.strip(), f"{category}: {value.strip()}"))
                elif category == "PURPOSIVE":
                    instructions.append(purposive_map.get(tag.strip(), f"{category}: {value.strip()}"))
                elif category == "TRANSITIV":
                    instructions.append(transitivity_map.get(tag.strip(), f"{category}: {value.strip()}"))
                else:
                    instructions.append(f"{category}: {value.strip()}")
        else:
            # Handle tags without a colon
            if tag in type_map:
                instructions.append(type_map[tag])
            elif tag in mood_map:
                instructions.append(mood_map[tag])
            elif tag in tense_map:
                instructions.append(tense_map[tag])
            elif tag in aspect_map:
                instructions.append(aspect_map[tag])
            elif tag in voice_map:
                instructions.append(voice_map[tag])
            elif tag in absnum_map:
                instructions.append(absnum_map[tag])
            else:
                instructions.append(tag)

    # Combine all instructions with 'and'
    return ' and '.join(instructions)
    
# # Apply the function to the 'Change' column in both datasets
# train_df['Instructions'] = train_df['Change'].apply(translate_tags_to_instruction)
# dev_df['Instructions'] = dev_df['Change'].apply(translate_tags_to_instruction)

# # Count the number of unknown instructions in both datasets
# unknown_train = train_df['Instructions'].str.contains('Unknown').sum()
# unknown_dev = dev_df['Instructions'].str.contains('Unknown').sum()

# print(f"Number of unknown instructions in training dataset: {unknown_train}")
# print(f"Number of unknown instructions in development dataset: {unknown_dev}")

Number of unknown instructions in training dataset: 0
Number of unknown instructions in development dataset: 0


In [10]:
# # Tag Function Tester

# def translate_tags_to_instruction(change):
#     """
#     Translate the tags in the 'Change' field to full-form instructions, combining multiple instructions with 'and.'
#     """
#     # Split the "Change" field into tags
#     instruction_tags = change.split(", ")
#     instructions = []

#     # Define mapping dictionaries for each category
#     type_map = {
#         "NEG": "Make the sentence negative",
#         "IMP": "Change the sentence to imperative mood",
#         "AFF": "Make the sentence affirmative"
#     }

#     mode_map = {
#         "DES": "Express desire or wish to perform the action",
#         "EXH": "Change to exhortative mood (encouraging or urging action)",
#         "ADVERS": "Express that the action was done despite difficulties",
#         "POT": "Express potential or ability to perform the action",
#         "COND": "Change to conditional mood",
#         "OPT": "Change to optative mood",
#         "INT": "Change to interrogative mood"
#     }

#     tense_map = {
#         "IPFV_HAB": "Change to habitual imperfective aspect",
#         "IPFV_REC": "Change to recent imperfective aspect",
#         "IPFV_PROG": "Change to progressive imperfective aspect",
#         "PRF_PROG": "Change to perfect progressive aspect",
#         "PRF_REC": "Change to recent perfect tense",
#         "FUT_POT": "Change to potential future tense",
#         "FUT_CER": "Change to certain future tense",
#         "PAS_PLU": "Change to pluperfect (past perfect) tense",
#         "PRE_SIM": "Change to present simple tense",
#         "PAS_SIM": "Change to past simple tense",
#         "FUT_SIM": "Change to future simple tense"
#     }

#     aspect_map = {
#         "IPFV": "Change to imperfective aspect",
#         "PFV": "Change to perfective aspect",
#         "INC": "Express the beginning or initiation of the action",
#         "DUR": "Express duration of the action"
#     }

#     voice_map = {
#         "MID": "Change to middle voice"
#     }

#     absnum_map = {
#         "PL": "Make the absolutive argument plural",
#         "NI": "Remove number marking from the absolutive argument"
#     }

#     # Key difference: removed space after colon in keys
#     person_map = {
#     # Without spaces after colon
#     "PERSON[SUBJ]:1_PL_EXC": "Change subject to first person plural exclusive",
#     "PERSON[SUBJ]:1_PL_INC": "Change subject to first person plural inclusive",
#     "PERSON[SUBJ]:2_PL": "Change subject to second person plural",
#     "PERSON[SUBJ]:3_PL": "Change subject to third person plural",
#     "PERSON[SUBJ]:2_SI": "Change subject to second person singular",
#     "PERSON[SUBJ]:3_SI": "Change subject to third person singular",
#     "PERSON[SUBJ]:1_SI": "Change subject to first person singular",
#     "PERSON[POSS]:1_SI": "Change possessor to first person singular",
#     "PERSON[POSS]:1_PL": "Change possessor to first person plural",
#     "PERSON[POSS]:2_SI": "Change possessor to second person singular",
#     "PERSON[POSS]:2_PL": "Change possessor to second person plural",
#     "PERSON[POSS]:3_SI": "Change possessor to third person singular",
#     "PERSON[POSS]:3_PL": "Change possessor to third person plural",
#     "PERSON[OBJ]:1_SI": "Change object to first person singular",
#     "PERSON[OBJ]:1_PL": "Change object to first person plural",
#     "PERSON[OBJ]:2_SI": "Change object to second person singular",
#     "PERSON[OBJ]:2_PL": "Change object to second person plural",
#     "PERSON[OBJ]:3_SI": "Change object to third person singular",
#     "PERSON[OBJ]:3_PL": "Change object to third person plural",
#     "PERSON[IOBJ]:1_SI": "Change indirect object to first person singular",
#     "PERSON[IOBJ]:1_PL": "Change indirect object to first person plural",
#     "PERSON[IOBJ]:2_SI": "Change indirect object to second person singular",
#     "PERSON[IOBJ]:2_PL": "Change indirect object to second person plural",
#     "PERSON[IOBJ]:3_SI": "Change indirect object to third person singular",
#     "PERSON[IOBJ]:3_PL": "Change indirect object to third person plural",
    
#     # With spaces after colon
#     "PERSON[SUBJ]: 1_PL_EXC": "Change subject to first person plural exclusive",
#     "PERSON[SUBJ]: 1_PL_INC": "Change subject to first person plural inclusive",
#     "PERSON[SUBJ]: 2_PL": "Change subject to second person plural",
#     "PERSON[SUBJ]: 3_PL": "Change subject to third person plural",
#     "PERSON[SUBJ]: 2_SI": "Change subject to second person singular",
#     "PERSON[SUBJ]: 3_SI": "Change subject to third person singular",
#     "PERSON[SUBJ]: 1_SI": "Change subject to first person singular",
#     "PERSON[POSS]: 1_SI": "Change possessor to first person singular",
#     "PERSON[POSS]: 1_PL": "Change possessor to first person plural",
#     "PERSON[POSS]: 2_SI": "Change possessor to second person singular",
#     "PERSON[POSS]: 2_PL": "Change possessor to second person plural",
#     "PERSON[POSS]: 3_SI": "Change possessor to third person singular",
#     "PERSON[POSS]: 3_PL": "Change possessor to third person plural",
#     "PERSON[OBJ]: 1_SI": "Change object to first person singular",
#     "PERSON[OBJ]: 1_PL": "Change object to first person plural",
#     "PERSON[OBJ]: 2_SI": "Change object to second person singular",
#     "PERSON[OBJ]: 2_PL": "Change object to second person plural",
#     "PERSON[OBJ]: 3_SI": "Change object to third person singular",
#     "PERSON[OBJ]: 3_PL": "Change object to third person plural",
#     "PERSON[IOBJ]: 1_SI": "Change indirect object to first person singular",
#     "PERSON[IOBJ]: 1_PL": "Change indirect object to first person plural",
#     "PERSON[IOBJ]: 2_SI": "Change indirect object to second person singular",
#     "PERSON[IOBJ]: 2_PL": "Change indirect object to second person plural",
#     "PERSON[IOBJ]: 3_SI": "Change indirect object to third person singular",
#     "PERSON[IOBJ]: 3_PL": "Change indirect object to third person plural"
#     }

#     mood_map = {
#     "IMP": "Change the sentence to imperative mood",
#     "COND": "Change to conditional mood",
#     "OPT": "Change to optative mood (expressing wish or hope)",
#     "INT": "Change to interrogative mood",
#     "NA": "Remove mood marking",
#     "DES": "Express desire or wish to perform the action",
#     "EXH": "Change to exhortative mood (encouraging or urging action)",
#     "ADVERS": "Express that the action was done despite difficulties",
#     "POT": "Express potential or ability to perform the action"
#     }
#     honorific_map = {
#         "HON:1": "Use honorific form",
#         "HON:NA": "Do not use honorific form"
#     }

#     purposive_map = {
#         "PURPOSIVE:VEN": "Express purpose or goal",
#         "PURPOSIVE:VET": "Express purpose or goal (alternative form)",
#         "PURPOSIVE:NA": "No purpose or goal indicated"
#     }

#     transitivity_map = {
#         "TRANSITIV:ITR": "Change to intransitive voice"
#     }

#     # Translate each tag into a full-form instruction
#     for tag in instruction_tags:
#         if ":" in tag:
#             category, value = tag.split(":")
#             if value != "NA":  # Ignore NA values
#                 if category == "TYPE":
#                     instructions.append(type_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 elif category == "MOOD":
#                     instructions.append(mode_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 elif category == "TENSE":
#                     instructions.append(tense_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 elif category == "ASPECT":
#                     instructions.append(aspect_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 elif category == "VOICE":
#                     instructions.append(voice_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 elif category == "ABSNUM":
#                     instructions.append(absnum_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 elif category.startswith("PERSON"):
#                     # Use the full tag as the key
#                     instructions.append(person_map.get(tag.strip(), f"{category}: {value.strip()}"))
#                 elif category == "HON":
#                     instructions.append(honorific_map.get(tag.strip(), f"{category}: {value.strip()}"))
#                 elif category == "PURPOSIVE":
#                     instructions.append(purposive_map.get(tag.strip(), f"{category}: {value.strip()}"))
#                 elif category == "TRANSITIV":
#                     instructions.append(transitivity_map.get(tag.strip(), f"{category}: {value.strip()}"))
#                 elif category == "MOOD":
#                     instructions.append(mood_map.get(value.strip(), f"{category}: {value.strip()}"))
#                 else:
#                     instructions.append(f"{category}: {value.strip()}")
#         else:
#             # Handle tags without a colon
#             if tag in type_map:
#                 instructions.append(type_map[tag])
#             elif tag in mode_map:
#                 instructions.append(mode_map[tag])
#             elif tag in tense_map:
#                 instructions.append(tense_map[tag])
#             elif tag in aspect_map:
#                 instructions.append(aspect_map[tag])
#             elif tag in voice_map:
#                 instructions.append(voice_map[tag])
#             elif tag in absnum_map:
#                 instructions.append(absnum_map[tag])
#             else:
#                 instructions.append(tag)

#     # Combine all instructions with 'and'
#     return ' and '.join(instructions)


# # Example usage
# instruction = "PERSON[SUBJ]:1_PL, TENSE:FUT_SIM, TYPE:AFF"
# print(translate_tags_to_instruction(instruction))

In [11]:
dev_df

Unnamed: 0,ID,Source,Change,Target,Instructions
0,NahuatlOmitlan0001,oualah nouan,"HON:1, MOOD:IMP, PERSON[POSS]:1_PL, PERSON[SUB...",xonuiqueh touan,Use honorific form and Change the sentence to ...
1,NahuatlOmitlan0002,amo mococoua,"ASPECT:PFV, TENSE:PAS_SIM",amo omococoh,Change to perfective aspect and Change to past...
2,NahuatlOmitlan0003,ualasqueh nouan,"ASPECT:IPFV, PERSON[POSS]:3_PL, TENSE:PAS_SIM,...",amo oualayah inuan,Change to imperfective aspect and Change posse...
3,NahuatlOmitlan0004,ualasqueh nouan,"HON:1, MOOD:IMP, PERSON[POSS]:1_PL, PERSON[SUB...",xonuiqueh touan,Use honorific form and Change the sentence to ...
4,NahuatlOmitlan0005,neh amo nitomauac,"PERSON[SUBJ]:1_PL, TENSE:FUT_SIM, TYPE:AFF",tehuan tiisqueh titomauaqueh,Change subject to first person plural and Chan...
...,...,...,...,...,...
171,NahuatlOmitlan0172,oniualah iuan,"PERSON[POSS]:1_SI, PERSON[SUBJ]:3_PL, TENSE:FU...",ualasqueh nouan,Change possessor to first person singular and ...
172,NahuatlOmitlan0173,tiuitzeh mouan,"MOOD:IMP, PERSON[POSS]:1_SI, PERSON[SUBJ]:2_SI...",amo xiuiqui nouan,Change the sentence to imperative mood and Cha...
173,NahuatlOmitlan0174,oniualaya iuan,"PERSON[POSS]:1_SI, PERSON[SUBJ]:3_PL, TENSE:FU...",ualasqueh nouan,Change possessor to first person singular and ...
174,NahuatlOmitlan0175,niinconeuan,"PERSON[POSS]:3_SI, PERSON[SUBJ]:3_SI",yeh iconeu,Change possessor to third person singular and ...


## Prompt Function

In [12]:
# Define functions for generating prompts
def generate_prompt(row):
    """
    Generate a structured training prompt for a given data point.
    """
    # instruction = translate_tags_to_instruction(row["Change"])
    return (
        f"Language: Nahuatl Omitlan\n"
        f"Task: Transform the Source sentence into the Target sentence based on the given instruction.\n\n"
        f"Instruction: {row['Change']}\n"
        f"Source: {row['Source']}\n"
        f"Target: {row['Target']}"
    )

def generate_test_prompt(row):
    """
    Generate a structured test prompt for a given data point.
    """
    # instruction = translate_tags_to_instruction(row["Change"])
    return (
        f"Language: Nahuatl Omitlan\n"
        f"Task: Transform the Source sentence into the Target sentence based on the given instruction.\n\n"
        f"Instruction: {row['Change']}\n"
        f"Source: {row['Source']}\n"
        f"Provide only the transformed Target sentence."
    )

In [13]:
# def generate_prompt(data_point):
#     """
#     Generate a structured training prompt for a given data point.
#     """
#     return (
#         f"Language: Nahuatl Omitlan\n"
#         f"Task: Rewrite and transform the Source sentence into the Target sentence based on the provided instruction.\n\n"
#         f"Instruction: {data_point['Change']}\n"
#         f"Source: {data_point['Source']}\n"
#         f"Target: {data_point['Target']}"
#     )

# def generate_test_prompt(data_point):
#     """
#     Generate a structured test prompt for a given data point.
#     """
#     return (
#         f"Language: Nahuatl Omitlan\n"
#         f"Task: Rewrite and transform the Source sentence into the Target sentence based on the provided instruction.\n\n"
#         f"Instruction: {data_point['Change']}\n"
#         f"Source: {data_point['Source']}\n"
#         f"Provide only the rewritten Target sentence."
#     )


In [14]:
# # zero shot prompting type 1
# def generate_prompt(data_point):
#     """
#     Generate a structured training prompt for a given data point.
#     """
#     return f"""Language: Nahuatl Omitlan,rewrite and change the Source sentence to the Target sentence according to the given instruction.
# Instruction: {data_point["Change"]}
# Source: {data_point["Source"]}
# Target: {data_point["Target"]}
# """.strip()

# def generate_test_prompt(data_point):
#     return f"""
# Language: Nahuatl Omitlan,rewrite and change the Source sentence to the Target sentence according to the given instruction.
# Instruction: {data_point["Change"]}
# Source: {data_point["Source"]}
# Provide only the Target sentence nothing else.
# Target:""".strip()

In [15]:
# # Generate prompts for training and evaluation dataa
# X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
# X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

In [16]:
# Generate prompts for training and evaluation data
X_train["text"] = X_train.apply(generate_prompt, axis=1)
X_eval["text"] = X_eval.apply(generate_prompt, axis=1)

In [17]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [18]:
# Create a new DataFrame for test prompts
test_data = pd.DataFrame({
    "Change": X_eval["Change"],
    "Source": X_eval["Source"]
})
# Generate prompts for test data
X_test = pd.DataFrame(test_data.apply(lambda row: generate_test_prompt(row), axis=1), columns=["text"])

In [19]:
# Set the device (GPU if available)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
device

device(type='cuda', index=0)

## Model Load (Llama 3.2-3B Instruct)

In [21]:
# Load the pre-trained model and tokenizer
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [22]:
# Move the model to the GPU
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

In [23]:
# # Define a custom predict function
# def predict(test, model, tokenizer):
#     y_pred = []
    
#     for i in tqdm(range(len(test))):
#         prompt = test.iloc[i]["text"]
#         inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
        
#         # Generate text using the model directly
#         outputs = model.generate(**inputs, max_length=100, num_beams=4, no_repeat_ngram_size=3).to(device)
        
#         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True).to(device)
#         transformed_sentence = generated_text.split("Target:")[-1].strip()

#         if transformed_sentence:  
#             y_pred.append(transformed_sentence)
#         else:
#             y_pred.append("ERROR")  # Handle empty outputs
    
#     return y_pred

## Predict and Post Processing

In [24]:
from tqdm import tqdm
from transformers import pipeline
import string
def clean_prediction(text):
    """
    Extracts the expected transformed sentence from the generated output.
    Stops processing as soon as a punctuation mark is encountered.
    """
    text = text.strip()

    # Extract text after "Target:" marker
    if "Target:" in text:
        text = text.split("Target:")[-1].strip()

    # Take only the first line to remove unwanted repetitions
    text = text.split("\n")[0].strip()

    # Stop at the first punctuation (e.g., period, comma, etc.)
    for punctuation in string.punctuation:
        if punctuation in text:
            text = text.split(punctuation)[0].strip()
            break

    return text


def predict(test, model, tokenizer):
    """
    Generate predictions for the test dataset without using a dataset format.
    """
    y_pred = []
    
    # Define pipeline outside loop for efficiency
    pipe = pipeline(task="text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    max_new_tokens=20,  # Limit length to avoid extra output
                    temperature=0.1,  # Make output more deterministic
                    )  

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]  # Use already pre-generated test prompts
        result = pipe(prompt)
        
        generated_text = result[0]['generated_text']
        transformed_sentence = clean_prediction(generated_text)
        y_pred.append(transformed_sentence if transformed_sentence else "ERROR")  # Handle empty output

    return y_pred

In [25]:
# Evaluate the model before fine-tuning
y_pred_before_fine_tune = predict(X_test, model, tokenizer)

Device set to use cuda:0
  6%|▌         | 10/176 [00:08<02:21,  1.17it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 176/176 [02:23<00:00,  1.23it/s]


## Evaluate Function

In [26]:
# Evaluate the predictions
def evaluate(y_true, y_pred):
    bleu = corpus_bleu(y_pred, [y_true])
    print(f"BLEU score: {bleu.score:.2f}")

    chrf = corpus_chrf(y_pred, [y_true])
    print(f"chrF score: {chrf.score:.2f}")

    for i in range(min(5, len(y_true))):
        print(f"\nMain Prompt: {X_test.iloc[i]['text']}")
        print(f"Expected Sentence: {y_true[i]}")
        print(f"Prediction: {y_pred[i]}")

In [27]:
# Evaluate the model
y_true = X_eval["Target"]

# Evaluate the model before fine-tuning
print("\nOriginal Model Evaluation Before Fine Tuning:")
evaluate(y_true.tolist(), y_pred_before_fine_tune)


Original Model Evaluation Before Fine Tuning:
BLEU score: 0.65
chrF score: 13.08

Main Prompt: Language: Nahuatl Omitlan
Task: Transform the Source sentence into the Target sentence based on the given instruction.

Instruction: HON:1, MOOD:IMP, PERSON[POSS]:1_PL, PERSON[SUBJ]:2_PL, TENSE:PRE_SIM
Source: oualah nouan
Provide only the transformed Target sentence.
Expected Sentence: xonuiqueh touan
Prediction: oualah nouan

Main Prompt: Language: Nahuatl Omitlan
Task: Transform the Source sentence into the Target sentence based on the given instruction.

Instruction: ASPECT:PFV, TENSE:PAS_SIM
Source: amo mococoua
Provide only the transformed Target sentence.
Expected Sentence: amo omococoh
Prediction: Language

Main Prompt: Language: Nahuatl Omitlan
Task: Transform the Source sentence into the Target sentence based on the given instruction.

Instruction: ASPECT:IPFV, PERSON[POSS]:3_PL, TENSE:PAS_SIM, TYPE:NEG
Source: ualasqueh nouan
Provide only the transformed Target sentence.
Expected 

In [44]:
# Define LoRA configuration
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [45]:
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

In [49]:
# Set up training arguments
training_arguments = SFTConfig(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=1000,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
    logging_steps=500,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    eval_strategy="steps",
    eval_steps=50,  
)

In [50]:
# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
)

  trainer = SFTTrainer(


ValueError: Target module Dropout(p=0.1, inplace=False) is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `torch.nn.Conv3d`, `transformers.pytorch_utils.Conv1D`.

In [32]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
50,No log,1.42538
100,No log,1.242674
150,No log,1.161679
200,No log,1.178118
250,No log,1.19761
300,No log,1.20959
350,No log,1.224546
400,No log,1.240438
450,No log,1.290197
500,1.160300,1.208996


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=975, training_loss=0.8182956167367789, metrics={'train_runtime': 1142.7373, 'train_samples_per_second': 1.711, 'train_steps_per_second': 0.853, 'total_flos': 2573071726061568.0, 'train_loss': 0.8182956167367789})

In [33]:
# Evaluate the model after fine-tuning
y_pred_after_fine_tune = predict(X_test, model, tokenizer)

Device set to use cuda:0
100%|██████████| 176/176 [05:11<00:00,  1.77s/it]


In [34]:
# Evaluate the model
y_true = X_eval["Target"]

# Evaluate the model before fine-tuning
print("\nOriginal Model Evaluation After Fine Tuning:")
evaluate(y_true.tolist(), y_pred_after_fine_tune)


Original Model Evaluation After Fine Tuning:
BLEU score: 0.25
chrF score: 18.71

Main Prompt: Language: Nahuatl Omitlan
Task: Transform the Source sentence into the Target sentence based on the given instruction.

Instruction: HON:1, MOOD:IMP, PERSON[POSS]:1_PL, PERSON[SUBJ]:2_PL, TENSE:PRE_SIM
Source: oualah nouan
Provide only the transformed Target sentence.
Expected Sentence: xonuiqueh touan
Prediction: xicchihchiua motlaxcalmeh ousideh qu

Main Prompt: Language: Nahuatl Omitlan
Task: Transform the Source sentence into the Target sentence based on the given instruction.

Instruction: ASPECT:PFV, TENSE:PAS_SIM
Source: amo mococoua
Provide only the transformed Target sentence.
Expected Sentence: amo omococoh
Prediction: amo occochtito conetl

Main Prompt: Language: Nahuatl Omitlan
Task: Transform the Source sentence into the Target sentence based on the given instruction.

Instruction: ASPECT:IPFV, PERSON[POSS]:3_PL, TENSE:PAS_SIM, TYPE:NEG
Source: ualasqueh nouan
Provide only the tr

In [35]:
y_true

0                   xonuiqueh touan
1                      amo omococoh
2                amo oualayah inuan
3                   xonuiqueh touan
4      tehuan tiisqueh titomauaqueh
                   ...             
171                 ualasqueh nouan
172               amo xiuiqui nouan
173                 ualasqueh nouan
174                      yeh iconeu
175                      yeh iconeu
Name: Target, Length: 176, dtype: object

## Dev Submission

In [36]:
dev_pd = pd.DataFrame(y_pred_after_fine_tune, columns=['Values'])

In [37]:
dev_pd.to_csv('syntax_squad_nahuatl_omitlan_dev_output.tsv', sep='\t', index=False, header=False)

In [38]:
from IPython.display import FileLink

FileLink("syntax_squad_nahuatl_omitlan_dev_output.tsv")

## Test Submission

In [39]:
# Create a new DataFrame for test prompts
test_data_sub = pd.DataFrame({
    "Change": X_test_sub["Change"],
    "Source": X_test_sub["Source"]
})
# Generate prompts for test data
X_test_sub = pd.DataFrame(test_data_sub.apply(lambda row: generate_test_prompt(row), axis=1), columns=["text"])

In [40]:
# Evaluate the model before fine-tuning
y_pred_test = predict(X_test_sub, model, tokenizer)

Device set to use cuda:0
100%|██████████| 120/120 [03:30<00:00,  1.76s/it]


In [41]:
test_pd = pd.DataFrame(y_pred_test, columns=['Values'])

In [42]:
test_pd.to_csv('syntax_squad_nahuatl_omitlan_test_output.tsv', sep='\t', index=False, header=False)

In [43]:
from IPython.display import FileLink

FileLink("syntax_squad_nahuatl_omitlan_test_output.tsv")