In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/maya-test.tsv
/kaggle/input/bribri-dev.tsv
/kaggle/input/nahuatl_omitlan-test.tsv
/kaggle/input/nahuatl_omitlan-dev.tsv
/kaggle/input/guarani-train.tsv
/kaggle/input/guarani-dev.tsv
/kaggle/input/guarani-test.tsv
/kaggle/input/maya-train.tsv
/kaggle/input/bribri-test.tsv
/kaggle/input/maya-dev.tsv
/kaggle/input/nahuatl_omitlan-train.tsv
/kaggle/input/bribri-train.tsv


In [2]:
%%capture
%pip install Dataset
%pip install sacrebleu
%pip install transformers
%pip install sentencepiece
%pip install datasets
%pip install huggingface_hub
%pip install bitsandbytes
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [3]:
# Set environment variable to help with memory allocation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HUGGINGFACE_TOKEN")

!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `basic task` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `basic task`


In [5]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer,SFTConfig
from trl import setup_chat_format
from transformers import (
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          TrainingArguments,
                          BitsAndBytesConfig,
                          pipeline,
                          Trainer,
                          DataCollatorWithPadding,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu, corpus_chrf

In [6]:
from accelerate import PartialState
device_map={"": PartialState().process_index}

In [7]:
# Load the data
train_df = pd.read_table('/kaggle/input/bribri-train.tsv')
dev_df = pd.read_table('/kaggle/input/bribri-dev.tsv')
test_df = pd.read_table('/kaggle/input/bribri-test.tsv')

In [8]:
X_train = train_df
X_eval = dev_df
X_test_sub = test_df

In [9]:
def generate_prompt(data_point):
    """
    Generate a structured training prompt for a given data point.
    """
    return f"""
Source: {data_point["Source"]}
Instruction: {data_point["Change"]}
Target: {data_point["Target"]}
""".strip()

def generate_test_prompt(data_point):
    return f"""
Source: {data_point["Source"]}
Instruction: {data_point["Change"]}
Provide only the Target sentence nothing else.
Target:""".strip()

In [10]:
# Generate prompts for training and evaluation dataa
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

In [11]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [12]:
# Create a new DataFrame for test prompts
test_data = pd.DataFrame({
    "Change": X_eval["Change"],
    "Source": X_eval["Source"]
})
# Generate prompts for test data
X_test = pd.DataFrame(test_data.apply(lambda row: generate_test_prompt(row), axis=1), columns=["text"])

In [13]:
# Set the device (GPU if available)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [14]:
device

device(type='cuda', index=0)

In [15]:
# Load the pre-trained model and tokenizer
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [16]:
# Move the model to the GPU
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [17]:
# # Define a custom predict function
# def predict(test, model, tokenizer):
#     y_pred = []
    
#     for i in tqdm(range(len(test))):
#         prompt = test.iloc[i]["text"]
#         inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
        
#         # Generate text using the model directly
#         outputs = model.generate(**inputs, max_length=100, num_beams=4, no_repeat_ngram_size=3).to(device)
        
#         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True).to(device)
#         transformed_sentence = generated_text.split("Target:")[-1].strip()

#         if transformed_sentence:  
#             y_pred.append(transformed_sentence)
#         else:
#             y_pred.append("ERROR")  # Handle empty outputs
    
#     return y_pred

In [18]:
from tqdm import tqdm
from transformers import pipeline

def clean_prediction(text):
    """
    Extracts the expected transformed sentence from the generated output.
    """
    text = text.strip()

    # Extract text after "Target:" marker
    if "Target:" in text:
        text = text.split("Target:")[-1].strip()

    # Take only the first line to remove unwanted repetitions
    text = text.split("\n")[0].strip()

    return text

def predict(test, model, tokenizer):
    """
    Generate predictions for the test dataset without using a dataset format.
    """
    y_pred = []
    
    # Define pipeline outside loop for efficiency
    pipe = pipeline(task="text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    max_new_tokens=20,  # Limit length to avoid extra output
                    temperature=0.1,  # Make output more deterministic
                    )  

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]  # Use already pre-generated test prompts
        result = pipe(prompt)
        
        generated_text = result[0]['generated_text']
        transformed_sentence = clean_prediction(generated_text)
        y_pred.append(transformed_sentence if transformed_sentence else "ERROR")  # Handle empty output

    return y_pred

In [19]:
# Evaluate the predictions
def evaluate(y_true, y_pred):
    bleu = corpus_bleu(y_pred, [y_true])
    print(f"BLEU score: {bleu.score:.2f}")

    chrf = corpus_chrf(y_pred, [y_true])
    print(f"chrF score: {chrf.score:.2f}")

    # Accuracy calculation
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    for i in range(min(5, len(y_true))):
        print(f"\nMain Prompt: {X_test.iloc[i]['text']}")
        print(f"Expected Sentence: {y_true[i]}")
        print(f"Prediction: {y_pred[i]}")

In [20]:
# Define LoRA configuration
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [21]:
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

In [22]:
# Set up training arguments
training_arguments = SFTConfig(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=1000,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
    logging_steps=500,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    eval_strategy="steps",
    eval_steps=50,  
)

In [23]:
# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=lora_config,
    args=training_arguments,
)

Converting train dataset to ChatML:   0%|          | 0/309 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/309 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/309 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/309 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/212 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/212 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/212 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/212 [00:00<?, ? examples/s]

In [24]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
50,No log,1.909394
100,No log,1.609859
150,No log,1.51272
200,No log,1.527582
250,No log,1.54526
300,No log,1.495715
350,No log,1.621079
400,No log,1.442568
450,No log,1.382495
500,1.070400,1.558152


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=770, training_loss=0.8645334417169744, metrics={'train_runtime': 1979.2582, 'train_samples_per_second': 0.781, 'train_steps_per_second': 0.389, 'total_flos': 3234437249261568.0, 'train_loss': 0.8645334417169744})

In [25]:
# Evaluate the model after fine-tuning
y_pred_after_fine_tune = predict(X_test, model, tokenizer)

Device set to use cuda:0
  5%|▍         | 10/212 [00:21<07:08,  2.12s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 212/212 [07:31<00:00,  2.13s/it]


In [26]:
# Evaluate the model
y_true = X_eval["Target"]

# Evaluate the model before fine-tuning
print("\nOriginal Model Evaluation After Fine Tuning:")
evaluate(y_true.tolist(), y_pred_after_fine_tune)


Original Model Evaluation After Fine Tuning:
BLEU score: 12.73
chrF score: 50.22
Accuracy: 0.01

Main Prompt: Source: Pûs kapë'wa̠
Instruction: ABSNUM:PL
Provide only the Target sentence nothing else.
Target:
Expected Sentence: Pûs kapë'ulur
Prediction: Pûs kapë'wa̠rule:PL, ABSNUM:PL, PERSON:

Main Prompt: Source: Pûs kapë'wa̠
Instruction: TYPE:NEG
Provide only the Target sentence nothing else.
Target:
Expected Sentence: Pûs kë̀ kapë̀ne̠wa̠
Prediction: Pûs kë̀ kapë'wa̠ne̠. (Note: k

Main Prompt: Source: Pûs kapë'wa̠
Instruction: TENSE:PRF_REC
Provide only the Target sentence nothing else.
Target:
Expected Sentence: Pûs kapówa̠
Prediction: Pûs kapë'waré̠wa̠!̠

Main Prompt: Source: Pûs kapë'wa̠
Instruction: TENSE:PRF_REC, ABSNUM:PL
Provide only the Target sentence nothing else.
Target:
Expected Sentence: Pûs kapóulur
Prediction: Pûs bák kapë'wa̠na̠, plö̀wa�

Main Prompt: Source: Pûs kapë'wa̠
Instruction: TENSE:IPFV_REC, ASPECT:IPFV
Provide only the Target sentence nothing else.
Target:


## Dev Submission

In [27]:
dev_pd = pd.DataFrame(y_pred_after_fine_tune, columns=['Values'])

In [28]:
dev_pd.to_csv('syntax_squad_bribri_dev_output.tsv', sep='\t', index=False, header=False)

In [29]:
from IPython.display import FileLink

FileLink("syntax_squad_bribri_dev_output.tsv")

## Test Submission

In [30]:
# Create a new DataFrame for test prompts
test_data_sub = pd.DataFrame({
    "Change": X_test_sub["Change"],
    "Source": X_test_sub["Source"]
})
# Generate prompts for test data
X_test_sub = pd.DataFrame(test_data_sub.apply(lambda row: generate_test_prompt(row), axis=1), columns=["text"])

In [31]:
# Evaluate the model before fine-tuning
y_pred_test = predict(X_test_sub, model, tokenizer)

Device set to use cuda:0
100%|██████████| 480/480 [17:03<00:00,  2.13s/it]


In [32]:
test_pd = pd.DataFrame(y_pred_test, columns=['Values'])

In [33]:
test_pd.to_csv('syntax_squad_bribri_test_output.tsv', sep='\t', index=False, header=False)

In [34]:
from IPython.display import FileLink

FileLink("syntax_squad_bribri_test_output.tsv")