In [None]:
%pip install transformers datasets accelerate - -quiet


In [None]:
import json

# Load your original JSON file
with open('/Users/kjevaji/Code/jupyter/output/output_2.json', 'r') as f:
    data = json.load(f)

# Open a new JSONL file for writing
with open('/Users/kjevaji/Code/jupyter/output/cobol_finetune_data_2.jsonl', 'w') as f_out:
    for entry in data:
        # Structure each line in JSONL format
        json_line = {
            "prompt": entry["code"],
            "completion": entry["comments"]
        }
        f_out.write(json.dumps(json_line) + '\n')


In [None]:
from datasets import load_dataset

# Load dataset from JSONL
dataset = load_dataset('json', data_files={
                       'train': './output/cobol_finetune_data.jsonl'})
print(dataset['train'][0])  # Print first training example to confirm structure


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Set a padding token
if tokenizer.pad_token is None:
    # Or use tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize function with padding


def tokenize_function(examples):
    inputs = tokenizer(
        examples['prompt'], padding="max_length", truncation=True, max_length=256)
    outputs = tokenizer(
        examples['completion'], padding="max_length", truncation=True, max_length=256)
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": outputs["input_ids"]
    }


# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [None]:
print("Padding token:", tokenizer.pad_token)
print("Special tokens:", tokenizer.special_tokens_map)


In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate based on steps
    save_steps=500,               # Save a checkpoint every 500 steps
    eval_steps=500,               # Evaluate every 500 steps
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    report_to="none",
    no_cuda=True  # Force training on CPU
)


In [None]:
# Split the dataset into train and validation
split_dataset = tokenized_datasets["train"].train_test_split(
    test_size=0.1)  # 10% for validation

# Assign the train and validation sets
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

print(train_dataset[0])
print(validation_dataset[0])

train_dataset = train_dataset.select(
    range(len(train_dataset) // 100))  # Use only half the dataset


In [None]:
import random
from datasets import Dataset

# Shuffle the training dataset to ensure randomness
train_dataset = train_dataset.shuffle(seed=42)

# Calculate the number of samples to use
sample_size = len(train_dataset) // 100

# Select a random sample from the dataset
small_train_dataset = train_dataset.select(range(sample_size))


In [None]:
# # Update Trainer with validation dataset
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=validation_dataset
# )

# trainer.train()


In [None]:
# model.save_pretrained("./fine_tuned_llama_cobol")
# tokenizer.save_pretrained("./fine_tuned_llama_cobol")


In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Load the fine-tuned model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_llama_cobol")
# model = AutoModelForCausalLM.from_pretrained("./fine_tuned_llama_cobol")

# # Define a test prompt (COBOL code snippet)
# test_prompt = """
# COBOL Code:
# IDENTIFICATION DIVISION.
# PROGRAM-ID. HELLO-WORLD.
# PROCEDURE DIVISION.
#     DISPLAY 'Hello, world!'.
#     STOP RUN.

# Please generate detailed documentation for the above COBOL code:
# """


# # Encode the input and generate output
# inputs = tokenizer(test_prompt, return_tensors="pt")
# outputs = model.generate(**inputs, max_length=150)

# # Decode and print the generated documentation
# generated_doc = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("Generated Documentation:")
# print(generated_doc)


In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Load the tokenizer and base model from Hugging Face
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)
# # Define a test prompt (COBOL code snippet)
# test_prompt = """
# Example COBOL Code 1:
# IDENTIFICATION DIVISION.
# PROGRAM-ID. SAMPLE1.
# PROCEDURE DIVISION.
#     DISPLAY 'Sample 1 Program'.
#     STOP RUN.

# Documentation:
# This COBOL program prints "Sample 1 Program" to the console. The IDENTIFICATION DIVISION identifies the program, and the PROCEDURE DIVISION contains the executable code.

# Example COBOL Code 2:
# IDENTIFICATION DIVISION.
# PROGRAM-ID. SAMPLE2.
# PROCEDURE DIVISION.
#     DISPLAY 'Sample 2 Program'.
#     STOP RUN.

# Documentation:
# This COBOL program prints "Sample 2 Program" to the console. The IDENTIFICATION DIVISION specifies the program name as SAMPLE2. The PROCEDURE DIVISION executes the DISPLAY command and then terminates the program.

# Now generate documentation for the following COBOL code:
# IDENTIFICATION DIVISION.
# PROGRAM-ID. HELLO-WORLD.
# PROCEDURE DIVISION.
#     DISPLAY 'Hello, world!'.
#     STOP RUN.
# """

# # Tokenize and generate output
# inputs = tokenizer(test_prompt, return_tensors="pt")
# outputs = model.generate(**inputs, max_length=350)

# # Decode and print the output
# generated_doc = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("Generated Documentation:")
# print(generated_doc)


In [None]:
%pip install transformers datasets accelerate


In [23]:
# Install necessary packages
# pip install transformers datasets accelerate

from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Initialize Accelerator to use CPU
accelerator = Accelerator(cpu=True)
model_name = "meta-llama/Llama-3.2-1B"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add or define the padding token if it doesn't exist
if tokenizer.pad_token is None:
    # Use eos_token as pad_token, or alternatively:
    tokenizer.pad_token = tokenizer.eos_token
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare model with accelerator (do not need to prepare datasets)
model = accelerator.prepare(model)

# Load your dataset from JSONL
dataset = load_dataset('json', data_files={
                       'train': '/Users/kjevaji/Code/jupyter/output/output_3_small.json'})

# Debug: Print dataset keys to understand the structure
print("Dataset keys:", dataset["train"].column_names)

# Define a function to tokenize the data
def tokenize_function(examples):
    # Combine type, description, code, and comments for the conversation context
    inputs_texts = [
        f"**Type:** {type}\n**Description:** {description}\n**Code:** {code}\n**Comments:** {comments}"
        for type, description, code, comments in zip(examples['type'], examples['description'], examples['code'], examples['comments'])
    ]
    
    # Tokenize input texts
    inputs = tokenizer(
        inputs_texts, padding="max_length", truncation=True, max_length=512
    )
    
    # Create labels that are the same as inputs for auto-regressive training
    labels = inputs["input_ids"]

    # Mask the input tokens corresponding to padding
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels
    ]

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels
    }

# Apply the tokenization function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the dataset into training and validation sets (90% train, 10% validation)
split_dataset = tokenized_datasets["train"].train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

import random
from datasets import Dataset

# Shuffle the training dataset to ensure randomness
train_dataset = train_dataset.shuffle(seed=42)

# Calculate the number of samples to use
sample_size = len(train_dataset)

# Select a random sample from the dataset
small_train_dataset = train_dataset.select(range(sample_size))

# Set up training arguments without mixed precision
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate based on steps
    save_steps=500,               # Save a checkpoint every 500 steps
    eval_steps=500,               # Evaluate every 500 steps
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Keep batch size small to avoid memory issues
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    report_to="none",
    use_cpu=True  # Force training on CPU
)

# Initialize the Trainer with model, arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=validation_dataset
)

# Start training
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_llama_cobol")
tokenizer.save_pretrained("./fine_tuned_llama_cobol")


Found cached dataset json (/Users/kjevaji/.cache/huggingface/datasets/json/default-1f8128608f5db8d4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset keys: ['code', 'comments', 'description', 'type']


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

  0%|          | 0/54 [00:00<?, ?it/s]

{'train_runtime': 1173.9183, 'train_samples_per_second': 0.046, 'train_steps_per_second': 0.046, 'train_loss': 0.8099677474410446, 'epoch': 3.0}


('./fine_tuned_llama_cobol/tokenizer_config.json',
 './fine_tuned_llama_cobol/special_tokens_map.json',
 './fine_tuned_llama_cobol/tokenizer.json')

In [24]:
test_prompt = """
COBOL Code:
IDENTIFICATION DIVISION.
PROGRAM-ID. HELLO-WORLD.
PROCEDURE DIVISION.
    DISPLAY 'Hello, world!'.
    STOP RUN.

Please generate detailed documentation for the above COBOL code:
"""


# Load the fine-tuned model and tokenizer
from transformers import pipeline

model = AutoModelForCausalLM.from_pretrained("./fine_tuned_llama_cobol")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_llama_cobol")

# Set up a generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Example COBOL code snippet to generate documentation for
code_snippet = """
       IDENTIFICATION DIVISION.
       PROGRAM-ID. HELLO-WORLD.
       PROCEDURE DIVISION.
           DISPLAY 'Hello, world!'.
           STOP RUN.
"""

# Generate documentation
generated_text = generator(code_snippet, max_length=150, num_return_sequences=1)
print("Generated Documentation:")
print(generated_text[0]['generated_text'])


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Documentation:

       IDENTIFICATION DIVISION.
       PROGRAM-ID. HELLO-WORLD.
       PROCEDURE DIVISION.
           DISPLAY 'Hello, world!'.
           STOP RUN.
       
       DATA DIVISION.
           H-WORD-PROGRAM-ID.: HELLO-WORLD.
           H-PROGRAM-DATA DIVISION.
               PROGRAM-NAME: HELLO-WORLD.
               PROGRAM-DESCRIPTION: This COBOL program is designed to display the
               message "Hello, world!" on the screen.
               H-OPER-DRG-CODE: 0000-0000.
               H-OPER-DSH-CODE: 0000-0000.
               H-OPER-DSH-DAYS-CODE: 0000-
