In [1]:
import os
from sys import stderr

# if not os.path.exists(os.path.join(os.getcwd(), "./.env")):
#     raise FileNotFoundError(f'Environment variable file at {os.path.join(os.getcwd(), ".env")} not found')
# else:
#     %reload_ext dotenv
#     %dotenv

In [2]:
# Load every Haskell test program. `haskell_filenames[i]` is the file that
# `haskell_dataset[i]` was read from, so the two lists stay index-aligned.
HS_DATASET_DIR = './HS_Dataset/'

# os.listdir() returns entries in arbitrary order, so sort to keep the
# test-case numbering stable between runs. Directories (for example the
# checkpoint folders Jupyter may create) are skipped because open() would fail.
haskell_filenames = sorted(
    name for name in os.listdir(HS_DATASET_DIR)
    if os.path.isfile(os.path.join(HS_DATASET_DIR, name))
)

haskell_dataset = []
for name in haskell_filenames:
    with open(os.path.join(HS_DATASET_DIR, name), 'r', encoding='utf-8') as f:
        haskell_dataset.append(f.read())

In [3]:
# Load the Lisp test programs: a single file whose snippets are separated by
# lines containing `##`.
with open('dataset.lisp', 'r', encoding='utf-8') as f:
    text = f.read()

snippets = text.split('##\n')
# A leading, trailing or doubled separator yields blank chunks; drop them so
# no empty program is turned into a translation prompt.
lisp_dataset = [snip.strip() for snip in snippets if snip.strip()]

print(lisp_dataset[0])

(defun compress (x)
  (if (consp x) 
      (compr (car x) 1 (cdr x))
      x))

(defun compr (elt n lst)
  (if (null lst)
      (list (n-elts elt n))
      (let ((next (car lst)))
        (if (eql next elt)
            (compr elt (+ n 1) (cdr lst))
            (cons (n-elts elt n)
                  (compr next 1 (cdr lst)))))))

(defun n-elts (elt n)
  (if (> n 1)
      (list n elt)
      elt))
      
(print (compress '(1 1 1 0 1 0 0 0 0 1)))


In [4]:
# Few-shot prompt for Lisp -> C translation, filled in with
# `lisp_prompt_template.format(lisp_source)`.
#   * `{0}` is the slot for the Lisp program; `{{` / `}}` are str.format escapes
#     that render as literal braces in the C example.
#   * The string is raw so the `\n` inside the example's printf format string
#     reaches the model as the two characters backslash + n.
#   * The marker "C Output" appears exactly twice (example + final cue); the
#     generation cell relies on that when it locates the model's answer.
lisp_prompt_template = \
r'''You are an agent tasked with translating Lisp code to idiomatic C code.

Here is an example of what you will output, make sure to follow the format exactly:

Lisp Input:
(defun add (a b)
  (+ a b))
(print (add 1 2))
(print (add 5 4))
(print (add 6 0))

C Output:
#include <stdio.h>

int add(int a, int b){{
  return a + b;
}}
int main(){{
  printf("%d\n", add(1, 2));
  printf("%d\n", add(5, 4));
  printf("%d\n", add(6, 0));
}}

Keep in mind:
    - The function calls in the output must be valid and syntactically correct so that the user can directly execute them. This is very important, do NOT use pseudo-code or undefined functions.
    - All functions must be properly defined in the C output
    - The C output must be functionally equivalent to the Lisp input such that when both programs are executed they print EXACTLY the same output
    - Do not output anything else

Final Reminder: The C code MUST be immediately executable in C

Now try for yourself:
Lisp Input:
{0}

C Output:
'''

# Few-shot prompt for Haskell -> C++ translation, filled in with
# `haskell_prompt_template.format(haskell_source)`. Same conventions as the Lisp
# template: raw string (the Haskell example contains `\n`), `{0}` input slot,
# doubled braces for literal braces, and exactly two "C++ Output" markers.
haskell_prompt_template = \
r'''You are an agent tasked with translating Haskell code to idiomatic C++ code.

Here is an example of what you will output, make sure to follow the format exactly:

Haskell Input:
import Text.Printf

add :: Int -> Int -> Int
add x y = x + y

main = do
    printf "%d\n" $ add 1 2
    printf "%d\n" $ add 5 4
    printf "%d\n" $ add 6 0

C++ Output:
#include <iostream>

int add(int a, int b){{
  return a + b;
}}
int main(){{
  std::cout << add(1, 2) << std::endl;
  std::cout << add(5, 4) << std::endl;
  std::cout << add(6, 0) << std::endl;
}}

Keep in mind:
    - The function calls in the output must be valid and syntactically correct so that the user can directly execute them. This is very important, do NOT use pseudo-code or undefined functions.
    - All functions must be properly defined in the C++ output
    - The C++ output must be functionally equivalent to the Haskell input such that when both programs are executed they print EXACTLY the same output
    - Do not output anything else

Final Reminder: The C++ code MUST be immediately executable in C++

Now try for yourself:
Haskell Input:
{0}

C++ Output:
'''

In [5]:
from unsloth import FastLanguageModel
import torch
# --- Model loading options -------------------------------------------------
# Sequence length handed to the loader (free to change).
max_seq_length = 2048
# None asks the loader to auto-detect the dtype; float16 suits Tesla T4/V100,
# bfloat16 suits Ampere and newer.
dtype = None
# 4-bit quantisation lowers memory use; may be switched to False.
load_in_4bit = True

# Load the base model together with its tokenizer.
# Alternative checkpoint: "unsloth/Llama-3.2-1B-Instruct".
# For gated models (e.g. meta-llama/Llama-2-7b-hf) also pass token="hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-Coder-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

ğŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ğŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.1: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A2. Num GPUs = 1. Max memory: 14.642 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer

# Attach the "qwen2.5" chat template to the tokenizer loaded with the model.
tokenizer = get_chat_template(tokenizer, chat_template="qwen2.5")

# Streamer that omits the prompt when echoing generated text. The generation
# cells in this notebook pass `streamer = None`, so it is only needed when
# live output is wanted.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Translating Haskell: run the following cell

In [9]:
# Translate every Haskell program to C++ and save one .cpp file per input,
# named after the source file.
messages = [
    {"role": "user", "content": haskell_prompt_template.format(test)} for test in haskell_dataset
]

# Make sure the destination exists so the first write cannot fail.
os.makedirs('./HS_Output/', exist_ok = True)

for i, (filename, message) in enumerate(zip(haskell_filenames, messages)):
    print(f'test case {i}')

    inputs = tokenizer.apply_chat_template(
        [message],
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    output = model.generate(input_ids = inputs, streamer = None, max_new_tokens = 2048,
                    use_cache = True, temperature = 0.5, min_p = 0.1)

    # generate() returns the prompt tokens followed by the completion. Slice the
    # prompt off and skip special tokens so only the model's answer is saved,
    # without chat-template markers or prompt text.
    generated_tokens = output[0][inputs.shape[1]:]
    output_code = tokenizer.decode(generated_tokens, skip_special_tokens = True)

    # splitext drops whatever extension the input has instead of assuming that
    # it is exactly three characters long.
    stem = os.path.splitext(filename)[0]
    with open('./HS_Output/' + stem + '.cpp', 'w', encoding = 'utf-8') as f:
        f.write(output_code)

test case 0
test case 1
test case 2
test case 3
test case 4
test case 5
test case 6
test case 7
test case 8
test case 9
test case 10
test case 11
test case 12
test case 13
test case 14


# Translating Lisp: run the following cell

In [None]:
# Translate every Lisp snippet to C and write all translations, separated by a
# blank line, to a single file.
messages = [
    {"role": "user", "content": lisp_prompt_template.format(test)} for test in lisp_dataset
]

lisp_outputs = []

for i, message in enumerate(messages):
    print(f'test case {i}')

    inputs = tokenizer.apply_chat_template(
        [message],
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    output = model.generate(input_ids = inputs, streamer = None, max_new_tokens = 2048,
                    use_cache = True, temperature = 0.5, min_p = 0.1)

    # generate() returns the prompt tokens followed by the completion. Slice the
    # prompt off and skip special tokens so only the model's answer is kept,
    # without chat-template markers or prompt text.
    generated_tokens = output[0][inputs.shape[1]:]
    output_code = tokenizer.decode(generated_tokens, skip_special_tokens = True)
    lisp_outputs.append(output_code)


# Make sure the destination exists so the write cannot fail after a long run.
os.makedirs('./Lisp_Output/', exist_ok = True)
with open('./Lisp_Output/output.c', 'w', encoding = 'utf-8') as f:
    f.write('\n\n'.join(lisp_outputs))

test case 0
test case 1
test case 2
test case 3
test case 4
test case 5
test case 6
test case 7
test case 8
test case 9
test case 10
test case 11
test case 12
test case 13
test case 14
test case 15
test case 16
test case 17
test case 18
test case 19
test case 20
test case 21
test case 22
test case 23
test case 24
test case 25


In [13]:
import pandas as pd
import numpy as np

# Category labels; presumably column k of hs_data.csv corresponds to
# categories_from[k] / categories_to[k] -- TODO confirm against the CSV.
categories_from = ['Recursion', 'Recursive Lists', 'Immutable Data', 'Higher-order Functions', 'Pattern Matching']
categories_to = ['Iteration', 'Arrays', 'State Update', 'Callbacks/Function Pointers', 'Conditional Branching']

hs_data = pd.read_csv('./hs_data.csv', sep='\t', header=None)

assert hs_data.shape[1] == len(categories_from) == len(categories_to), \
    'hs_data.csv must have one column per category'

# Rows are consumed in consecutive pairs: the even row of each pair counts
# towards the applications, the odd row towards the scores. A trailing
# unpaired row is ignored.
n_pairs = hs_data.shape[0] // 2
from_rows = hs_data.iloc[0:2 * n_pairs:2]
to_rows = hs_data.iloc[1:2 * n_pairs:2]

# Column-wise totals. skipna=False makes a missing cell surface as NaN in the
# result instead of being silently treated as 0.
applications_by_category = from_rows.sum(skipna=False)
scores_by_category = to_rows.sum(skipna=False) / applications_by_category

# NOTE(review): a ratio above 1 means the summed scores exceed the summed
# applications for that column -- confirm this is expected for the scoring scheme.
print(scores_by_category)


0    0.555556
1    1.000000
2    0.600000
3    1.000000
4    1.083333
Name: 1, dtype: float64
