## Prepare train, validation and test datasets

### Dataset 
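The raw data lives in `data/sms_phishing.csv`. Each row provides a `TEXT` column (the SMS message) and a `LABEL` column (its class), which are the two fields used below.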

## DSPy Examples

In [None]:
import polars as pl
from sms_classifier import Input, Output

# Load the raw SMS dataset with polars.
df = pl.read_csv("../../data/sms_phishing.csv")
from dspy import Example

# Wrap every CSV row in a DSPy Example: the TEXT column becomes the input,
# the lower-cased LABEL column becomes the expected output, and "input" is
# marked as the only input field.
examples = [
    Example(
        input=Input(text=row["TEXT"]),
        output=Output(label=row["LABEL"].lower()),
    ).with_inputs("input")
    for row in df.iter_rows(named=True)
]

In [None]:
# Number of examples built from the CSV rows.
len(examples)

### Split the dataset into train, val and test

In [None]:
# Fraction of the dataset assigned to each split.
TRAIN_PCT = 0.9666
VAL_PCT = 0.0167
TEST_PCT = 0.0167

# Slice boundaries, computed once. The split is positional (no shuffling):
# train comes first, then val, and test is whatever remains after val, so
# TEST_PCT is not used in the slicing itself.
train_end = int(len(examples) * TRAIN_PCT)
val_end = int(len(examples) * (TRAIN_PCT + VAL_PCT))

train = examples[:train_end]
val = examples[train_end:val_end]
test = examples[val_end:]

In [None]:
# Sizes of the train, validation and test splits, in that order.
len(train), len(val), len(test)

## Preparation of training dataset for LLM finetuning

In [None]:
import pandas as pd

# Flatten each training example into a plain record holding the SMS text
# (rendered as a string) and the value of its label.
train_records = [
    {
        "text": f"{example.input.text}",
        "label": example.output.label.value,
    }
    for example in train
]

# Tabular view of the training records. This rebinds `df`, which previously
# held the polars frame read from the CSV.
df = pd.DataFrame(data=train_records)

In [None]:
# Row count of the training DataFrame (one row per training record).
len(df)

In [None]:
# Write the training records to CSV without the DataFrame index column.
# NOTE(review): the output name is spelled "sms_phising" while the source file
# is "sms_phishing" -- confirm which spelling downstream readers expect.
df.to_csv("../../finetune/data/sms_phising.csv", index=False)

In [None]:
import json

# Save the same training records as a single JSON array, indented by 4 spaces.
with open("../../finetune/data/sms_phising.json", "w") as json_file:
    json.dump(train_records, fp=json_file, indent=4)

In [None]:
# Line templates for the supported fine-tuning file formats. Each template
# renders one JSON object per record; doubled braces are literal braces for
# str.format and the {named} fields are filled in per record.

# chat messages (system / user / assistant)
chat_template = """{{"messages": [ {{"role": "system", "content": "{system_prompt}" }},{{"role": "user", "content": "{user_question}"}},{{"role": "assistant", "content": "{model_answer}"}}] }}"""

# completions
completion_template = (
    """{{"prompt": "{user_question}", "completion": "{model_answer}"}}"""
)

# raw text only
text_template = """{{"text": "{user_question}" }}"""

# raw text wrapped in [INST] instruction markers
text_template_with_inst = (
    """{{"text": "<s>[INST]{user_question}[/INST]{model_answer}</s>" }}"""
)

# raw text wrapped in Llama-style header tokens
text_template_llama = """{{"text": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>{user_question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{model_answer}<|end_of_text|>"}}"""

# plain input / output pairs
naive_template = """{{"input": "{user_question}", "output": "{model_answer}" }}"""


def _json_escape(value):
    """Return ``value`` escaped for use between the quotes of a JSON string.

    ``json.dumps`` escapes double quotes, backslashes and control characters;
    the surrounding quotes it adds are dropped because the templates already
    supply them. Non-ASCII characters are kept as-is (the file is UTF-8).
    """
    return json.dumps(str(value), ensure_ascii=False)[1:-1]


# Not used in this cell; kept in case other cells reference it.
formatted_data = []
system_prompt = "You are a Cybersecurity assistant.Given an SMS text, predict whether it is ham, spam, or smishing.Output only the predicted label."

# Select the output format here.
template = chat_template

# Loop-invariant, so escape it once.
escaped_system_prompt = _json_escape(system_prompt)
skipped = 0
with open("../../finetune/data/train.jsonl", "w", encoding="utf-8") as new_file:
    for term in train_records:
        # Values are JSON-escaped before substitution so that quotes,
        # backslashes and newlines in the SMS text are preserved in the output
        # instead of being stripped or producing an invalid line.
        temp_data_0 = template.format(
            system_prompt=escaped_system_prompt,
            user_question=_json_escape(term["text"]),
            model_answer=_json_escape(term["label"]),
        )
        try:
            # Safety net: only lines that parse as JSON are written.
            json.loads(temp_data_0)
        except json.JSONDecodeError as e:
            # Report the dropped record instead of discarding it silently.
            skipped += 1
            print(f"Skipping record that did not render as valid JSON: {e}")
        else:
            new_file.write(temp_data_0)
            new_file.write("\n")

if skipped:
    print(f"{skipped} of {len(train_records)} records were skipped")

In [None]:
import json


def validate_jsonl(file_path):
    """Check that every line of a JSON Lines file parses as JSON.

    For each line that fails to parse, a message with its 1-based line number
    and the decoder error is printed. The total number of invalid lines is
    printed at the end. Nothing is returned.

    Args:
        file_path: Path of the file to check; it is read as UTF-8.
    """
    invalid_count = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        for line_num, raw_line in enumerate(handle, start=1):
            try:
                json.loads(raw_line)
            except json.JSONDecodeError as err:
                invalid_count += 1
                print(f"Invalid JSON on line {line_num}: {err}")
    print(invalid_count)


# Example usage: check the train.jsonl file written by the export cell above.
# Prints one message per invalid line, then the number of invalid lines.
file_path = "../../finetune/data/train.jsonl"
validate_jsonl(file_path)