In [1]:
from datasets import load_dataset

data = load_dataset("squad")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
# export to JSONL
import json

with open("raw_data/squad.jsonl", "w") as f:
    for item in data["train"]:
        f.write(json.dumps(item) + "\n")

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch
from dotenv import load_dotenv

load_dotenv("../.env")

base_model = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.94s/it]


In [5]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [6]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

Device set to use mps


In [7]:
tokenizer.pad_token_id

128009

In [8]:
messages = [{"role": "user", "content": "Who is Vincent van Gogh?"}]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 06 May 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is Vincent van Gogh?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Vincent van Gogh (1853-1890) was a Dutch post-impressionist painter, widely considered one of the most influential and iconic artists in history. He is known for his bold, expressive, and emotionally charged paintings that explored the human condition, nature, and the artist's own inner world.

Born in Groot-Zundert, Netherlands, Van Gogh was the eldest of six children to a Protestant pastor. He was a quiet, introverted child who struggled with mental health issues, including depression, anxiety, and episodes of psychosis. These struggles would later influence his art and


In [9]:
from IPython.display import Markdown, display

messages = [
    {
        "role": "system",
        "content": "You are a skilled Python developer specializing in database management and optimization.",
    },
    {
        "role": "user",
        "content": "I'm experiencing a sorting issue in my database. Could you please provide Python code to help resolve this problem?",
    },
]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=512, do_sample=True)

display(
    Markdown(
        outputs[0]["generated_text"].split(
            "<|start_header_id|>assistant<|end_header_id|>"
        )[1]
    )
)



I'd be happy to help you resolve the sorting issue in your database. However, I need a bit more information about the issue you're experiencing.

Could you please provide the following details:

1. What type of database are you using (e.g., MySQL, PostgreSQL, SQLite, MongoDB)?
2. What programming language are you using to interact with the database (e.g., Python, SQL, ODBC)?
3. What is the structure of your database table(s) involved in the sorting issue?
4. What is the desired sorting behavior (e.g., ascending/descending, specific column(s) to sort on)?
5. Any error messages or stack traces you're encountering?

Once I have this information, I can provide you with more tailored guidance and Python code to help resolve the sorting issue.

If you're using a Python library to interact with your database, please let me know which library you're using (e.g., `sqlite3`, `psycopg2`, `mysql-connector-python`, etc.).

If you have a minimal, reproducible example of the issue, you can share it with me, and I'll do my best to assist you.

Here's an example of a basic sorting function using Python and the `sqlite3` library:
```python
import sqlite3

# Connect to the database
conn = sqlite3.connect('example.db')
cursor = conn.cursor()

# Create a sample table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS example (
        id INTEGER PRIMARY KEY,
        name TEXT,
        age INTEGER
    )
''')

# Insert some sample data
cursor.execute("INSERT INTO example (name, age) VALUES ('John', 25)")
cursor.execute("INSERT INTO example (name, age) VALUES ('Alice', 30)")
cursor.execute("INSERT INTO example (name, age) VALUES ('Bob', 20)")

# Commit the changes
conn.commit()

# Sort the data
cursor.execute("SELECT * FROM example ORDER BY age")

# Fetch and print the results
rows = cursor.fetchall()
for row in rows:
    print(row)

# Close the connection
conn.close()
```
This example demonstrates a basic sorting operation using the `ORDER BY` clause. However, without more information about your specific issue, I'll need to provide more tailored guidance.

In [10]:
data["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [11]:
def apply_chat_template(example):
    answer = example['answers']['text']
    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": answer}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return {"prompt": prompt}

templated_dataset = data.map(apply_chat_template)

In [12]:
templated_dataset["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]},
 'prompt': "<|begin_of_text|><|start_header_id|>sy

In [13]:
# tokenise the prompts
def tokenize_function(example):
    tokens = tokenizer(
        example['prompt'],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    # Set padding token labels to -100 to ignore them in loss calculation
    tokens['labels'] = [
        -100 if token == tokenizer.pad_token_id else token for token in tokens['input_ids']
    ]
    return tokens

tokenised_dataset = templated_dataset.map(tokenize_function)
tokenised_dataset = tokenised_dataset.remove_columns(["title", "context", "question", "answers", "prompt"])

Map: 100%|██████████| 10570/10570 [00:02<00:00, 3653.91 examples/s]


In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

model.train()
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps", # To evaluate during training
    eval_steps=40,
    logging_steps=40,
    save_steps=150,
    per_device_train_batch_size=2, # Adjust based on your hardware
    per_device_eval_batch_size=2,
    num_train_epochs=2, # How many times to loop through the dataset
    fp16=False, # Must be False for MacBooks
    report_to="none", # Here we can use something like tensorboard to see the training metrics
    log_level="info",
    learning_rate=1e-5, # Would avoid larger values here
    max_grad_norm=2 # Clipping the gradients is always a good idea
)
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_dataset["train"],
    eval_dataset=tokenised_dataset["validation"],
    tokenizer=tokenizer)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

  trainer = Trainer(
The following columns in the training set don't have a corresponding argument in `LlamaForCausalLM.forward` and have been ignored: id. If id are not expected by `LlamaForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 87,599
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 87,600
  Number of trainable parameters = 3,212,749,824


Step,Training Loss,Validation Loss
40,2.4894,


The following columns in the evaluation set don't have a corresponding argument in `LlamaForCausalLM.forward` and have been ignored: id. If id are not expected by `LlamaForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10570
  Batch size = 2
The following columns in the evaluation set don't have a corresponding argument in `LlamaForCausalLM.forward` and have been ignored: id. If id are not expected by `LlamaForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10570
  Batch size = 2


KeyboardInterrupt: 