## <b>Preparing the environment and installing libraries:</b>

In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
%pip install transformers>=4.46.0

zsh:1: 4.46.0 not found
Note: you may need to restart the kernel to use updated packages.


%pip install fsspec==2024.9.0
%pip install --upgrade openai

%pip uninstall bitsandbytes
%pip install bitsandbytes[gpu]

%pip install -qqq bitsandbytes torch transformers peft accelerate datasets loralib einops trl

In [3]:
# Install into the kernel's environment through the %pip magic.
# NOTE(review): neither package is version-pinned, so a re-run may pull
# different releases.
%pip install bitsandbytes
%pip install --upgrade openai



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer

from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


## <b>Loading the model and the tokenizer:</b>

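In this section, we load the Qwen model and its tokenizer. A BitsAndBytes quantization config is prepared, but quantization is currently disabled (the `quantization_config` argument is commented out in the loading call).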

In [5]:
# Identifier of the base checkpoint passed to `from_pretrained` below.
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
# MODEL_NAME = "unsloth/Llama-3.2-1B" # Try Llama if you want

# Quantization settings with library defaults.
# NOTE(review): currently unused -- the `quantization_config` argument below
# is commented out, so the model is loaded without quantization.
bnb_config = BitsAndBytesConfig() # fill the gap

# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    # quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [6]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, counting every element and,
    separately, the elements of parameters whose ``requires_grad`` is true,
    then prints both totals and the trainable share as a percentage.
    """
    # (element count, trainable flag) for each parameter, gathered in one pass.
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

## <b>Configuring LoRA:</b>

In [7]:
# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias training.
# No target_modules are passed, so the choice of adapted layers is left to PEFT.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    # target_modules=["query_key_value"],
    bias="none",
    task_type= "CAUSAL_LM"  # causal language-modelling task
)
# NOTE(review): `model` is rebound to the PEFT-wrapped model, so re-running this
# cell wraps an already-wrapped model; reload the base model before re-running.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1081344 || all params: 495114112 || trainable%: 0.21840298504761665


## <b>Test the model before finetuning:</b>

In [8]:
# Hand-written one-shot prompt: system instructions, one worked example, then
# the question whose answer the model must complete. The string ends with a
# newline after "Here is the edge list:".
prompt = """<SYSTEM>We represent an edge list like this (1, 0), (0, 3), (1, 2), (2, 3) where a pair (1, 0) indicates an edge between node 1 and node 0 without weight. The pair (i, j) and the pair (j, i) are the same so use only one of them and the edge (i, i) is not allowed. when you answer the question answer with the edge list only and you should provide the same number of edges and respect the number of nodes and don't give explanation and don't print this , ... , print all the edges and respect the edge format. </SYSTEM>
Example: <USER>Create an edge list for an unweighted graph with 10 nodes, 5 edges, 1.0 average degree, 0 number of triangles, 0.0 global clustering coefficient, 1 maximum k-core and 4 communities?</USER> <ASSISTANT> Here is the edge list: (0, 1), (5, 6), (3, 9), (7, 8), (2, 4)</ASSISTANT>
Question:
<USER>Create an edge list for an unweighted graph with 12 nodes, 37 edges, 6.17 average degree, 38 number of triangles, 0.55 global clustering coefficient, 5 maximum k-core and 2 communities?</USER> <ASSISTANT> Here is the edge list:
"""
print(prompt)

# Sampling settings, written onto the model's own generation_config object
# (this mutates model.generation_config in place; it is not a copy).
generation_config = model.generation_config
generation_config.do_sample = True
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# The end-of-sequence token doubles as the padding token during generation.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

<SYSTEM>We represent an edge list like this (1, 0), (0, 3), (1, 2), (2, 3) where a pair (1, 0) indicates an edge between node 1 and node 0 without weight. The pair (i, j) and the pair (j, i) are the same so use only one of them and the edge (i, i) is not allowed. when you answer the question answer with the edge list only and you should provide the same number of edges and respect the number of nodes and don't give explanation and don't print this , ... , print all the edges and respect the edge format. </SYSTEM>
Example: <USER>Create an edge list for an unweighted graph with 10 nodes, 5 edges, 1.0 average degree, 0 number of triangles, 0.0 global clustering coefficient, 1 maximum k-core and 4 communities?</USER> <ASSISTANT> Here is the edge list: (0, 1), (5, 6), (3, 9), (7, 8), (2, 4)</ASSISTANT>
Question:
<USER>Create an edge list for an unweighted graph with 12 nodes, 37 edges, 6.17 average degree, 38 number of triangles, 0.55 global clustering coefficient, 5 maximum k-core and 2 co

In [9]:
# Sample fragments of an answer, used to measure how many tokens each costs:
# the answer preamble, one comma-terminated edge, and the final edge followed
# by the closing tag.
text1 = "<ASSISTANT>Assistant: Here is the edge list: "
text2 = "(9, 10), "
text3 = "(9, 10)</ASSISTANT>"

# Tokenize each fragment.
tokens1, tokens2, tokens3 = (
    tokenizer.encode(fragment) for fragment in (text1, text2, text3)
)

# Token count of each fragment; num_tokens2 and num_tokens3 are reused by
# get_answer to size its generation budget.
num_tokens1, num_tokens2, num_tokens3 = len(tokens1), len(tokens2), len(tokens3)

# Show the token counts. The first figure is 36 comma-terminated edges plus one
# closing edge -- presumably the budget for the 37-edge question in `prompt`.
print(
    *(
        f"Nombre de tokens : {count}"
        for count in (36 * num_tokens2 + num_tokens3, num_tokens2, num_tokens3)
    ),
    sep="\n",
)

Nombre de tokens : 299
Nombre de tokens : 8
Nombre de tokens : 11


In [10]:
%%time

encoding = tokenizer(prompt, return_tensors="pt").to("mps")
for _ in range(3):
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
            max_new_tokens=300,
            num_return_sequences=1,
        )
    num_tokens_generated = outputs.shape[1]
    print(f"Input length was: {encoding.input_ids.shape[1]}")
    print(f"Output length is: {num_tokens_generated}")
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Input length was: 316
Output length is: 616
<SYSTEM>We represent an edge list like this (1, 0), (0, 3), (1, 2), (2, 3) where a pair (1, 0) indicates an edge between node 1 and node 0 without weight. The pair (i, j) and the pair (j, i) are the same so use only one of them and the edge (i, i) is not allowed. when you answer the question answer with the edge list only and you should provide the same number of edges and respect the number of nodes and don't give explanation and don't print this , ... , print all the edges and respect the edge format. </SYSTEM>
Example: <USER>Create an edge list for an unweighted graph with 10 nodes, 5 edges, 1.0 average degree, 0 number of triangles, 0.0 global clustering coefficient, 1 maximum k-core and 4 communities?</USER> <ASSISTANT> Here is the edge list: (0, 1), (5, 6), (3, 9), (7, 8), (2, 4)</ASSISTANT>
Question:
<USER>Create an edge list for an unweighted graph with 12 nodes, 37 edges, 6.17 average degree, 38 number of triangles, 0.55 global clust

## <b>Loading the question/answer dataset from a local CSV with HuggingFace `datasets`:</b>

In [11]:
# NOTE(review): absolute, user-specific path -- this cell only runs on a machine
# where this exact file exists; consider a configurable data directory.
data_path = "/Users/emili/Documents/Cours_2024_2025/ALTEGRAD/projet/ALTEGRAD_PROJECT/data/dataframe/test.csv"

# Load the CSV with the `datasets` CSV loader; later cells read its rows from
# the "train" split even though this is the test file.
data_test = load_dataset('csv', data_files=data_path)


# Worked example inserted into every prompt as the one-shot demonstration.
# "feats" is indexed by generate_prompt in this order: nodes, edges, average
# degree, triangles, global clustering coefficient, max k-core, communities.
example = {}
example["edges"] = '(0, 1), (5, 6), (3, 9), (7, 8), (2, 4)'
example["feats"] = [10, 5, 1.0, 0, 0.0, 1, 4]

In [12]:
import ast
import networkx as nx
import re
import csv
from tqdm import tqdm

# Device the tokenized prompt is moved to in get_answer.
# NOTE(review): hard-coded to the MPS backend; other machines need a different value.
device = "mps"
def generate_prompt(data_point, example):
    """
    Build the one-shot inference prompt for a single graph description.

    Args:
        data_point: mapping whose "feats" entry is the text of a Python list
            (parsed with ``ast.literal_eval``) holding seven graph statistics.
        example: mapping with "edges" (edge-list text) and "feats" (indexable
            with the same seven statistics) used as the worked demonstration.

    Returns:
        The prompt text, ending right after "Here is the edge list:" so the
        model continues with the edges. Lines after the first carry the four
        leading spaces of the source indentation.
    """
    demo_edges = example["edges"]
    demo_feats = example["feats"]

    # The statistics are stored as text, e.g. "[12, 37, 6.17, ...]".
    target_feats = ast.literal_eval(data_point["feats"])
    return f"""<SYSTEM>We represent an edge list like this (1, 0), (0, 3), (1, 2), (2, 3) where a pair (1, 0) indicates an edge between node 1 and node 0 without weight. The pair (i, j) and the pair (j, i) are the same so use only one of them and the edge (i, i) is not allowed. when you answer the question answer with the edge list only and you should provide the same number of edges and respect the number of nodes and don't give explanation and don't print this , ... , print all the edges and respect the edge format. </SYSTEM>
    Example: <USER>Create an edge list for an unweighted graph with {demo_feats[0]} nodes, {demo_feats[1]} edges, {demo_feats[2]} average degree, {demo_feats[3]} number of triangles, {demo_feats[4]} global clustering coefficient, {demo_feats[5]} maximum k-core and {demo_feats[6]} communities?</USER> <ASSISTANT> Here is the edge list: {demo_edges}</ASSISTANT>
    Question:
    <USER>Create an edge list for an unweighted graph with {target_feats[0]} nodes, {target_feats[1]} edges, {target_feats[2]} average degree, {target_feats[3]} number of triangles, {target_feats[4]} global clustering coefficient, {target_feats[5]} maximum k-core and {target_feats[6]} communities?</USER> <ASSISTANT> Here is the edge list:"""

def get_answer(data_point):
    """
    Generate the model's completion for one graph description.

    Builds the prompt with ``generate_prompt`` and the module-level ``example``,
    caps generation at (number of requested edges) * ``num_tokens2`` +
    ``num_tokens3`` new tokens, and returns the first generated sequence
    decoded with special tokens skipped.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``generation_config``.
    """
    prompt_text = generate_prompt(data_point, example)

    # Index 1 of the feature list is the requested number of edges.
    edge_count = ast.literal_eval(data_point["feats"])[1]
    token_budget = edge_count * num_tokens2 + num_tokens3

    model_inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    with torch.inference_mode():
        generated = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            generation_config=generation_config,
            max_new_tokens=token_budget,
            num_return_sequences=1,
        )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def find_numbers_in_string(s):
    """
    Return every run of consecutive digits in ``s`` as a list of strings.

    Only unsigned digit runs are matched: a decimal such as "1.5" yields
    ["1", "5"] and a minus sign is ignored.
    """
    return re.findall(r'\d+', s)  # \d+ matches one or more digits


def find_edges(prompt, answer):
    """
    Extract the edges the model produced after the prompt.

    Args:
        prompt: the prompt text; the first ``len(prompt)`` characters of
            ``answer`` are skipped so edges from the worked example are ignored.
        answer: the decoded model output (prompt followed by the completion).

    Returns:
        A list of ``(int, int)`` tuples, one per parenthesised group that
        contains exactly two digit runs, in order of appearance. Parsing stops
        at the first ``</ASSISTANT>`` after the prompt, or at the end of
        ``answer`` when the tag is absent.
    """
    start = len(prompt)
    # str.find returns -1 when the closing tag is missing; only then fall back
    # to the end of the text. (Taking max(end, len(answer)) always selected the
    # end of the text, so anything generated after the tag was parsed as well.)
    end = answer.find("</ASSISTANT>", start)
    if end == -1:
        end = len(answer)

    edges = []
    # Match balanced "(...)" groups with no nested parentheses, so a stray ")"
    # cannot be paired with an opening bracket that was already closed.
    for group in re.finditer(r"\(([^()]*)\)", answer[start:end]):
        numbers = find_numbers_in_string(group.group(1))
        if len(numbers) == 2:
            edges.append((int(numbers[0]), int(numbers[1])))
    return edges

# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
path_output_file = "/Users/emili/Documents/Cours_2024_2025/ALTEGRAD/projet/ALTEGRAD_PROJECT/data/dataframe/output_test.csv"

# A single handle, closed by the `with` block (the extra bare open() on the same
# path was never used or closed). newline='' is what the csv module expects for
# files it writes to.
with open(path_output_file, mode='w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header
    writer.writerow(["graph_id", "edge_list"])
    for data_point in tqdm(data_test["train"]):
        # Rebuilt here only so find_edges knows how many leading characters of
        # the decoded answer belong to the prompt.
        prompt = generate_prompt(data_point, example)
        graph_id = data_point["graph_ids"]
        answer = get_answer(data_point)
        print(answer)
        # The edge list is written as the text form of a Python list of tuples.
        writer.writerow([graph_id, find_edges(prompt, answer)])

  0%|          | 0/1000 [00:24<?, ?it/s]


KeyboardInterrupt: 

## <b>Preparing the finetuning data:</b>

In [13]:
def generate_prompt(data_point, example):
    """
    Build one full training text: instructions, worked example, question and
    the ground-truth answer.

    NOTE(review): this redefines the ``generate_prompt`` declared earlier in the
    notebook. Once this cell has run, ``get_answer`` resolves to this version,
    which appends the true edge list and uses an "Assistant: " prefix that the
    earlier inference prompt does not have.

    Args:
        data_point: mapping with "edges" (edge-list text) and "feats", either a
            sequence of seven graph statistics or the text of such a list, as
            read from a CSV column.
        example: mapping with "edges" and "feats" for the worked demonstration.

    Returns:
        The training text, ending with a newline and four spaces after the
        closing ``</ASSISTANT>`` tag.
    """
    edges_example = example["edges"]
    feats_example = example["feats"]
    edges_data_point = data_point["edges"]
    feats_data_point = data_point["feats"]
    # A CSV column holds the statistics as text ("[12, 37, ...]"); indexing that
    # text would yield single characters ("[", "1", ...), so parse it first.
    if isinstance(feats_data_point, str):
        feats_data_point = ast.literal_eval(feats_data_point)
    prompt = f"""<SYSTEM>We represent an edge list like this (1, 0), (0, 3), (1, 2), (2, 3) where a pair (1, 0) indicates an edge between node 1 and node 0 without weight. The pair (i, j) and the pair (j, i) are the same so use only one of them and the edge (i, i) is not allowed. when you answer the question answer with the edge list only and you should provide the same number of edges and respect the number of nodes and don't give explanation and don't print this , ... , print all the edges and respect the edge format. </SYSTEM>
    Example: <USER>Create an edge list for an unweighted graph with {feats_example[0]} nodes, {feats_example[1]} edges, {feats_example[2]} average degree, {feats_example[3]} number of triangles, {feats_example[4]} global clustering coefficient, {feats_example[5]} maximum k-core and {feats_example[6]} communities?</USER> <ASSISTANT>Assistant: Here is the edge list: {edges_example}</ASSISTANT>
    Question:
    <USER>Create an edge list for an unweighted graph with {feats_data_point[0]} nodes, {feats_data_point[1]} edges, {feats_data_point[2]} average degree, {feats_data_point[3]} number of triangles, {feats_data_point[4]} global clustering coefficient, {feats_data_point[5]} maximum k-core and {feats_data_point[6]} communities?</USER> <ASSISTANT>Assistant: Here is the edge list: {edges_data_point}</ASSISTANT>
    """
    return prompt

def generate_and_tokenize_prompt(data_point):
    """
    Turn one dataset row into tokenizer output for training.

    Builds the training text with ``generate_prompt`` and the module-level
    ``example``, then tokenizes it with padding and truncation enabled.
    """
    return tokenizer(
        generate_prompt(data_point, example),
        padding=True,
        truncation=True,
    )

# NOTE(review): absolute, user-specific path; also rebinds `data_path`, which an
# earlier cell pointed at the test CSV.
data_path = "/Users/emili/Documents/Cours_2024_2025/ALTEGRAD/projet/ALTEGRAD_PROJECT/data/dataframe/train.csv"

# Load the CSV, shuffle its "train" split with a fixed seed, and tokenize every
# row with generate_and_tokenize_prompt.
data_train = load_dataset('csv', data_files=data_path)
data_train = data_train["train"].shuffle(seed=42).map(generate_and_tokenize_prompt)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

## <b>Finetuning:</b>

In [14]:
# Directory passed to the trainer as `output_dir`.
OUTPUT_DIR = "experiments"

# Release cached allocator memory on whichever accelerator is actually present;
# the CUDA call alone frees nothing when the model lives on the MPS backend.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available():
    torch.mps.empty_cache()

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch of 4 sequences per optimizer step
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=False,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=200,   # takes precedence over num_train_epochs; try more steps if you can
    # The paged 8-bit optimizer comes from bitsandbytes and needs its CUDA build;
    # fall back to the standard PyTorch AdamW everywhere else.
    optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data_train,
    args=training_args,
    # mlm=False selects causal (next-token) language modelling rather than masked LM.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Turn off the generation cache for the duration of training.
model.config.use_cache = False
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 19.51 GB, other allocations: 578.39 MB, max allowed: 20.40 GB). Tried to allocate 559.59 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).