In [1]:
import os
import sys

curPath = os.path.abspath('/data/LPJ/ICML25/GraphCoder/GraphGPT')
rootPath = os.path.split(os.path.split(curPath)[0])[0]
print(curPath, rootPath)
sys.path.append(rootPath)
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Sequence

import torch
import transformers
from graphgpt.model import *
from graphgpt.model.GraphLlama_pl import GraphGPT_pl
from transformers import BertModel, BertTokenizer


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    version: Optional[str] = field(default="v0")
    freeze_backbone: bool = field(default=False)
    tune_graph_mlp_adapter: bool = field(default=False)
    graph_tower: Optional[str] = field(default=None)
    graph_select_layer: Optional[int] = field(default=-1)   # default to the last layer
    pretrain_graph_mlp_adapter: Optional[str] = field(default=None)
    use_graph_start_end: bool = field(default=False)
    model_save_name: Optional[str] = field(default="model_{epoch}-{step}")


@dataclass
class DataArguments:
    data_path: str = field(default=None,
                           metadata={"help": "Path to the training data."})
    lazy_preprocess: bool = False
    is_graph: bool = False
    sep_graph_conv_front: bool = False
    graph_token_len: int = 0
    graph_content: Optional[str] = field(default=None)
    graph_data_path: Optional[str] = field(default=None)
    image_aspect_ratio: str = 'square'
    bert_path: Optional[str] = field(default='/data/LPJ/bert/bert-L12-H128-uncased')
    bert_gpu: Optional[int] = field(default=3)
    bert_tokenizer_max_length: Optional[int] = field(default=15)


@dataclass
class TrainingArguments:
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    remove_unused_columns: bool = field(default=False)
    freeze_graph_mlp_adapter: bool = field(default=False)
    force_fsdp: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    strategy: str = field(
        default='fsdp'
    )
    real_batch_size: int = field(default=1)

    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    disable_tqdm: bool =False

    gpus: Optional[str] = field(default='0,1')
    resume: Optional[str] = field(default=None)

    adam_epsilon: float = field(default=1e-8)
    warmup_steps:int = field(default=1000)
    num_workers:int = field(default=16)

    bf16: bool = field(default=False) 
    fp16: bool = field(default=False) 
    output_dir: str = field(default='./checkpoints/graphchat-gt-graphmatch-7b') 
    num_train_epochs: int = field(default=3)
    per_device_train_batch_size: int = field(default=1)
    per_device_eval_batch_size: int = field(default=1)
    gradient_accumulation_steps: int = field(default=1)
    evaluation_strategy: str = field(default='no')
    save_strategy: str = field(default='steps')
    save_steps: int = field(default=2400)
    save_total_limit: int = field(default=1)
    learning_rate: float = field(default=2e-5)
    weight_decay: float = field(default=0.)
    warmup_ratio: float = field(default=0.03)
    lr_scheduler_type: str = field(default='cosine')
    logging_steps: int = field(default=1)
    tf32: bool = field(default=True) 
    gradient_checkpointing: bool = field(default=True)
    report_to: str = field(default='wandb')
    freeze_gnn: bool = field(default=False)
    

/data/LPJ/ICML25/GraphCoder/GraphGPT /data/LPJ/ICML25


  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/yiyao_yang/anaconda3/envs/graphgpt/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/yiyao_yang/anaconda3/envs/graphgpt/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/yiyao_yang/anaconda3/envs/graphgpt/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


In [2]:
model_args = ModelArguments(
    model_name_or_path="/data/LPJ/Llama-2-7b-chat-hf",
    version="v1",
    graph_tower='clip_gt_arxiv',
    tune_graph_mlp_adapter=True,
    graph_select_layer=-2,
    use_graph_start_end=True,
    freeze_backbone=True,
)
data_args = DataArguments(
    data_path='/data/LPJ/ICML25/graphgpt_dataset/gpt_dataset_construction/rtlcoder_gpt4_v1/import_for_graphgpt/conversations.json',
    graph_data_path='/data/LPJ/ICML25/graphgpt_dataset/gpt_dataset_construction/rtlcoder_gpt4_v1/import_for_graphgpt/graph.jsonl',
    lazy_preprocess=True,
    bert_path='/data/LPJ/bert/bert-L12-H128-uncased',
    bert_gpu=3,
    bert_tokenizer_max_length=15,

)
train_args = TrainingArguments(
    bf16=False,
    output_dir='/data/LPJ/ICML25/GraphGPT/checkpoints/pretraining_stage/v0',
    num_train_epochs=3,
    gpus='0,1,2',
    # lora_enable=True,
    freeze_gnn=True,
    lora_enable=False,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path, padding_side="right")

In [3]:
# model = GraphGPT_pl(train_args, model_args, data_args, tokenizer)

In [4]:
model = GraphGPT_pl(training_args=train_args, model_args=model_args, data_args=data_args, tokenizer=tokenizer, device='cpu')


You are using a model of type llama to instantiate a model of type GraphLlama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [01:07<00:00, 33.77s/it]


loading graph pre train model
CLIP(
  (gnn): graph_transformer(
    (gtLayers): Sequential(
      (0): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (1): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (2): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
    )
    (W_P): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (inverW_P): Linear(in_features=128, out_features=128, bias=True)
  )
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu):

In [5]:
ckpt = torch.load('/data/LPJ/ICML25/all_checkpoints/pretrain_gnn_with_tuning_projector_without_lora_unified_lr/v0_balanced_lr_8e3_2epoch_batch2/balanced_lr_8e3_2epoch_batch2.ckpt', map_location='cpu')


In [6]:
test = model.model

In [7]:
model.load_state_dict(ckpt['state_dict'])

<All keys matched successfully>

In [7]:
ckpt['state_dict']

OrderedDict([('model.model.embed_tokens.weight',
              tensor([[ 1.1921e-06, -1.7881e-06, -4.2915e-06,  ...,  8.3447e-07,
                       -6.4373e-06,  8.9407e-07],
                      [ 1.8387e-03, -3.8147e-03,  9.6130e-04,  ..., -9.0332e-03,
                        2.6550e-03, -3.7537e-03],
                      [ 1.0193e-02,  9.7656e-03, -5.2795e-03,  ...,  2.9297e-03,
                        4.0817e-04, -5.0964e-03],
                      ...,
                      [ 6.1512e-05, -4.4678e-02, -1.8555e-02,  ..., -5.3711e-03,
                       -5.4626e-03,  1.4282e-02],
                      [-4.8828e-03, -9.1553e-03, -2.7588e-02,  ..., -2.6703e-03,
                        5.4016e-03, -2.0385e-05],
                      [ 1.6556e-03, -2.1515e-03, -4.9316e-02,  ...,  1.0742e-02,
                        1.6357e-02, -2.3193e-02]], dtype=torch.bfloat16)),
             ('model.model.layers.0.self_attn.q_proj.weight',
              tensor([[-0.0060, -0.0146, -0.0021,  

In [11]:
print(model.model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default.weight.shape)
model.model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default.weight

torch.Size([64, 4096])


Parameter containing:
tensor([[ 0.0021, -0.0146, -0.0131,  ...,  0.0009,  0.0067,  0.0068],
        [-0.0099, -0.0052, -0.0150,  ...,  0.0017, -0.0013, -0.0135],
        [ 0.0150,  0.0039, -0.0144,  ...,  0.0133,  0.0093,  0.0130],
        ...,
        [-0.0093,  0.0047, -0.0042,  ..., -0.0111,  0.0081, -0.0135],
        [-0.0043, -0.0063,  0.0148,  ...,  0.0151,  0.0016,  0.0107],
        [-0.0070,  0.0057, -0.0030,  ...,  0.0100, -0.0084,  0.0148]])

In [12]:
print(ckpt['state_dict']['model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight'].shape)
ckpt['state_dict']['model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight']

torch.Size([64, 4096])


tensor([[ 7.8735e-03,  1.1658e-02, -1.5564e-02,  ..., -7.5684e-03,
         -1.3428e-02, -9.1553e-03],
        [ 8.0566e-03,  1.3855e-02,  2.3041e-03,  ...,  6.4087e-04,
         -3.7231e-03, -1.6479e-03],
        [-9.4604e-03, -2.8038e-04,  6.0120e-03,  ...,  6.9885e-03,
         -1.3611e-02, -1.2939e-02],
        ...,
        [-1.1841e-02, -1.4160e-02,  2.2697e-04,  ...,  5.4016e-03,
         -7.4863e-05, -2.8419e-04],
        [ 2.9297e-03,  1.1902e-02, -1.4465e-02,  ...,  8.5449e-03,
          1.2451e-02, -1.2390e-02],
        [ 1.1780e-02, -1.3855e-02, -1.0010e-02,  ..., -9.2163e-03,
          3.3722e-03,  1.2451e-02]], dtype=torch.bfloat16)

In [6]:
model.model.base_model.model.model.graph_projector.weight
# model.model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default.weight

Parameter containing:
tensor([[-0.0318, -0.0859, -0.0363,  ...,  0.0433,  0.0841,  0.0707],
        [ 0.0416,  0.0004, -0.0445,  ...,  0.0639, -0.0693, -0.0577],
        [-0.0066,  0.0691,  0.0675,  ...,  0.0540, -0.0401,  0.0155],
        ...,
        [-0.0603,  0.0843,  0.0803,  ..., -0.0173,  0.0747, -0.0626],
        [-0.0117,  0.0249,  0.0586,  ...,  0.0700, -0.0607,  0.0595],
        [ 0.0437, -0.0550, -0.0018,  ...,  0.0012, -0.0693, -0.0251]],
       requires_grad=True)

In [7]:
# x = model.model.base_model.model.model.graph_projector.weight.to(torch.bfloat16)
# x = model.model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default.weight.to(torch.bfloat16)
x = model.model.base_model.model.model.graph_tower.inverW_P.weight.to(torch.bfloat16)
# x = model.model.base_model.model.model.graph_tower.gtLayers[0].norm.weight.to(torch.bfloat16)
print(x.dtype)
print(x.shape)
# x[0][:2]
x

torch.bfloat16
torch.Size([128, 128])


tensor([[ 7.3853e-03, -5.8105e-02,  5.9814e-02,  ...,  4.9072e-02,
         -6.9336e-02,  1.1902e-02],
        [ 6.3477e-02, -8.1055e-02,  3.7354e-02,  ..., -8.1787e-03,
         -3.3112e-03, -6.7871e-02],
        [-5.7220e-05,  6.9824e-02,  7.9590e-02,  ..., -3.2715e-02,
         -8.5938e-02,  8.4961e-02],
        ...,
        [-2.4902e-02, -6.7383e-02,  8.1543e-02,  ...,  6.3477e-02,
          1.2390e-02,  3.0273e-02],
        [ 4.8584e-02, -1.2024e-02,  4.3335e-03,  ...,  6.5430e-02,
         -6.1279e-02, -1.8921e-02],
        [ 5.8838e-02, -5.8838e-02, -3.0884e-02,  ..., -5.8105e-02,
         -6.7871e-02, -3.1494e-02]], dtype=torch.bfloat16)

In [8]:
y = model.model.base_model.model.model.graph_tower.W_P.weight[0][:10].to(torch.bfloat16)
print(y.dtype)
print(y.shape)
y

torch.bfloat16
torch.Size([10])


tensor([-0.0854, -0.0825, -0.0147, -0.0786,  0.0286, -0.0297, -0.0728, -0.0554,
        -0.0195,  0.0017], dtype=torch.bfloat16)

In [9]:
ckpt['state_dict']['model.base_model.model.model.graph_tower.W_P.weight']

tensor([[-0.1494, -0.1177,  0.0466,  ...,  0.0349,  0.1533,  0.0796],
        [-0.0522,  0.1074,  0.0361,  ..., -0.0410, -0.1187, -0.0496],
        [-0.0075,  0.0588, -0.0008,  ...,  0.0708,  0.0625, -0.0312],
        ...,
        [-0.0327, -0.0214, -0.0359,  ...,  0.0859, -0.0061,  0.0815],
        [-0.0503,  0.0243,  0.0806,  ...,  0.0483,  0.0698,  0.0315],
        [-0.1045,  0.0461, -0.0229,  ...,  0.0139,  0.0986,  0.1162]],
       dtype=torch.bfloat16)

In [10]:
# ckpt['state_dict']['model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight'][0][:2]
# ckpt['state_dict']['model.base_model.model.model.graph_tower.gtLayers.0.norm.weight']
# ckpt['state_dict']['model.base_model.model.model.graph_projector.weight']
print(ckpt['state_dict']['model.base_model.model.model.graph_tower.inverW_P.weight'].shape)
ckpt['state_dict']['model.base_model.model.model.graph_tower.inverW_P.weight']

torch.Size([128, 128])


tensor([[ 0.1074, -0.0786,  0.0369,  ...,  0.0437, -0.0011, -0.0098],
        [ 0.0403, -0.0398,  0.0618,  ..., -0.0425, -0.0114, -0.0442],
        [-0.0139,  0.1001,  0.1025,  ..., -0.0481, -0.0806,  0.1162],
        ...,
        [-0.0503, -0.0742,  0.0542,  ...,  0.0938, -0.0444,  0.0417],
        [ 0.0830, -0.0304, -0.0012,  ...,  0.0918, -0.0067, -0.0376],
        [ 0.0635, -0.0262,  0.0050,  ..., -0.0825, -0.0532, -0.0026]],
       dtype=torch.bfloat16)

In [11]:

fptrained_model = GraphGPT_pl.load_from_checkpoint('/data/LPJ/ICML25/all_checkpoints/train_unfreeze_gnn_with_eval_dataset/with_module_head/v2_lr3e1_70epoch_batch2/lr3e1_70epoch_batch2_unfreeze_gnn.ckpt', training_args=train_args, model_args=model_args, data_args=data_args, tokenizer=tokenizer, device='cpu')

You are using a model of type llama to instantiate a model of type GraphLlama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [02:02<00:00, 61.28s/it]


loading graph pre train model
CLIP(
  (gnn): graph_transformer(
    (gtLayers): Sequential(
      (0): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (1): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (2): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
    )
    (W_P): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (inverW_P): Linear(in_features=128, out_features=128, bias=True)
  )
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu):

In [16]:
fptrained_model.model.base_model.model.model.graph_tower

graph_transformer(
  (gtLayers): Sequential(
    (0): GTLayer(
      (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
    )
    (1): GTLayer(
      (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
    )
    (2): GTLayer(
      (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
    )
  )
  (W_P): Linear(in_features=128, out_features=128, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (inverW_P): Linear(in_features=128, out_features=128, bias=True)
)

In [22]:
x = fptrained_model.model.base_model.model.model.graph_tower.config

In [25]:
x.graph_end_token = 0
x

PretrainedConfig {
  "graph_end_token": 0,
  "graph_patch_token": 32016,
  "graph_start_token": 32017,
  "transformers_version": "4.45.2",
  "use_graph_start_end": true
}

In [26]:
fptrained_model.model.base_model.model.model.graph_tower.config

PretrainedConfig {
  "graph_end_token": 0,
  "graph_patch_token": 32016,
  "graph_start_token": 32017,
  "transformers_version": "4.45.2",
  "use_graph_start_end": true
}

In [1]:
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


def seed_torch(seed=1029):
	random.seed(seed)
	os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现
	np.random.seed(seed)
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)
	torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True

seed_torch()


In [2]:

# 定义一个网络
class net(nn.Module):
    def __init__(self, num_class=10):
        super(net, self).__init__()
        self.pool1 = nn.AvgPool1d(2)
        self.bn1 = nn.BatchNorm1d(3)
        self.fc1 = nn.Linear(12, 4)

    
    def forward(self, x):
        x = self.pool1(x)
        x = self.bn1(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc1(x)

        return x
    


In [3]:
# 定义网络
model = net()

# 定义loss
loss_fn = nn.CrossEntropyLoss()

# 定义优化器
optimizer = optim.SGD(model.parameters(), lr=1e-2)

# 定义训练数据
x = torch.randn((3, 3, 8))

model.fc1.weight.requires_grad = False  # fc1.weight不计算梯度
print(model.fc1.weight.grad)
print(model.fc1.bias.grad)  # fc1.bias计算梯度

output = model(x)
target = torch.tensor([1, 1, 1])
loss = loss_fn(output, target)

loss.backward()

print(model.fc1.weight.grad)
print(model.fc1.bias.grad)


  from .autonotebook import tqdm as notebook_tqdm


None
None
None
tensor([ 0.1875, -0.8615,  0.3708,  0.3033])


In [3]:
# model = GraphGPT_pl.load_from_checkpoint(checkpoint_path='/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/last.ckpt', strict=False)
model = GraphGPT_pl.load_from_checkpoint(checkpoint_path='/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/last.ckpt', map_location='cpu', training_args=train_args, model_args=model_args, data_args=data_args, tokenizer=tokenizer, strict=False)
# model = GraphGPT_pl.load_from_checkpoint(checkpoint_path='/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/last.ckpt', training_args=None, model_args=None, data_args=None, tokenizer=None, map_location='cpu')

You are using a model of type llama to instantiate a model of type GraphLlama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.46s/it]



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/yiyao_yang/anaconda3/envs/graphgpt/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/yiyao_yang/anaconda3/envs/graphgpt/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/yiyao_yang/anaconda3/envs/graphgpt/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...




loading graph pre train model
CLIP(
  (gnn): graph_transformer(
    (gtLayers): Sequential(
      (0): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (1): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (2): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
    )
    (W_P): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (inverW_P): Linear(in_features=128, out_features=128, bias=True)
  )
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu):

In [4]:
model.half()
model.to('cuda:1')

GraphGPT_pl(
  (model): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): GraphLlamaForCausalLM(
        (model): GraphLlamaModel(
          (embed_tokens): Embedding(32019, 4096)
          (layers): ModuleList(
            (0-31): 32 x LlamaDecoderLayer(
              (self_attn): LlamaAttention(
                (q_proj): Linear(
                  in_features=4096, out_features=4096, bias=False
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=64, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=64, out_features=4096, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                )
                (k_proj): Linear(
       

In [4]:
mergered_model = model.model.merge_and_unload().cuda()

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacty of 23.70 GiB of which 44.81 MiB is free. Including non-PyTorch memory, this process has 23.65 GiB memory in use. Of the allocated memory 22.90 GiB is allocated by PyTorch, and 4.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
gra_model = GraphLlamaForCausalLM.from_pretrained('/data/LPJ/GraphGPT/checkpoints/GraphGPT-7B-mix-all', torch_dtype=torch.bfloat16, use_cache=True, low_cpu_mem_usage=True).cuda()




loading graph pre train model
CLIP(
  (gnn): graph_transformer(
    (gtLayers): Sequential(
      (0): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (1): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
      (2): GTLayer(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      )
    )
    (W_P): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (inverW_P): Linear(in_features=128, out_features=128, bias=True)
  )
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu):

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 3/3 [05:29<00:00, 109.79s/it]


In [6]:
gra_model.load_state_dict(mergered_model.state_dict())
# model.load_state_dict(ckpt['state_dict'])

RuntimeError: Error(s) in loading state_dict for GraphLlamaForCausalLM:
	size mismatch for model.embed_tokens.weight: copying a param with shape torch.Size([32019, 4096]) from checkpoint, the shape in current model is torch.Size([32003, 4096]).
	size mismatch for lm_head.weight: copying a param with shape torch.Size([32019, 4096]) from checkpoint, the shape in current model is torch.Size([32003, 4096]).

In [18]:
ckpt = torch.load('/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/last.ckpt')

In [5]:
ckpt.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])

In [21]:
ckpt['state_dict']['model.base_model.model.lm_head.weight']
ckpt['state_dict']

OrderedDict([('model.base_model.model.model.embed_tokens.weight',
              tensor([[-0.0027,  0.0359,  0.0177,  ..., -0.0029, -0.0096,  0.0086],
                      [ 0.0069,  0.0031, -0.0013,  ...,  0.0003, -0.0031, -0.0026],
                      [ 0.0258,  0.0088, -0.0028,  ..., -0.0051,  0.0060, -0.0001],
                      ...,
                      [-0.0073, -0.0049, -0.0114,  ...,  0.0157, -0.0325,  0.0228],
                      [-0.0020,  0.0020, -0.0039,  ..., -0.0039,  0.0019,  0.0039],
                      [-0.0021,  0.0039, -0.0039,  ..., -0.0039,  0.0016,  0.0039]],
                     dtype=torch.bfloat16)),
             ('model.base_model.model.model.layers.0.self_attn.q_proj.weight',
              tensor([[ 2.0294e-03,  1.3657e-03, -5.2795e-03,  ...,  5.3711e-03,
                        5.2490e-03,  4.0531e-05],
                      [ 7.5684e-03, -5.7068e-03,  4.6997e-03,  ..., -7.3853e-03,
                       -7.0801e-03, -9.7275e-04],
                

In [7]:
ckpt['global_step']

537

In [8]:
ckpt['pytorch-lightning_version']

'2.5.0.post0'

In [10]:
ckpt['loops']

{'fit_loop': {'state_dict': {},
  'epoch_loop.state_dict': {'_batches_that_stepped': 537},
  'epoch_loop.batch_progress': {'total': {'ready': 537,
    'completed': 537,
    'started': 537,
    'processed': 537},
   'current': {'ready': 537,
    'completed': 537,
    'started': 537,
    'processed': 537},
   'is_last_batch': True},
  'epoch_loop.scheduler_progress': {'total': {'ready': 537, 'completed': 537},
   'current': {'ready': 537, 'completed': 537}},
  'epoch_loop.automatic_optimization.state_dict': {},
  'epoch_loop.automatic_optimization.optim_progress': {'optimizer': {'step': {'total': {'ready': 537,
      'completed': 537},
     'current': {'ready': 537, 'completed': 537}},
    'zero_grad': {'total': {'ready': 537, 'completed': 537, 'started': 537},
     'current': {'ready': 537, 'completed': 537, 'started': 537}}}},
  'epoch_loop.manual_optimization.state_dict': {},
  'epoch_loop.manual_optimization.optim_step_progress': {'total': {'ready': 0,
    'completed': 0},
   'curren

In [11]:
ckpt['callbacks']

{"ModelCheckpoint{'monitor': 'train_loss', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1, 'train_time_interval': None}": {'monitor': 'train_loss',
  'best_model_score': tensor(2.5536, device='cuda:0'),
  'best_model_path': '/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/model_epoch=0-step=537.ckpt',
  'current_score': tensor(2.5536, device='cuda:0'),
  'dirpath': '/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2',
  'best_k_models': {'/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/model_epoch=0-step=537.ckpt': tensor(2.5536, device='cuda:0')},
  'kth_best_model_path': '/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/model_epoch=0-step=537.ckpt',
  'kth_value': tensor(2.5536, device='cuda:0'),
  'last_model_path': '/data/LPJ/ICML25/GraphGPT/checkpoints/fine_tuning/v2/last.ckpt'}}

In [12]:
ckpt['optimizer_states']

[{'state': {0: {'exp_avg': tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            ...,
            [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            [ 0.1768, -0.0269,  0.1143,  ...,  0.1021, -0.1055, -0.1157],
            [ 0.2041, -0.0938,  0.1328,  ...,  0.1157, -0.1128, -0.1318]],
           dtype=torch.bfloat16),
    'exp_avg_sq': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            ...,
            [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.0403, 0.0015, 0.0055,  ..., 0.0037, 0.0057, 0.0043],
            [0.0354, 0.0041, 0.0073,  ..., 0.0057, 0.0036, 0.0056]],
           dtype=torch.bfloat16),
    'st

In [13]:
ckpt['lr_schedulers']

[{'base_lrs': [2e-05],
  'last_epoch': 537,
  'verbose': False,
  '_step_count': 538,
  '_get_lr_called_within_step': False,
  '_last_lr': [1.0740000000000002e-05],
  'lr_lambdas': [{}]}]

In [1]:
import json

import pandas as pd
from tqdm import tqdm

In [2]:
datapath='/data/LPJ/ICML25/graphgpt_dataset/HiVerilog_Eval/availiabe_for_graphcoder/conversations.json'
graph_data_path='/data/LPJ/ICML25/graphgpt_dataset/HiVerilog_Eval/availiabe_for_graphcoder/graph.jsonl'

In [11]:
graph_pd = pd.read_json(graph_data_path, lines=True)
with open(datapath, 'r') as f:
    data = json.load(f)


In [12]:
# for idx, instruct_item, graph in tqdm(enumerate(zip(data, graph.iterrows()))):
#     print(idx)
for idx, (instruct_item, (graph_index, graph)) in tqdm(enumerate(zip(data, graph_pd.iterrows())), total=len(data)):
    print(idx)
    print(instruct_item)
    print(graph)
    print(graph_index)
    break

  0%|          | 0/34 [00:00<?, ?it/s]

0
{'conversations': [{'from': 'human', 'value': "Given a submodules interconnection graph: \n<graph>\n, \nnodes: [{'id': 0, 'content': 'clk', 'type': 'input port'}, {'id': 1, 'content': 'reset', 'type': 'input port'}, {'id': 2, 'content': 'up_down', 'type': 'input port'}, {'id': 3, 'content': 'count', 'type': 'output port'}, {'id': 4, 'content': 'u_counter_logic', 'type': 'submodule'}, {'id': 5, 'content': 'u_counter_register', 'type': 'submodule'}], \nedge_attrs: [], \nconnectivity: [[1, 2, 0, 5, 1, 0, 4, 5], [4, 4, 4, 4, 5, 5, 5, 3]]\n, Module name:\n    up_down_counter\n\nFunction:\nA 16-bit counter that can increment or decrement based on control signals, implemented in a modular fashion.\n\nInput ports:\n    - clk: Clock signal (1-bit), used to synchronize the counting process.\n    - reset: Reset signal (1-bit), used to reset the counter to zero.\n    - up_down: Control signal (1-bit), determines the counting direction.\n    If up_down = 1, the counter increments; if up_down = 0,


