In [None]:
# `drive` comes from the Colab runtime; import it here so the cell runs on a fresh kernel.
from google.colab import drive

# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Customização do Modelo

Esta etapa do trabalho consiste na customização de um modelo de IA generativa. Neste contexto, será utilizado o TinyLlama para tentar prever possíveis vulnerabilidades em códigos-fonte de contratos inteligentes escritos em Solidity. A ideia é que o usuário forneça ao modelo um código-fonte e ele retorne quais vulnerabilidades esse contrato pode apresentar.

Vale ressaltar que este notebook foi feito utilizando a GPU T4 do Google Colab.

# Installs

In [None]:
!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [None]:
!pip install accelerate



In [None]:
!pip install peft transformers trl



# Imports

In [None]:
import pandas as pd
import torch
import re
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments

In [None]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from trl import SFTTrainer

In [None]:
# Notebook-level identifiers: source dataset name, base model id, and the
# name chosen for the fine-tuned model.
dataset, model_id, output_model = (
    "mwritescode/slither-audited-smart-contracts",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "tinyllama-smartcontract-v1",
)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
from transformers import (
    AutoModelForQuestionAnswering,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

# Preparação dos dados

Este dataset contém 5000 contratos: 2500 seguros (ou seja, sem vulnerabilidades) e 2500 que apresentam pelo menos uma ocorrência da vulnerabilidade "reentrancy-eth" (associada ao número 13 nos labels do dataset original).

In [None]:
df = pd.read_parquet('/content/drive/MyDrive/Mestrado/24 1/contratos_13.parquet')

In [None]:
df

Unnamed: 0,source_code,slither
0,pragma solidity ^0.4.23;\n\ncontract PoormansH...,[reentrancy-eth]
1,// SPDX-License-Identifier: GPL-3.0\n\npragma ...,[reentrancy-eth]
2,pragma solidity ^0.4.23;\n\n/*\n!!! THIS CONTR...,"[reentrancy-eth, unchecked-lowlevel]"
3,// SPDX-License-Identifier: MIT\npragma solidi...,"[reentrancy-eth, unused-return]"
4,// SPDX-License-Identifier: MIT\n\npragma soli...,"[uninitialized-state, divide-before-multiply, ..."
...,...,...
4996,/**\n *Submitted for verification at Etherscan...,[safe]
4997,// SPDX-License-Identifier: MIT\n\npragma soli...,[safe]
4998,pragma solidity ^0.4.24;\n/**\n * Marriage\n *...,[safe]
4999,pragma solidity ^0.4.8;\ncontract Token{\n ...,[safe]


In [None]:
# Collapse each row's sequence of labels into a single comma-separated string.
df['slither'] = df['slither'].map(','.join)

df

Unnamed: 0,source_code,slither
0,pragma solidity ^0.4.23;\n\ncontract PoormansH...,reentrancy-eth
1,// SPDX-License-Identifier: GPL-3.0\n\npragma ...,reentrancy-eth
2,pragma solidity ^0.4.23;\n\n/*\n!!! THIS CONTR...,"reentrancy-eth,unchecked-lowlevel"
3,// SPDX-License-Identifier: MIT\npragma solidi...,"reentrancy-eth,unused-return"
4,// SPDX-License-Identifier: MIT\n\npragma soli...,"uninitialized-state,divide-before-multiply,ree..."
...,...,...
4996,/**\n *Submitted for verification at Etherscan...,safe
4997,// SPDX-License-Identifier: MIT\n\npragma soli...,safe
4998,pragma solidity ^0.4.24;\n/**\n * Marriage\n *...,safe
4999,pragma solidity ^0.4.8;\ncontract Token{\n ...,safe


Diminuindo o tamanho do dataset para apenas 1000 contratos (500 seguros e 500 vulneráveis)

In [None]:
# Boolean mask: True for rows whose label string contains 'safe'.
is_safe = df['slither'].str.contains('safe')
df_safe = df[is_safe]
df_not_safe = df[~is_safe]

# Draw 500 rows from each group with a fixed seed.
df_safe_sample = df_safe.sample(n=500, random_state=1)
df_not_safe_sample = df_not_safe.sample(n=500, random_state=1)

# Stack the two samples (safe rows first) and renumber the index from 0.
df = pd.concat([df_safe_sample, df_not_safe_sample], ignore_index=True)

In [None]:
df

Unnamed: 0,source_code,slither
0,/**\n *Submitted for verification at Etherscan...,safe
1,/**\n *Submitted for verification at Etherscan...,safe
2,pragma solidity ^0.4.11;\n\n\n/**\n * @title O...,safe
3,contract ERC20Basic {\n uint256 public totalS...,safe
4,// File: contracts/ICarbonInventoryControl.sol...,safe
...,...,...
995,/**\n *Submitted for verification at Etherscan...,reentrancy-eth
996,/**\nhttps://t.me/PleasureInu\n\n*/\n\npragma ...,"reentrancy-eth,unused-return"
997,/**\n _\n ...,"reentrancy-eth,unused-return,arbitrary-send"
998,/**\n *Submitted for verification at Etherscan...,"reentrancy-eth,unused-return"


Aplicar à coluna 'source_code' do DataFrame a função que remove comentários, quebras de linha e aspas

In [None]:
# Matches, in one left-to-right pass, either a string literal or a comment.
# Scanning both together means a `//` inside a string or inside a block
# comment is never mistaken for the start of a line comment.
_COMMENT_OR_STRING_RE = re.compile(
    r'"(?:\\.|[^"\\\n])*"'      # double-quoted string literal (kept)
    r"|'(?:\\.|[^'\\\n])*'"     # single-quoted string literal (kept)
    r'|/\*.*?\*/'               # block comment (dropped)
    r'|//[^\n]*',               # line comment (dropped)
    flags=re.DOTALL,
)


def _keep_strings_drop_comments(match):
    """Return string literals unchanged and replace comments with ''."""
    token = match.group(0)
    return token if token[0] in '"\'' else ''


def remove_unwanted_characters(code):
    """Strip comments, newlines and quote characters from source code text.

    Args:
        code: Source code as a single string.

    Returns:
        The code with ``//`` line comments and ``/* */`` block comments
        removed, every newline replaced by a space, all single and double
        quote characters deleted, and surrounding whitespace stripped.
    """
    # Remove comments while leaving string contents (e.g. URLs) intact.
    code = _COMMENT_OR_STRING_RE.sub(_keep_strings_drop_comments, code)
    # Flatten to a single line.
    code = code.replace('\n', ' ')
    # Drop single and double quote characters.
    code = code.replace("'", '').replace('"', '')
    return code.strip()

df['source_code'] = df['source_code'].apply(remove_unwanted_characters)

In [None]:
# Rename the columns to the prompt vocabulary and attach the same fixed
# question to every row.
df = df.rename(columns={'source_code': 'Context', 'slither': 'Answer'}).assign(
    Question='Does this contract contain vulnerabilities?'
)
df

Unnamed: 0,Context,Answer,Question
0,pragma solidity ^0.7.2; interface CoinBEP20 {...,safe,Does this contract contain vulnerabilities?
1,pragma solidity ^0.5.0; contract ERC20Int...,safe,Does this contract contain vulnerabilities?
2,pragma solidity ^0.4.11; contract Ownable {...,safe,Does this contract contain vulnerabilities?
3,contract ERC20Basic { uint256 public totalSu...,safe,Does this contract contain vulnerabilities?
4,pragma solidity 0.6.12; interface ICarbonInve...,safe,Does this contract contain vulnerabilities?
...,...,...,...
995,pragma solidity 0.8.1; contract RicardianLLC ...,reentrancy-eth,Does this contract contain vulnerabilities?
996,pragma solidity ^0.8.4; abstract contract Con...,"reentrancy-eth,unused-return",Does this contract contain vulnerabilities?
997,pragma solidity ^0.8.9; interface IUniswapV2F...,"reentrancy-eth,unused-return,arbitrary-send",Does this contract contain vulnerabilities?
998,pragma solidity ^0.8.4; abstract contract Con...,"reentrancy-eth,unused-return",Does this contract contain vulnerabilities?


Adaptando o dataset para gerar prompts do modelo

In [None]:
def modify_answer(answer):
    """Turn a comma-separated label string into a natural-language answer.

    Args:
        answer: Labels joined by commas, e.g. ``"reentrancy-eth,unused-return"``
            or ``"safe"``.

    Returns:
        A sentence stating there are no vulnerabilities when one of the labels
        is exactly ``safe``; otherwise a sentence listing ``answer`` verbatim.
    """
    # Compare whole labels: a substring test would also match any label that
    # merely contains the letters "safe".
    labels = [label.strip() for label in answer.split(',')]
    if 'safe' in labels:
        return "The contract doesn't have vulnerabilities"
    return f"The contract has the following vulnerabilities: {answer}"

# Aplicar a função na coluna 'Answer'
df['Answer'] = df['Answer'].apply(modify_answer)
df

Unnamed: 0,Context,Answer,Question
0,pragma solidity ^0.7.2; interface CoinBEP20 {...,The contract doesn't have vulnerabilities,Does this contract contain vulnerabilities?
1,pragma solidity ^0.5.0; contract ERC20Int...,The contract doesn't have vulnerabilities,Does this contract contain vulnerabilities?
2,pragma solidity ^0.4.11; contract Ownable {...,The contract doesn't have vulnerabilities,Does this contract contain vulnerabilities?
3,contract ERC20Basic { uint256 public totalSu...,The contract doesn't have vulnerabilities,Does this contract contain vulnerabilities?
4,pragma solidity 0.6.12; interface ICarbonInve...,The contract doesn't have vulnerabilities,Does this contract contain vulnerabilities?
...,...,...,...
995,pragma solidity 0.8.1; contract RicardianLLC ...,The contract have the following vulnerabilitie...,Does this contract contain vulnerabilities?
996,pragma solidity ^0.8.4; abstract contract Con...,The contract have the following vulnerabilitie...,Does this contract contain vulnerabilities?
997,pragma solidity ^0.8.9; interface IUniswapV2F...,The contract have the following vulnerabilitie...,Does this contract contain vulnerabilities?
998,pragma solidity ^0.8.4; abstract contract Con...,The contract have the following vulnerabilitie...,Does this contract contain vulnerabilities?


Criando um dict a partir do dataset

In [None]:
# One prompt per row: contract source, then the question, ending at the
# empty answer slot.
inputs = []
for question, context in zip(df['Question'], df['Context']):
    inputs.append(f"Context: {context} \nQuestion: {question} \nAnswer: ")

# Parallel columns: prompts under 'input', expected answers under 'output'.
dataset_dict = dict(input=inputs, output=df['Answer'].tolist())

In [None]:
# Wrap the prompt/answer columns in a DatasetDict with a single 'train' split.
dataset = DatasetDict({'train': Dataset.from_dict(dataset_dict)})

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 1000
    })
})


# Modelo

In [None]:
# Project directory: used below as the trainer's output_dir and as the place
# where checkpoints are looked up.
project_dir = "/content/TinyLlama_v1_results"

In [None]:
# Base model id. This assignment replaces the model_id value set earlier in
# the notebook.
model_id ='TinyLlama/TinyLlama_v1.1'

A função "get_model_and_tokenizer" carrega o tokenizer e o modelo de linguagem usando a biblioteca Hugging Face Transformers de acordo com o id passado. Após isso, aplica quantização em 4 bits para reduzir seu tamanho e tentar melhorar a eficiência computacional. Isso foi feito para tentar carregar o modelo de maneira mais eficiente em termos de memória e computação.

In [None]:
def get_model_and_tokenizer(model_id):
    """Load a causal language model in 4-bit precision together with its tokenizer.

    Args:
        model_id: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    # Tokenizer setup. `model_max_length` is the tokenizer attribute that
    # declares the maximum sequence length; `max_length` / `truncation` are
    # arguments of the encoding call rather than of `from_pretrained`.
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        model_max_length=2048,  # value used in TinyLlama's training
    )
    # Reuse the EOS token for padding and pad on the right-hand side.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Model setup: 4-bit NF4 quantization with double quantization and
    # float16 as the compute dtype.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto"
    )
    # The KV cache is switched off here; gradient checkpointing is enabled in
    # the training arguments further down.
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    return model, tokenizer

In [None]:
model, tokenizer = get_model_and_tokenizer(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

# Configurando LoRA

Os trechos de código abaixo configuram e inicializam o ambiente de treinamento utilizando a técnica LoRA. LoRA (Low-Rank Adaptation) é uma técnica projetada para adaptar grandes modelos de aprendizado profundo de maneira eficiente e econômica. Em vez de treinar todos os parâmetros de um modelo grande, LoRA se concentra em ajustar apenas uma pequena parte do modelo. Isso resulta em um treinamento mais rápido e menos dispendioso. O segundo conjunto de argumentos (TrainingArguments), especificamente, ajusta o tamanho do lote e a acumulação de gradiente para otimizar o uso de memória e o desempenho. O SFTTrainer é então inicializado com o modelo, o conjunto de dados, a configuração de LoRA, os argumentos de treinamento, o tokenizer e outras configurações.

In [None]:
# LoRA adapter settings; handed to SFTTrainer through its `peft_config` argument.
peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"  # the base model is loaded with AutoModelForCausalLM
    )

In [None]:
# Training hyper-parameters for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=project_dir,
    per_device_train_batch_size=4,  # reduced batch size (was 16)
    gradient_accumulation_steps=8,  # raised to compensate for the smaller batch (was 4)
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    # Training length is controlled by max_steps alone: a positive max_steps
    # overrides num_train_epochs, so the former num_train_epochs=5 had no
    # effect and was removed.
    max_steps=250,  # was 500
    fp16=True,  # mixed precision
    gradient_checkpointing=True  # gradient checkpointing
)

In [None]:
# Train on prompt + answer. The 'input' column stops at "Answer: ", so using
# it alone as the text field never shows the model the expected answer held
# in the 'output' column.
train_with_answers = dataset['train'].map(
    lambda example: {"text": example["input"] + example["output"]}
)

# NOTE(review): sequences are capped at max_seq_length tokens; for very long
# contracts the answer at the end of the text is presumably truncated away.
# Confirm, and if so shorten the contract text before building the prompt.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_with_answers,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=2048
)

In [None]:
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Resume from the newest checkpoint saved in the project directory when there
# is one; otherwise start a new run. The directory check avoids looking for
# checkpoints in a folder that does not exist yet.
last_checkpoint = get_last_checkpoint(project_dir) if os.path.isdir(project_dir) else None
if last_checkpoint is not None:  # continue from the last saved checkpoint
    print(f"Continuando treinamento a partir de: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:  # start a new training run
    print("Começando um novo treinamento:")
    trainer.train()

Começando um novo treinamento:




Step,Training Loss
10,1.3371
20,1.3682
30,1.3481
40,1.3417
50,1.3508
60,1.3784
70,1.3519
80,1.3758




# Fundindo o LoRA com o modelo básico

In [None]:
!pip install transformers peft

In [None]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

In [None]:
# Only needed when a zipped checkpoint is available: extract it into the
# current working directory.
!unzip '/content/checkpoint250.zip'

Archive:  /content/checkpoint250.zip
   creating: content/TinyLlama_v1_results/checkpoint-250/
  inflating: content/TinyLlama_v1_results/checkpoint-250/adapter_config.json  
  inflating: content/TinyLlama_v1_results/checkpoint-250/special_tokens_map.json  
  inflating: content/TinyLlama_v1_results/checkpoint-250/optimizer.pt  
  inflating: content/TinyLlama_v1_results/checkpoint-250/tokenizer_config.json  
  inflating: content/TinyLlama_v1_results/checkpoint-250/trainer_state.json  
  inflating: content/TinyLlama_v1_results/checkpoint-250/training_args.bin  
  inflating: content/TinyLlama_v1_results/checkpoint-250/tokenizer.json  
  inflating: content/TinyLlama_v1_results/checkpoint-250/adapter_model.safetensors  
  inflating: content/TinyLlama_v1_results/checkpoint-250/README.md  
  inflating: content/TinyLlama_v1_results/checkpoint-250/rng_state.pth  
  inflating: content/TinyLlama_v1_results/checkpoint-250/tokenizer.model  
  inflating: content/TinyLlama_v1_results/checkpoint-250/sc

In [None]:
# Base model id (TinyLlama) that the trained adapter will be merged into
model_id ='TinyLlama/TinyLlama_v1.1'

In [None]:
# Load the base model in float16 (no quantization config is passed here).
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Checkpoint directory holding the trained adapter files.
model_path = "/content/content/TinyLlama_v1_results/checkpoint-250"

# Attach the adapter to the base model. `from_transformers` is not a PEFT
# argument and `load_in_8bit=False` is the default, so both were dropped.
peft_model = PeftModel.from_pretrained(base_model, model_path, device_map="auto")

# Merge the adapter and keep the resulting model under the name `model`.
model = peft_model.merge_and_unload()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [None]:
print(type(model))

<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>


In [None]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

# Upando para o HuggingFace

In [None]:
!pip install huggingface_hub



Acessando via token

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Push para o repositório

In [None]:
model.push_to_hub("llama_smart_contract")

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/luishcarvalho/llama_smart_contract/commit/5ef9e424cc18f7acc0bf59f2ca59cb0b43d0e62c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='5ef9e424cc18f7acc0bf59f2ca59cb0b43d0e62c', pr_url=None, pr_revision=None, pr_num=None)

Salvando o modelo e o tokenizer (necessários para converter para o formato GGUF posteriormente)

In [None]:
# Write the merged model and the tokenizer to two local folders.
# NOTE(review): `tokenizer` is not created in this merge section; presumably it
# is still in memory from the training section. Confirm it is defined when
# running this part in a fresh session.
model.save_pretrained("llama_smart_contract")
tokenizer.save_pretrained("tokenizer_llama_smart_contract")

('tokenizer_llama_smart_contract/tokenizer_config.json',
 'tokenizer_llama_smart_contract/special_tokens_map.json',
 'tokenizer_llama_smart_contract/tokenizer.model',
 'tokenizer_llama_smart_contract/added_tokens.json',
 'tokenizer_llama_smart_contract/tokenizer.json')

In [None]:
from huggingface_hub import HfApi

repo_name = "luishcarvalho/llama_smart_contract"

# Upload the locally saved model folder to the Hub repository in one commit.
# This replaces the deprecated git-based `Repository` helper, whose
# `clone_from` expects to clone into a folder that is empty or already a git
# checkout; "llama_smart_contract" was just filled by `save_pretrained`.
HfApi().upload_folder(folder_path="llama_smart_contract", repo_id=repo_name)

# Teste

In [None]:
from transformers.trainer_utils import get_last_checkpoint
last_checkpoint = get_last_checkpoint('/content/content/TinyLlama_v1_results')
print(f"checkpoint usado: {last_checkpoint}")

checkpoint usado: /content/content/TinyLlama_v1_results/checkpoint-250


In [None]:
# Run text generation pipeline with our next model
pipe = pipeline(task="text-generation", model=last_checkpoint, tokenizer=last_checkpoint)

In [None]:
# Row 1 is used as the worked example. Columns are read by label: integer
# keys on a label-indexed Series (the former df.loc[1][0]) are treated as
# positions, which pandas has deprecated.
contexto_exemplo = df.loc[1, 'Context']
print(contexto_exemplo)
resposta_exemplo = df.loc[1, 'Answer']
print(resposta_exemplo)
pergunta_exemplo = df.loc[1, 'Question']
print(pergunta_exemplo)

pragma solidity ^0.5.0;      contract ERC20Interface {     function totalSupply() public view returns (uint);     function balanceOf(address tokenOwner) public view returns (uint balance);     function allowance(address tokenOwner, address spender) public view returns (uint remaining);     function transfer(address to, uint tokens) public returns (bool success);     function approve(address spender, uint tokens) public returns (bool success);     function transferFrom(address from, address to, uint tokens) public returns (bool success);      event Transfer(address indexed from, address indexed to, uint tokens);     event Approval(address indexed tokenOwner, address indexed spender, uint tokens); }     contract SafeMath {     function safeAdd(uint a, uint b) public pure returns (uint c) {         c = a + b;         require(c >= a);     }     function safeSub(uint a, uint b) public pure returns (uint c) {         require(b <= a); c = a - b; } function safeMul(uint a, uint b) public pure 

In [None]:
# Row 2 supplies the contract and question to be answered. Columns are read
# by label instead of by the deprecated positional integer key.
contexto_teste = df.loc[2, 'Context']
pergunta_teste = df.loc[2, 'Question']

In [None]:
# Worked example bundled as a dict: context, question and its known answer.
example = dict(
    context=contexto_exemplo,
    question=pergunta_exemplo,
    answer=resposta_exemplo,
)

In [None]:
prompt = f"Example Context: {example['context']} \nExample Question: {example['question']} \nExample Answer: {example['answer']} \n\nContext: {contexto_teste}\nQuestion: {pergunta_teste}\nAnswer:"

In [None]:
# Send the prompt as plain text. The "<s>[INST] ... [/INST]" wrapper is not
# part of the prompts built for fine-tuning in this notebook, and a literal
# "<s>" is presumably redundant because the tokenizer adds BOS itself (confirm).
# max_new_tokens bounds the length of the reply; return_full_text=False
# prints only the generated continuation instead of echoing the long prompt.
result = pipe(prompt, max_new_tokens=64, return_full_text=False)
print(result[0]['generated_text'])

<s>[INST] Example Context: pragma solidity ^0.5.0;      contract ERC20Interface {     function totalSupply() public view returns (uint);     function balanceOf(address tokenOwner) public view returns (uint balance);     function allowance(address tokenOwner, address spender) public view returns (uint remaining);     function transfer(address to, uint tokens) public returns (bool success);     function approve(address spender, uint tokens) public returns (bool success);     function transferFrom(address from, address to, uint tokens) public returns (bool success);      event Transfer(address indexed from, address indexed to, uint tokens);     event Approval(address indexed tokenOwner, address indexed spender, uint tokens); }     contract SafeMath {     function safeAdd(uint a, uint b) public pure returns (uint c) {         c = a + b;         require(c >= a);     }     function safeSub(uint a, uint b) public pure returns (uint c) {         require(b <= a); c = a - b; } function safeMul(u