In [None]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.


"""
<< 명령어 예시 >>

출처: transformers/examples/pytorch/language-modeling/README.md

[[ validation file 있는 버전 ]]
python3.10 run_clm.py \
    --model_type gpt2 \
    --train_file path_to_train_file \
    --validation_file path_to_validation_file \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-clm

[[ validation file 없는 버전 ]]
python3.10 run_clm.py \
    --model_type gpt2 \
    --train_file {{파일경로}} \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-clm

load_dataset의 data_files 인자에는 glob pattern을 사용할 수 있다.
e.g.) 
>>> from datasets import load_dataset
>>> c4_subset = load_dataset("allenai/c4", data_files="en/c4-train.0000*-of-01024.json.gz")




[[ c 파일들을 전부 다 넣고 실행하는 예시, validation 파일 없음. ]]
python3.10 run_clm.py \
    --model_type gpt2 \
    --train_file /home/junseok/workdir/hf-dataset/all-c-files-master-main/sample/*.txt \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-clm

[[ c 파일을 1개만 넣고 실행하는 예시, validation 파일 있음. ]]
python3.10 run_clm.py \
    --model_type gpt2 \
    --train_file /home/junseok/workdir/hf-dataset/all-c-files-master-main/sample-train/theone-train.txt \
    --validation_file /home/junseok/workdir/hf-dataset/all-c-files-master-main/sample-validate/theone-validate.txt \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-clm




python3.10 run_clm.py \
    --model_type Salesforce/codegen-350M-multi \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-cuda-junseok-clm-all


screen -dm bash -c 'python3.10 run_clm.py \
    --model_type Salesforce/codegen-350M-multi \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-cuda-junseok-clm-all'


-------------------------------------------------------
### 취약점 없는 데이터셋 테스트하기 ###
-------------------------------------------------------
screen -L -dm bash -c 'python3.10 run_clm.py secure \
    --model_type codegen \
    --config_name Salesforce/codegen-350M-multi \
    --tokenizer_name Salesforce/codegen-350M-multi \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --preprocessing_num_workers 32 \
    --do_train \
    --do_eval \
    --output_dir /tmp/cuda-junseok-clm-secure \
    --evaluation_strategy epoch \
    --save_strategy epoch \
    --load_best_model_at_end'
-------------------------------------------------------

-------------------------------------------------------
### 취약점 있든 없든 상관없이, 모든 데이터셋 테스트하기 ###
-------------------------------------------------------
screen -L -dm bash -c 'python3.10 run_clm.py insecure \
    --model_type codegen \
    --config_name Salesforce/codegen-350M-multi \
    --tokenizer_name Salesforce/codegen-350M-multi \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --preprocessing_num_workers 32 \
    --do_train \
    --do_eval \
    --output_dir /tmp/cuda-junseok-clm-all \
    --evaluation_strategy epoch \
    --save_strategy epoch \
    --load_best_model_at_end'
-------------------------------------------------------



-------------------------------------------------------
### 취약점 없는 데이터셋 evaluation(마지막 test단계) only ###
-------------------------------------------------------
screen -dm bash -c 'python3.10 run_clm.py insecure \
    --model_type codegen \
    --config_name Salesforce/codegen-350M-multi \
    --tokenizer_name Salesforce/codegen-350M-multi \
    --per_device_eval_batch_size 1 \
    --preprocessing_num_workers 32 \
    --do_eval \
    --output_dir /tmp/cuda-junseok-clm-secure \
    --evaluation_strategy epoch \
    --save_strategy epoch \
    --load_best_model_at_end'
-------------------------------------------------------

-------------------------------------------------------
### 그냥 이미 학습된 codegen 모델 들고와서, evaluation(마지막 test단계) only ###
-------------------------------------------------------
screen -L -dm bash -c 'python3.10 run_clm.py insecure \
    --model_name_or_path Salesforce/codegen-350M-multi \
    --config_name Salesforce/codegen-350M-multi \
    --tokenizer_name Salesforce/codegen-350M-multi \
    --per_device_eval_batch_size 1 \
    --preprocessing_num_workers 32 \
    --do_eval \
    --output_dir /tmp/cuda-junseok-pretrained-model \
    --evaluation_strategy epoch \
    --save_strategy epoch \
    --load_best_model_at_end'
-------------------------------------------------------



-------------------------------------------------------
### 함수들 만들기 ###
-------------------------------------------------------
screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/code_generation.py secure'

screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/code_generation.py insecure'
-------------------------------------------------------



-------------------------------------------------------
### 생성된 함수들로부터 mockmain들 만들기 ###
-------------------------------------------------------
screen -L -dm bash -c 'python3.10 function_start_end_line.py 0 0'
screen -L -dm bash -c 'python3.10 function_start_end_line.py 0 1'
screen -L -dm bash -c 'python3.10 function_start_end_line.py 1 0'
screen -L -dm bash -c 'python3.10 function_start_end_line.py 1 1'
-------------------------------------------------------


-------------------------------------------------------
### 만들어진 mockmain들을 사용해서 codeql analysis 돌리기. 한 번에 여러 c파일을 analyze하므로 output 파싱할 때 유의해야 함. ###
-------------------------------------------------------
screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/create_codeql_output.py w 0 generated-code-analysis-w0' && \
screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/create_codeql_output.py w 1 generated-code-analysis-w1' && \
screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/create_codeql_output.py wo 0 generated-code-analysis-wo0' && \
screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/create_codeql_output.py wo 1 generated-code-analysis-wo1'

screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/create_codeql_output.py w 0 pretrained-analysis-w'
screen -L -dm bash -c 'python3.10 /home/junseok/workdir/share/codeql/create_codeql_output.py wo 0 pretrained-analysis-wo'
-------------------------------------------------------






## screen 명령어로 파이썬 스크립트 실행하는 방법
https://stackoverflow.com/questions/12631479/starting-script-in-screen

## run_clm 실행중인 스크린 이름
25319..n108
"""


In [8]:
"""

I will collect all c files to be used as a dataset.



"""

from glob import glob
import shutil
import os
from multiprocessing import Pool

import uuid
import base64


# I will collect all c files from the master branch and main branch.
OUT_DIR = "/home/junseok/workdir/hf-dataset/all-c-files-master-main"

# Root directory that is searched recursively for C sources.
RAW_DATASET_DIR = "/home/share/data/sec_code_gen/collections-generic/batch00/org"

# Directory the copy worker writes into; same location as OUT_DIR.
DEST = OUT_DIR


def find_all_c_abs_path(
    rootdir: str,
    target_branch_list=("/master/", "/main/"),
    ext: str = "c",
):
    """Recursively collect source files under *rootdir* that sit on a target branch.

    A file is kept when its path contains at least one of the fragments in
    *target_branch_list*. The number of matches is printed before returning.

    Args:
        rootdir: Directory searched recursively. Returned paths are absolute
            only when this is absolute.
        target_branch_list: Path fragments that mark an accepted branch
            directory. The default is a tuple so it cannot be mutated
            between calls.
        ext: File extension to look for, without the leading dot.

    Returns:
        List of matching paths, in the order ``glob`` yields them.
    """
    pattern = rootdir + "/" + f"**/*.{ext}"

    result = [
        candidate
        for candidate in glob(pattern, recursive=True)
        if any(branch in candidate for branch in target_branch_list)
    ]

    print(len(result))

    return result


def generate_short_uuid():
    """Return a random UUID4 as unpadded URL-safe base64 text (22 characters)."""
    raw = uuid.uuid4().bytes
    padded = base64.urlsafe_b64encode(raw)
    # 16 input bytes always encode with trailing '=' padding; strip it.
    unpadded = padded.rstrip(b'=')
    return unpadded.decode('utf-8')


def work(abs_srcfile: str):
    """Copy one source file into ``DEST``.

    When ``DEST`` is an existing directory the copy keeps the source's base
    name, so sources that share a base name overwrite one another there.
    """
    # NOTE(review): a collision-free destination name was sketched earlier as
    #   DEST + "/" + "[" + generate_short_uuid() + "]" + os.path.basename(abs_srcfile)
    shutil.copy(src=abs_srcfile, dst=DEST)
    
    

if __name__ == "__main__":
    # Guarded so that worker processes started with the "spawn" or
    # "forkserver" methods can import this module without re-running the job.
    abs_srcfile_list = find_all_c_abs_path(rootdir=RAW_DATASET_DIR)

    # Fan the per-file copies out over 16 worker processes.
    with Pool(16) as pool:
        pool.map(work, abs_srcfile_list)

    print("done!!!")
    

    
    

14579
끝!!!


In [6]:
"""

Quick test of short UUID generation.


"""

import uuid
import base64

def generate_short_uuid():
    """Build a compact identifier from a fresh random UUID.

    The UUID's 16 raw bytes are URL-safe base64 encoded, the '=' padding is
    removed, and the result is returned as text.
    """
    return base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode('utf-8')

# Example usage: create one identifier, keep it in `short_uuid`, and show it.
print(short_uuid := generate_short_uuid())

59ZsDdUZSM-dt4p4HQaTYg


In [7]:
"""

Copy every .txt file to a .c file with the same base name.


"""
import shutil
from multiprocessing import Pool
import os

path = "/home/junseok/workdir/hf-dataset/all-c-files-master-main"


def find_all_files():
    files = [os.path.join(path, item) for item in os.listdir(path) if "txt" in item]
    print(files)
    return files



def work(abs_srcfile: str):
    """Copy *abs_srcfile* to a sibling file whose extension is ``.c``.

    Only the final extension of the path is swapped. Directory names and
    other parts of the path that happen to contain ``.txt`` are left alone,
    so the copy always lands next to the source file.

    Args:
        abs_srcfile: Path of the file to copy (expected to end in ``.txt``).
    """
    stem, _old_ext = os.path.splitext(abs_srcfile)
    shutil.copy(abs_srcfile, stem + ".c")
    
    

if __name__ == "__main__":
    # Guarded so that worker processes started with the "spawn" or
    # "forkserver" methods can import this module without re-running the job.
    abs_srcfile_list = find_all_files()

    # Run the per-file copies on 16 worker processes.
    with Pool(16) as pool:
        pool.map(work, abs_srcfile_list)




['/home/junseok/workdir/hf-dataset/all-c-files-master-main/deps__picotls__lib__cifra__random.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/kernel__zarch__dmax_z13.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavcodec__ppc__huffyuvdsp_altivec.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/build__libraries__libtommath__bn_mp_and.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavcodec__audioconvert.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/lapack-netlib__LAPACKE__src__lapacke_dlagge.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/build__libraries__oniguruma__enc__trans__escape.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/lapack-netlib__LAPACKE__src__lapacke_dsbevd_work.txt', '/home/junseok/workdir/hf-dataset/all-c-files-master-main/lapack-netlib__SRC__cgemlq.txt'

In [55]:
# Scratch look at the include list of the first entry; `result` comes from
# another cell that is not shown here.
temp = result[0].include_list
temp

# Index the entries by prototype. Entries with the same prototype collapse to
# the last one, so the count below is the number of distinct prototypes.
result_dict = {entry.prototype: entry for entry in result}
len(result_dict)

12509

In [1]:


import torch
# Query availability once; the device lookup below requires a usable CUDA
# runtime and raises on a machine without one.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA available: {cuda_available}")
# True
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

Is CUDA available: True
CUDA device: NVIDIA GeForce RTX 3090


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

# Show the installed PyTorch version string.
print(torch.__version__)

2.1.0+cu121


In [7]:
"""
(invulnerable c files + vulnerable c files)
splitting whole files into training dataset, validation dataset, and test dataset.

training : validation : test = 70 : 15 : 15
"""

from math import ceil
import random


def get_all_file_abs_paths():
    """Placeholder with no implementation yet; calling it returns ``None``."""

def split_files(
    file_abs_paths: list,
    output_dir: str = "/home/junseok/workdir/transformers/examples/pytorch/language-modeling/junseok-dataset/all-dataset",
):
    """Randomly split paths into train/val/test listings (roughly 70:15:15).

    The paths are shuffled, then the first ``ceil(70%)`` go to ``train.txt``,
    the rounded-up first half of the remainder to ``val.txt`` and whatever is
    left to ``test.txt``. Each listing holds one path per line with no
    trailing newline. The split boundaries are printed.

    Args:
        file_abs_paths: Paths to distribute. The list itself is not modified;
            a shuffled copy is used.
        output_dir: Existing directory that receives the three listing files.
    """
    import os  # local import: this cell does not import os itself

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(file_abs_paths)
    random.shuffle(shuffled)

    total = len(shuffled)
    train_end_idx = ceil(total * 70 / 100)
    # Validation takes the rounded-up half of what remains; test gets the rest.
    val_end_idx = train_end_idx + ceil((total - train_end_idx) / 2)

    print(f"[junseok-dataset] file_abs_paths length : ({total})")
    print(f"[junseok-dataset] train_end_idx : ({train_end_idx})")
    print(f"[junseok-dataset] val_end_idx : ({val_end_idx})")

    splits = {
        "train.txt": shuffled[:train_end_idx],
        "val.txt": shuffled[train_end_idx:val_end_idx],
        "test.txt": shuffled[val_end_idx:],
    }
    for filename, members in splits.items():
        with open(os.path.join(output_dir, filename), "w") as f:
            f.write("\n".join(members))
    
    

# Read the master listing (one path per line) and split it into the
# train/val/test listings.
with open(
    "/home/junseok/workdir/transformers/examples/pytorch/language-modeling/junseok-dataset/all_file_paths.txt"
) as all_file:
    split_files(all_file.read().splitlines())
    
    
    

[junseok-dataset] file_abs_paths length : (13989)
[junseok-dataset] train_end_idx : (9793)
[junseok-dataset] val_end_idx : (11891)


In [9]:
"""

(invulnerable c files only !!!!!!)
splitting invulnerable files into training dataset, validation dataset, and test dataset.

training : validation : test = 70 : 15 : 15

"""

from math import ceil
import random
import os

def get_all_file_abs_paths():
    # Unimplemented stub: nothing is gathered, the call just yields None.
    return None

def split_files(
    file_abs_paths: list,
    output_dir: str = "/home/junseok/workdir/transformers/examples/pytorch/language-modeling/junseok-dataset/secure-dataset",
):
    """Randomly split paths into train/val/test listings (roughly 70:15:15).

    After shuffling, the first ``ceil(70%)`` of the paths are written to
    ``train.txt``, the rounded-up first half of the remainder to ``val.txt``
    and the rest to ``test.txt``. Each file has one path per line and no
    trailing newline. The split boundaries are printed.

    Args:
        file_abs_paths: Paths to distribute. The list is left untouched; the
            shuffle is applied to a copy.
        output_dir: Existing directory that receives the three listing files
            (defaults to the secure-dataset directory).
    """
    # Work on a copy so the caller's list is not reordered as a side effect.
    shuffled = list(file_abs_paths)
    random.shuffle(shuffled)

    total = len(shuffled)
    train_end_idx = ceil(total * 70 / 100)
    # The remainder is halved: rounded-up half to validation, rest to test.
    val_end_idx = train_end_idx + ceil((total - train_end_idx) / 2)

    print(f"[junseok-dataset] file_abs_paths length : ({total})")
    print(f"[junseok-dataset] train_end_idx : ({train_end_idx})")
    print(f"[junseok-dataset] val_end_idx : ({val_end_idx})")

    ### secure-dataset
    splits = (
        ("train.txt", shuffled[:train_end_idx]),
        ("val.txt", shuffled[train_end_idx:val_end_idx]),
        ("test.txt", shuffled[val_end_idx:]),
    )
    for filename, members in splits:
        with open(os.path.join(output_dir, filename), "w") as f:
            f.write("\n".join(members))
    
    
def get_insecure_filenames(
    path: str = "/home/junseok/workdir/transformers/examples/pytorch/language-modeling/insecure_file_paths.txt",
) -> list[str]:
    """Read the insecure-file listing, one entry per line.

    Args:
        path: Text file to read. Defaults to the project's
            ``insecure_file_paths.txt``.

    Returns:
        The file's lines with their line terminators removed.
    """
    with open(path, "r") as file:
        return file.read().splitlines()
    
    
insecure_filenames = get_insecure_filenames()
# Set copy for constant-time membership tests in the filter below; the list is
# kept because its length is reported as-is.
insecure_filename_set = set(insecure_filenames)

IDX_OF_FILENAME = 0
IDX_OF_EXT = 1

# Read the master listing, then release the file before the processing starts.
with open("/home/junseok/workdir/transformers/examples/pytorch/language-modeling/junseok-dataset/all_file_paths.txt", "r") as all_file:
    all_abs_txt_files = all_file.read().splitlines()

### secure(== invulnerable) files only
# A path is kept when its base name without extension is not listed as insecure.
invulnerable_abs_txt_files = [
    abs_txt_file
    for abs_txt_file in all_abs_txt_files
    if os.path.splitext(os.path.basename(abs_txt_file))[IDX_OF_FILENAME] not in insecure_filename_set
]

print(f"[=====] total all_abs_txt_files size : {len(all_abs_txt_files)}")
print(f"[=====] total insecure filenames size : {len(insecure_filenames)}")
print(f"[=====] total invulnerable_abs_txt_files size : {len(invulnerable_abs_txt_files)}")

split_files(invulnerable_abs_txt_files)
    
    

[=====] total all_abs_txt_files size : 13989
[=====] total insecure filenames size : 631
[=====] total invulnerable_abs_txt_files size : 13358
[junseok-dataset] file_abs_paths length : (13358)
[junseok-dataset] train_end_idx : (9351)
[junseok-dataset] val_end_idx : (11355)


In [3]:


# from math import ceil
# import random
# import os

    
# def get_insecure_filenames() -> list[str]:
#     with open("/home/junseok/workdir/transformers/examples/pytorch/language-modeling/insecure_file_paths.txt", "r") as file:
#         return file.read().splitlines()
    
    
# insecure_filenames = get_insecure_filenames()


# with open("/home/junseok/workdir/transformers/examples/pytorch/language-modeling/junseok-dataset/all_file_paths.txt", "r") as all_file:
    
#     all_abs_txt_files = all_file.read().splitlines()
#     IDX_OF_FILENAME = 0
#     IDX_OF_EXT = 1
    
#     ### Select only the insecure (vulnerable) files.
#     vulnerable_abs_txt_files = list(filter(
#         lambda abs_txt_file: os.path.splitext(os.path.basename(abs_txt_file))[IDX_OF_FILENAME] in insecure_filenames, 
#         all_abs_txt_files
#     ))
    
    
    
# len(vulnerable_abs_txt_files)

# for elem in vulnerable_abs_txt_files:
#     print(elem)
# vulnerable_abs_txt_files

# with open("/home/junseok/workdir/transformers/examples/pytorch/language-modeling/insecure_file_abs_paths.txt", "w") as f:
#     for abs_path in vulnerable_abs_txt_files:
#         f.write(f"{abs_path}\n")

/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavformat__adtsenc.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/deps__lua__src__lmem.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/components__newlib__realpath.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/build__libraries__zlib__test__example.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavcodec__xiph.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/lapack-netlib__CBLAS__src__cblas_ctrsv.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavcodec__sbrdsp.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavcodec__eatgq.txt
/home/junseok/workdir/hf-dataset/all-c-files-master-main/SmallVideoRecord2__SmallVideoLib2__ffmpeg-3.2.5__libavuti