In [15]:
'''본 코드를 구동하기 위한 패키지 설치'''
# pip install pefile
# pip install yara
'''PE 특징 분석을 위한 yara파일 설정'''
# find / -name libyara.so 
# cp LIB_YARA_PATH /home/stud/anaconda3/envs/mldlsec_310/lib/
# yara file comes from here: https://github.com/urwithajit9/ClaMP

import csv,os,pefile
import yara
import math
import hashlib

class pe_features():

    # CSV column names for the fields copied from the DOS header.
    IMAGE_DOS_HEADER = [
        "e_cblp",
        "e_cp",
        "e_cparhdr",
        "e_maxalloc",
        "e_sp",
        "e_lfanew",
    ]

    # Section count and creation-year flag, followed by one column per
    # FILE_HEADER characteristics flag (15 columns).
    FILE_HEADER = ["NumberOfSections", "CreationYear"]
    FILE_HEADER = FILE_HEADER + ["FH_char{}".format(n) for n in range(15)]

    # First group of OPTIONAL_HEADER columns (21 columns).
    OPTIONAL_HEADER1 = [
        "MajorLinkerVersion",
        "MinorLinkerVersion",
        "SizeOfCode",
        "SizeOfInitializedData",
        "SizeOfUninitializedData",
        "AddressOfEntryPoint",
        "BaseOfCode",
        "BaseOfData",
        "ImageBase",
        "SectionAlignment",
        "FileAlignment",
        "MajorOperatingSystemVersion",
        "MinorOperatingSystemVersion",
        "MajorImageVersion",
        "MinorImageVersion",
        "MajorSubsystemVersion",
        "MinorSubsystemVersion",
        "SizeOfImage",
        "SizeOfHeaders",
        "CheckSum",
        "Subsystem",
    ]

    # One column per DllCharacteristics flag (11 columns).
    OPTIONAL_HEADER_DLL_char = ["OH_DLLchar{}".format(n) for n in range(11)]

    # Trailing OPTIONAL_HEADER columns; LoaderFlags is stored as a
    # boolean "is zero" check rather than the raw value.
    OPTIONAL_HEADER2 = [
        "SizeOfStackReserve",
        "SizeOfStackCommit",
        "SizeOfHeapReserve",
        "SizeOfHeapCommit",
        "LoaderFlags",
    ]

    # Complete OPTIONAL_HEADER column group, in CSV order.
    OPTIONAL_HEADER = OPTIONAL_HEADER1 + OPTIONAL_HEADER_DLL_char + OPTIONAL_HEADER2

    # Columns for the features derived from sections, packer signatures,
    # entropy and version information.
    Derived_header = [
        "sus_sections",
        "non_sus_sections",
        "packer",
        "packer_type",
        "E_text",
        "E_data",
        "filesize",
        "E_file",
        "fileinfo",
    ]
    
    # 클래스의 생성자
    def __init__(self, source, output, label, rules_path='./peid.yara'):
        """Store the run configuration and compile the PEiD YARA rules.

        Args:
            source: Directory whose entries are scanned by create_dataset().
            output: Output file name; write_csv_header() appends ".csv" to it.
            label: Class label appended to every extracted feature row.
            rules_path: Path of the PEiD signature file to compile with YARA.
                Defaults to './peid.yara', the path that used to be
                hard-coded, so existing callers are unaffected.
        """
        self.source = source
        self.output = output
        self.type = label
        # Packer detection needs the PEiD signatures compiled into YARA rules.
        self.rules = yara.compile(filepath=rules_path)
        
    def file_creation_year(self, seconds):
        """Return 1 if the timestamp falls in the years 1980-2015, else 0.

        Args:
            seconds: Seconds since 1970-01-01 (anything accepted by int()).

        Returns:
            1 when the approximate year (365-day years, no leap handling)
            is between 1980 and 2015 inclusive, otherwise 0.
        """
        # Floor division keeps the year integral. With true division the
        # result is a fractional float that practically never equals a
        # member of range(1980, 2016), so the flag was 0 for nearly every file.
        year = 1970 + (int(seconds) // 86400) // 365
        return int(1980 <= year <= 2015)

    def FILE_HEADER_Char_boolean_set(self, pe):
        """Return the FILE_HEADER characteristics flags as a list of ints.

        Args:
            pe: Object exposing the IMAGE_FILE_* flag attributes on
                ``pe.FILE_HEADER``.

        Returns:
            A 15-element list with each flag converted through int(), in the
            order listed below.
        """
        flag_names = (
            "IMAGE_FILE_RELOCS_STRIPPED",
            "IMAGE_FILE_EXECUTABLE_IMAGE",
            "IMAGE_FILE_LINE_NUMS_STRIPPED",
            "IMAGE_FILE_LOCAL_SYMS_STRIPPED",
            "IMAGE_FILE_AGGRESIVE_WS_TRIM",
            "IMAGE_FILE_LARGE_ADDRESS_AWARE",
            "IMAGE_FILE_BYTES_REVERSED_LO",
            "IMAGE_FILE_32BIT_MACHINE",
            "IMAGE_FILE_DEBUG_STRIPPED",
            "IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP",
            "IMAGE_FILE_NET_RUN_FROM_SWAP",
            "IMAGE_FILE_SYSTEM",
            "IMAGE_FILE_DLL",
            "IMAGE_FILE_UP_SYSTEM_ONLY",
            "IMAGE_FILE_BYTES_REVERSED_HI",
        )
        # Read every flag first, then convert, so a missing attribute is
        # reported before any conversion takes place.
        raw_flags = [getattr(pe.FILE_HEADER, name) for name in flag_names]
        return [int(flag) for flag in raw_flags]

    def OPTIONAL_HEADER_DLLChar(self, pe):
        """Return the OPTIONAL_HEADER DllCharacteristics flags as ints.

        Args:
            pe: Object exposing the IMAGE_DLLCHARACTERISTICS_* flag
                attributes on ``pe.OPTIONAL_HEADER``.

        Returns:
            An 11-element list with each flag converted through int(), in
            the order listed below.
        """
        flag_suffixes = (
            "DYNAMIC_BASE",
            "FORCE_INTEGRITY",
            "NX_COMPAT",
            "NO_ISOLATION",
            "NO_SEH",
            "NO_BIND",
            "WDM_DRIVER",
            "TERMINAL_SERVER_AWARE",
            "HIGH_ENTROPY_VA",
            "APPCONTAINER",
            "GUARD_CF",
        )
        # Read every flag first, then convert, so a missing attribute is
        # reported before any conversion takes place.
        raw_flags = [
            getattr(pe.OPTIONAL_HEADER, "IMAGE_DLLCHARACTERISTICS_" + suffix)
            for suffix in flag_suffixes
        ]
        return [int(flag) for flag in raw_flags]

    def Optional_header_ImageBase(self, ImageBase):
        """Return 1 if ImageBase is 64 KiB aligned and a known base, else 0.

        The accepted bases are 0x10000000, 0x00010000 and 0x00400000.
        """
        known_bases = (0x10000000, 0x00010000, 0x00400000)
        is_aligned = ImageBase % 0x10000 == 0
        return 1 if is_aligned and ImageBase in known_bases else 0

    def Optional_header_SectionAlignment(self, SectionAlignment, FileAlignment):
        """Return 1 when SectionAlignment >= FileAlignment, otherwise 0."""
        if SectionAlignment >= FileAlignment:
            return 1
        return 0

    def Optional_header_FileAlignment(self, SectionAlignment, FileAlignment):
        """Return 1 when FileAlignment passes the alignment check, else 0.

        With SectionAlignment >= 512 the FileAlignment must be even and lie
        within 512..65536; otherwise it must equal SectionAlignment.
        """
        if SectionAlignment >= 512:
            # NOTE(review): this tests evenness, not "power of two" -- confirm
            # that is the intended feature definition.
            is_valid = FileAlignment % 2 == 0 and FileAlignment in range(512, 65537)
        else:
            is_valid = FileAlignment == SectionAlignment
        return int(is_valid)

    def Optional_header_SizeOfImage(self, SizeOfImage, SectionAlignment):
        """Return 1 if SizeOfImage is a multiple of SectionAlignment, else 0.

        A SectionAlignment of 0 (seen in malformed headers) cannot divide
        anything, so it yields 0 instead of raising ZeroDivisionError.
        """
        if not SectionAlignment:
            return 0
        return int(SizeOfImage % SectionAlignment == 0)

    def Optional_header_SizeOfHeaders(self, SizeOfHeaders, FileAlignment):
        """Return 1 if SizeOfHeaders is a multiple of FileAlignment, else 0.

        A FileAlignment of 0 (seen in malformed headers) cannot divide
        anything, so it yields 0 instead of raising ZeroDivisionError.
        """
        if not FileAlignment:
            return 0
        return int(SizeOfHeaders % FileAlignment == 0)

    def extract_dos_header(self, pe):
        """Return the six DOS header fields used as features.

        Args:
            pe: Object exposing the fields on ``pe.DOS_HEADER``.

        Returns:
            [e_cblp, e_cp, e_cparhdr, e_maxalloc, e_sp, e_lfanew]; if any
            field cannot be read the error is printed and six zeros are
            returned instead.
        """
        field_names = ("e_cblp", "e_cp", "e_cparhdr", "e_maxalloc", "e_sp", "e_lfanew")
        try:
            values = [getattr(pe.DOS_HEADER, name) for name in field_names]
        except Exception as err:
            print(err)
            values = [0] * len(field_names)
        return values

    def extract_file_header(self,pe):	
        FILE_HEADER_data = [ 0 for i in range(3)]
        FILE_HEADER_char =  []
        try:
            FILE_HEADER_data = [ 
                    pe.FILE_HEADER.NumberOfSections, \
                    self.file_creation_year(pe.FILE_HEADER.TimeDateStamp)]
            FILE_HEADER_char = self.FILE_HEADER_Char_boolean_set(pe)
        except Exception as e:
            print(e)
        return FILE_HEADER_data + FILE_HEADER_char

    def extract_optional_header(self,pe):
        OPTIONAL_HEADER_data = [ 0 for i in range(21)]
        DLL_char =[]
        OPTIONAL_HEADER_data2 = [ 0 for i in range(6)]

        try:
            OPTIONAL_HEADER_data = [
                pe.OPTIONAL_HEADER.MajorLinkerVersion,\
                pe.OPTIONAL_HEADER.MinorLinkerVersion,\
                pe.OPTIONAL_HEADER.SizeOfCode,\
                pe.OPTIONAL_HEADER.SizeOfInitializedData,\
                pe.OPTIONAL_HEADER.SizeOfUninitializedData,\
                pe.OPTIONAL_HEADER.AddressOfEntryPoint,\
                pe.OPTIONAL_HEADER.BaseOfCode,\
                pe.OPTIONAL_HEADER.BaseOfData,\
                #Check the ImageBase for the condition
                self.Optional_header_ImageBase(pe.OPTIONAL_HEADER.ImageBase),\
                # Checking for SectionAlignment condition
                self.Optional_header_SectionAlignment(pe.OPTIONAL_HEADER.SectionAlignment,pe.OPTIONAL_HEADER.FileAlignment),\
                #Checking for FileAlignment condition
                self.Optional_header_FileAlignment(pe.OPTIONAL_HEADER.SectionAlignment,pe.OPTIONAL_HEADER.FileAlignment),\
                pe.OPTIONAL_HEADER.MajorOperatingSystemVersion,\
                pe.OPTIONAL_HEADER.MinorOperatingSystemVersion,\
                pe.OPTIONAL_HEADER.MajorImageVersion,\
                pe.OPTIONAL_HEADER.MinorImageVersion,\
                pe.OPTIONAL_HEADER.MajorSubsystemVersion,\
                pe.OPTIONAL_HEADER.MinorSubsystemVersion,\
                #Checking size of Image
                self.Optional_header_SizeOfImage(pe.OPTIONAL_HEADER.SizeOfImage,pe.OPTIONAL_HEADER.SectionAlignment),\
                #Checking for size of headers
                self.Optional_header_SizeOfHeaders(pe.OPTIONAL_HEADER.SizeOfHeaders,pe.OPTIONAL_HEADER.FileAlignment),\
                pe.OPTIONAL_HEADER.CheckSum,\
                pe.OPTIONAL_HEADER.Subsystem]

            DLL_char = self.OPTIONAL_HEADER_DLLChar(pe)

            OPTIONAL_HEADER_data2= [                
                pe.OPTIONAL_HEADER.SizeOfStackReserve,\
                pe.OPTIONAL_HEADER.SizeOfStackCommit,\
                pe.OPTIONAL_HEADER.SizeOfHeapReserve,\
                pe.OPTIONAL_HEADER.SizeOfHeapCommit,\
                int(pe.OPTIONAL_HEADER.LoaderFlags == 0) ]
        except Exception as e:
            print(e)
        return OPTIONAL_HEADER_data + DLL_char + OPTIONAL_HEADER_data2
    
    # 섹션 이름을 검사함 (정상/악성코드에서 많이 보이는 섹션 이름을 확인)
    def get_count_suspicious_sections(self, pe):
        """Count sections with unusual names versus well-known benign names.

        Args:
            pe: Object whose ``sections`` each expose a ``Name`` (bytes
                padded with NUL characters, or str).

        Returns:
            [suspicious, non_suspicious] where ``non_suspicious`` is the
            number of distinct benign names present and ``suspicious`` is
            the total section count minus that number.
        """
        benign_sections = {'.text', '.data', '.rdata', '.idata', '.edata',
                           '.rsrc', '.bss', '.crt', '.tls'}
        names = []
        for section in pe.sections:
            raw_name = section.Name
            if isinstance(raw_name, bytes):
                # Cut at the NUL padding, then decode leniently: packed or
                # malformed files may carry names that are not valid UTF-8.
                name = raw_name.split(b'\x00', 1)[0].decode('utf-8', errors='replace')
            else:
                name = str(raw_name).split('\x00', 1)[0]
            # Keep names as str so they compare equal to the benign set;
            # bytes never match str, which made the benign count always 0.
            names.append(name)
        non_sus_sections = len(benign_sections.intersection(names))
        return [len(names) - non_sus_sections, non_sus_sections]

    # 파일에 적용된 패커 알고리즘을 검사(yara 룰셋의 시그니처 활용)
    def check_packer(self,filepath):

        result=[]
        matches = self.rules.match(filepath)

        try:
            if matches == [] or matches == {}:
                result.append([0,"NoPacker"])
            else:
                result.append([1,matches['main'][0]['rule']])
        except:
            result.append([1,matches[0]])

        return result

    # 코드와 데이터 섹션의 엔트로피 계산
    def get_text_data_entropy(self, pe):
        """Return the entropy of the ``.text`` and ``.data`` sections.

        Args:
            pe: Object whose ``sections`` each expose ``Name`` (bytes padded
                with NUL characters, or str) and ``get_entropy()``.

        Returns:
            [text_entropy, data_entropy]; a missing section leaves its slot
            at 0.0. If a name occurs more than once the last one wins.
        """
        result = [0.0, 0.0]
        for section in pe.sections:
            raw_name = section.Name
            if isinstance(raw_name, bytes):
                # Cut at the NUL padding, then decode leniently: packed or
                # malformed files may carry names that are not valid UTF-8.
                s_name = raw_name.split(b'\x00', 1)[0].decode('utf-8', errors='replace')
            else:
                s_name = str(raw_name).split('\x00', 1)[0]
            # Compare str with str; the bytes value used before never
            # equalled ".text"/".data", so both entropies stayed at 0.0.
            if s_name == ".text":
                result[0] = section.get_entropy()
            elif s_name == ".data":
                result[1] = section.get_entropy()
        return result

    # Read the whole file as a list of byte values and measure its size
    def get_file_bytes_size(self, filepath):
        """Read a file and return its bytes as a list together with its size.

        Args:
            filepath: Path of the file to read in binary mode.

        Returns:
            (byteArr, fileSize): the list of byte values (ints) and the
            length of that list. The size is also printed.
        """
        # The context manager closes the file; no explicit close is needed.
        with open(filepath, "rb") as handle:
            contents = handle.read()
        # A list supports len(), unlike a lazy map object.
        byteArr = list(contents)
        fileSize = len(byteArr)
        print("파일 크기: ", fileSize)
        return byteArr, fileSize

    def cal_byteFrequency(self, byteArr, fileSize):
        """Return the relative frequency of every byte value 0..255.

        Args:
            byteArr: Iterable of byte values (ints).
            fileSize: Divisor for the counts, normally ``len(byteArr)``.

        Returns:
            A 256-element list of floats. When ``fileSize`` is 0 (empty
            file) every frequency is 0.0 instead of raising
            ZeroDivisionError.
        """
        from collections import Counter

        if not fileSize:
            return [0.0] * 256
        # Count in a single pass instead of rescanning the data 256 times.
        counts = Counter(byteArr)
        return [counts[value] / fileSize for value in range(256)]

    # 파일 전체의 엔트로피 계산
    def get_file_entropy(self,filepath):
        byteArr, fileSize = self.get_file_bytes_size(filepath)
        freqList = self.cal_byteFrequency(byteArr,fileSize)
        # Shannon entropy
        ent = 0.0
        for freq in freqList:
            if freq > 0:
                ent +=  - freq * math.log(freq, 2)

            #ent = -ent
        return [fileSize,ent]

    # 파일 버전, 제품버전, 제품이름, 회사 이름을 조회
    def get_fileinfo(self, pe):
        """Return 1 when the PE carries complete version information, else 0.

        "Complete" means the fixed file info exposes FileVersionLS/MS and
        ProductVersionLS/MS, and the first string table found contains the
        FileVersion, ProductVersion, ProductName and CompanyName entries.

        Both layouts of the parsed version resource are accepted: the
        legacy one (``pe.FileInfo[0].StringTable``, ``pe.VS_FIXEDFILEINFO``
        as a single structure, str keys) and the nested one used by current
        pefile releases (``pe.FileInfo`` as a list of lists,
        ``pe.VS_FIXEDFILEINFO`` as a list, bytes keys).

        Args:
            pe: Parsed PE object.

        Returns:
            1 if all of the above is present, otherwise 0.
        """
        string_keys = ('FileVersion', 'ProductVersion', 'ProductName', 'CompanyName')
        fixed_fields = ('FileVersionLS', 'FileVersionMS',
                        'ProductVersionLS', 'ProductVersionMS')
        try:
            fixed = pe.VS_FIXEDFILEINFO
            if isinstance(fixed, (list, tuple)):
                fixed = fixed[0]
            # Only the presence of the fields matters for this feature.
            for field in fixed_fields:
                getattr(fixed, field)

            entries = None
            for group in pe.FileInfo:
                members = group if isinstance(group, (list, tuple)) else [group]
                for member in members:
                    tables = getattr(member, 'StringTable', None)
                    if tables:
                        entries = tables[0].entries
                        break
                if entries is not None:
                    break
            if entries is None:
                return 0

            # Normalise keys to str so bytes and str tables are treated alike.
            present = {
                key.decode('utf-8', errors='replace') if isinstance(key, bytes) else key
                for key in entries
            }
        except (AttributeError, IndexError, KeyError, TypeError):
            # Missing or unexpectedly shaped version resource.
            return 0
        return int(all(key in present for key in string_keys))

    def write_csv_header(self):
        filepath = self.output + ".csv"
        HASH = ['filename(SHA-256)', 'MD5']
        header = HASH + self.IMAGE_DOS_HEADER + self.FILE_HEADER + self.OPTIONAL_HEADER + self.Derived_header
        header.append("class")
        csv_file = open(filepath, 'w')
        print("DEBUG",csv_file)
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(header)
        csv_file.close()

    # 코드의 Feature를 추출하는 함수
    def extract_all(self,filepath):
        data =[]
        #load given file
        try:
            pe = pefile.PE(filepath)
        except Exception as e:
            print("{} while opening {}".format(e,filepath))
        else:
            # 단순 파싱으로 뽑아낼 수 있는 정보를 추출함 (Raw Features)
            data += self.extract_dos_header(pe)
            data += self.extract_file_header(pe)
            data += self.extract_optional_header(pe)
            # 필드 값의 의미를 한 번 더 해석해 특징 추출 (derived features)
            #number of suspicisou sections and non-suspicsious section
            num_ss_nss = self.get_count_suspicious_sections(pe)
            data += num_ss_nss
            # check for packer and packer type
            packer = self.check_packer(filepath)

            # Appending the packer info to the rest of features
            data += packer[0]
            entropy_sections = self.get_text_data_entropy(pe)
            data += entropy_sections
            f_size_entropy = self.get_file_entropy(filepath)
            data += f_size_entropy
            fileinfo = self.get_fileinfo(pe)
            data.append(fileinfo)
            data.append(self.type)
        
        return data  

    def write_csv_data(self,data):
        filepath = self.output
        csv_file= open(filepath,"a")
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(data)
        csv_file.close()

    def getMD5(self, filepath):
        """Return the hexadecimal MD5 digest of the file at ``filepath``.

        The file is read in 8 KiB chunks so large samples are not loaded
        into memory at once.
        """
        digest = hashlib.md5()
        with open(filepath, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: stream.read(8192), b''):
                digest.update(chunk)
        return digest.hexdigest()

    def create_dataset(self):
        self.write_csv_header() # CSV 헤더 생성
        count = 0

        #run through all file of source and extract features
        for file in os.listdir(self.source):      # source_path로 지정한 경로의 모든 파일을 가져옴   
                filepath = self.source + "/" + file 
                data = self.extract_all(filepath) # 특징 추출 함수
                hash_ = self.getMD5(filepath)
                print("hash: ", hash_)
                data.insert(0, hash_)
                data.insert(0, file)

                self.write_csv_data(data)
                count += 1
                print("Successfully Data extracted and written for {}.".format(file))
                print("Processed " + str(count) + " files")
        print("File Processing Complete!!")

def main():
    """Prompt for the run parameters and build the feature dataset."""
    # Directory that holds the sample files.
    samples_dir = input("Enter the path of samples >> ")
    # Name of the CSV file that receives the extracted features.
    csv_name = input("Give file name of output file. >>")
    # Label: 1 for malware, 0 for a benign program.
    sample_label = input("Enter type of sample( malware(1)|benign(0))>>")
    extractor = pe_features(samples_dir, csv_name, sample_label)
    extractor.create_dataset()
    
# Run the interactive extractor only when executed as a script, not on import.
if __name__ == '__main__':
    main()


Enter the path of samples >> /mal
Give file name of output file. >>q
Enter type of sample( malware(1)|benign(0))>>1
DEBUG <_io.TextIOWrapper name='q.csv' mode='w' encoding='UTF-8'>
파일 크기:  168960
hash:  2337089f5225107923bd963581f8ab1e
Successfully Data extracted and written for c522418670b4efa6c754ea19bf18e60d31e8c17929038cd3f14317134230a6e6.
Processed 1 files
파일 크기:  786432
hash:  11cf5ca49a6c354eb005fb24bdf6b1f0
Successfully Data extracted and written for 4e87a0794bf73d06ac1ce4a37e33eb832ff4c89fb9e4266490c7cef9229d27a7.
Processed 2 files
파일 크기:  830728
hash:  b315c590c3ad691604597ea41f8dd84e
Successfully Data extracted and written for 37ea273266aa2d28430194fca27849170d609d338abc9c6c43c4e6be1bcf51f9.
Processed 3 files
파일 크기:  829392
hash:  78c9e98f51994a7af369db9a9ed6cdf9
Successfully Data extracted and written for 45a4bd970485ca539c95d746fbe8866f868972dcf7f1d196199ed7ea8b50be5b.
Processed 4 files
파일 크기:  403968
hash:  7fcbff331b40e7edcd4985a65a9ab621
Successfully Data extracted and 