In [6]:
import os
import pefile
#import pydasm
import time
import array
import dis
import operator
import csv
import hashlib

from itertools import chain
from capstone import *
from capstone.x86 import *

class NGRAM_features:
    """Extract mnemonic n-gram counts from PE files and write them to a CSV file.

    Attributes:
        output: Path of the CSV file written by ``write_csv_header`` and
            ``write_csv_data``.
        gram: Pattern -> count dictionary that accumulates across calls to
            ``n_grams`` made with ``ex_mode == 1``.
        imports: Initialised to an empty string; not used by the methods of
            this class.
    """

    def __init__(self, output):
        """Store the CSV output path and start with an empty shared counter."""
        self.output = output
        self.gram = dict()
        self.imports = ""

    def gen_list_n_gram(self, num, asm_list):
        """Yield consecutive slices of ``asm_list`` that are ``num`` items long.

        The slices do not overlap (the index advances by ``num`` each step),
        and the final slice is shorter than ``num`` when ``len(asm_list)`` is
        not a multiple of ``num``.
        """
        for i in range(0, len(asm_list), num):
            yield asm_list[i:i + num]

    def n_grams(self, num, asm_list, ex_mode):
        """Count the space-joined chunks produced by ``gen_list_n_gram``.

        Args:
            num: Chunk length passed to ``gen_list_n_gram``.
            asm_list: Sequence of strings (e.g. instruction mnemonics).
            ex_mode: ``1`` to add the counts to the shared ``self.gram``
                dictionary, ``0`` to count into a fresh dictionary.

        Returns:
            The dictionary that was updated (``self.gram`` itself when
            ``ex_mode == 1``).

        Raises:
            ValueError: If ``ex_mode`` is neither 0 nor 1.
        """
        if ex_mode == 1:
            gram = self.gram
        elif ex_mode == 0:
            gram = dict()
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError; fail early with a clear message instead.
            raise ValueError("ex_mode must be 0 or 1, got %r" % (ex_mode,))

        for chunk in self.gen_list_n_gram(num, asm_list):
            pattern = " ".join(chunk)
            # Start a new pattern at 0, then add this occurrence.
            gram[pattern] = gram.get(pattern, 0) + 1

        return gram

    def get_ngram_count(self, headers, grams, label):
        """Build one feature row: a count per header pattern, then the label.

        Args:
            headers: Iterable of pattern strings, in column order.
            grams: Pattern -> count dictionary for one file.
            label: Value appended as the last element of the row.

        Returns:
            A list with ``grams[pattern]`` (or 0 when the pattern is absent)
            for every pattern in ``headers``, followed by ``label``.
        """
        patterns = [grams.get(pat, 0) for pat in headers]
        patterns.append(label)
        return patterns

    def get_opcodes(self, mode, file):
        """Extract the code bytes or the instruction mnemonics of a PE file.

        The region used is the section that contains the entry point; when no
        section contains it, the region starts at the entry point and spans
        ``SizeOfCode`` bytes.

        Args:
            mode: Truthy to return the region as a list of two-character hex
                strings (one per byte); falsy to return the disassembled
                mnemonics.
            file: Path of the PE file.

        Returns:
            A list of strings as described above, or an empty list when the
            file cannot be parsed by ``pefile``.
        """
        asm = []
        try:
            pe = pefile.PE(file)
        except Exception:
            # Deliberately best-effort: inputs may be malformed samples, and
            # one unparsable file must not abort a batch run.
            print("Error!!")
            return asm

        try:
            start = pe.OPTIONAL_HEADER.AddressOfEntryPoint
            length = pe.OPTIONAL_HEADER.SizeOfCode

            for section in pe.sections:
                addr = section.VirtualAddress
                size = section.Misc_VirtualSize

                # Inclusive lower bound: an entry point located exactly at
                # the first byte of a section belongs to that section.
                if addr <= start < addr + size:
                    start = addr
                    length = size
                    break

            data = pe.get_memory_mapped_image()[start:start + length]
        finally:
            # Release the file mapping held by pefile; this matters when
            # many files are processed in one run.
            pe.close()

        if mode:
            hex_text = data.hex()
            return [hex_text[i:i + 2] for i in range(0, len(hex_text), 2)]

        # NOTE(review): always disassembles as 32-bit x86, regardless of the
        # machine type declared in the PE header.
        md = Cs(CS_ARCH_X86, CS_MODE_32)
        md.detail = False

        # 0x401000 is the address assigned to the first instruction; only
        # the mnemonics are collected, so the addresses are not used.
        asm.extend(insn.mnemonic for insn in md.disasm(data, 0x401000))
        return asm

    def getMD5(self, filepath):
        """Return the hexadecimal MD5 digest of the file at ``filepath``."""
        digest = hashlib.md5()
        with open(filepath, 'rb') as fh:
            # Read in 8 KiB chunks so large files are not loaded at once.
            for chunk in iter(lambda: fh.read(8192), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def write_csv_header(self, headers):
        """Create (or truncate) the output CSV and write the header row.

        The row is ``filename``, ``MD5``, every entry of ``headers`` and
        finally ``class``.
        """
        row = ['filename', 'MD5'] + list(headers) + ['class']
        # newline="" lets the csv module control line endings itself.
        with open(self.output, "w", newline="") as csv_file:
            csv.writer(csv_file, delimiter=',').writerow(row)

    def write_csv_data(self, data):
        """Append ``data`` as one row to the output CSV."""
        with open(self.output, "a", newline="") as csv_file:
            csv.writer(csv_file, delimiter=',').writerow(data)


def _list_files(directory):
    """Return (name, path) pairs for the regular files directly in *directory*."""
    entries = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # Skip sub-directories and other non-files: they cannot be hashed.
        if os.path.isfile(path):
            entries.append((name, path))
    return entries


def _collect_patterns(ef, directory, ngram_size, processed):
    """Add the n-grams of every file in *directory* to the shared ``ef.gram``."""
    for name, path in _list_files(directory):
        processed += 1
        print("%d file processed (%s)," % (processed, name))
        mnemonics = ef.get_opcodes(0, path)
        grams = ef.n_grams(ngram_size, mnemonics, 1)
        print("%d patterns extracted" % (len(grams)))
    return processed


def _write_feature_rows(ef, directory, headers, label, ngram_size, processed):
    """Append one CSV row (name, MD5, counts, label) per file in *directory*."""
    for name, path in _list_files(directory):
        processed += 1
        print("%d file processed (%s)," % (processed, name))
        mnemonics = ef.get_opcodes(0, path)
        grams = ef.n_grams(ngram_size, mnemonics, 0)

        row = [name, ef.getMD5(path)]
        row.extend(ef.get_ngram_count(headers, grams, label))
        ef.write_csv_data(row)
    return processed


def main(mal_path='/mal/', nor_path='/mal/', output_file="./ngram.csv",
         num_of_features=100, ngram_size=4):
    """Build an n-gram feature CSV from a malware and a normal sample directory.

    First pass: the mnemonic n-grams of every file in both directories are
    accumulated into one counter, and the ``num_of_features`` most frequent
    patterns become the CSV columns. Second pass: each file gets one row with
    its own count for every selected pattern, labelled 1 (``mal_path``) or
    0 (``nor_path``).

    Args:
        mal_path: Directory whose files are labelled 1.
        nor_path: Directory whose files are labelled 0.
        output_file: Path of the CSV file to create.
        num_of_features: Maximum number of top patterns used as columns.
        ngram_size: Number of mnemonics per pattern.
    """
    # NOTE(review): the default nor_path is identical to mal_path, so by
    # default every file is emitted twice (once per label). This looks like a
    # copy-paste slip; the intended directory is not visible here - confirm.
    print('[*] Extracting ngram patterns from files')

    ef = NGRAM_features(output_file)

    # Pass 1: accumulate pattern counts over both directories.
    processed = _collect_patterns(ef, mal_path, ngram_size, 0)
    print('- Malware Completed')
    _collect_patterns(ef, nor_path, ngram_size, processed)
    print('- Normal Completed')

    # Read the shared counter directly so that empty directories do not
    # leave the result undefined.
    print("[*] Total length of %d-gram list :" % ngram_size, len(ef.gram))

    ranked = sorted(ef.gram.items(), key=operator.itemgetter(1), reverse=True)
    print("[*] Using %s grams as features" % (num_of_features))
    # Keep only the pattern strings; counts must never end up in the header,
    # even when fewer than num_of_features patterns exist.
    headers = [pattern for pattern, _count in ranked[:num_of_features]]
    ef.write_csv_header(headers)

    print("#" * 80)

    # Pass 2: write one labelled row per file.
    processed = _write_feature_rows(ef, mal_path, headers, 1, ngram_size, 0)
    _write_feature_rows(ef, nor_path, headers, 0, ngram_size, processed)
    
# Script entry point: run the extraction, then report completion.
if "__main__" == __name__:
    main()
    print("Get N-Gram Complete!!")

[*] Extracting ngram patterns from files
1 file processed (c522418670b4efa6c754ea19bf18e60d31e8c17929038cd3f14317134230a6e6),
8773
IMAGE_SECTION_HEADER 4096 71242
4096
IMAGE_SECTION_HEADER 77824 19158
4096
IMAGE_SECTION_HEADER 98304 14504
4096
IMAGE_SECTION_HEADER 114688 62496
4096
IMAGE_SECTION_HEADER 180224 5746
150 patterns extracted
2 file processed (4e87a0794bf73d06ac1ce4a37e33eb832ff4c89fb9e4266490c7cef9229d27a7),
5280
IMAGE_SECTION_HEADER 4096 758852
4096
IMAGE_SECTION_HEADER 765952 29500
4096
IMAGE_SECTION_HEADER 798720 14764
155 patterns extracted
3 file processed (37ea273266aa2d28430194fca27849170d609d338abc9c6c43c4e6be1bcf51f9),
1488752
IMAGE_SECTION_HEADER 4096 708608
1488752
IMAGE_SECTION_HEADER 712704 778240
712704
IMAGE_SECTION_HEADER 1490944 28672
164 patterns extracted
4 file processed (45a4bd970485ca539c95d746fbe8866f868972dcf7f1d196199ed7ea8b50be5b),
4794
IMAGE_SECTION_HEADER 4096 653543
4096
IMAGE_SECTION_HEADER 659456 693
4096
IMAGE_SECTION_HEADER 663552 73896
4096

415908
IMAGE_SECTION_HEADER 4096 411884
4096
IMAGE_SECTION_HEADER 417792 4868
4096
IMAGE_SECTION_HEADER 425984 3049
4096
IMAGE_SECTION_HEADER 430080 8688
4096
IMAGE_SECTION_HEADER 442368 16
4096
IMAGE_SECTION_HEADER 446464 24
4096
IMAGE_SECTION_HEADER 450560 28620
4096
IMAGE_SECTION_HEADER 479232 41428
5755 patterns extracted
34 file processed (a50f1be63ef1f51feac8f36a1b03664c4b6e8914633f6d71c5c7007984caf93c),
13069
IMAGE_SECTION_HEADER 4096 24636
4096
IMAGE_SECTION_HEADER 32768 4680
4096
IMAGE_SECTION_HEADER 40960 108600
4096
IMAGE_SECTION_HEADER 151552 81920
4096
IMAGE_SECTION_HEADER 233472 19232
5755 patterns extracted
35 file processed (2573b356452dd5ee24c10537fa4848d882fa40a2a8fa5a181624ba460e1f769a),
67105
IMAGE_SECTION_HEADER 4096 72267
4096
IMAGE_SECTION_HEADER 77824 256
4096
IMAGE_SECTION_HEADER 81920 20136
5755 patterns extracted
36 file processed (e796e64c5f9a7568773bd2924e992172f222957e039ab7b41ade44865d0a48e5),
6920
IMAGE_SECTION_HEADER 4096 465852
4096
IMAGE_SECTION_HEADE

11 file processed (91bfa2445d998425c81f30d293235429ca6a8c6c8f326536478952a2a6754aac),
25692
IMAGE_SECTION_HEADER 4096 39212
4096
IMAGE_SECTION_HEADER 45056 7140
4096
IMAGE_SECTION_HEADER 53248 388532
12 file processed (3329641a171508fa6b1ad7674b31431093d46be190d1a51acd77e486f42d9c8e),
51893
IMAGE_SECTION_HEADER 4096 56148
4096
IMAGE_SECTION_HEADER 61440 551176
4096
IMAGE_SECTION_HEADER 614400 262144
4096
IMAGE_SECTION_HEADER 876544 3440
13 file processed (775c7bd9e820c4dfd0fabdfeade2de901414bd46d2691ea5020a818f6a42eb83),
1471824
IMAGE_SECTION_HEADER 4096 696320
1471824
IMAGE_SECTION_HEADER 700416 774144
700416
IMAGE_SECTION_HEADER 1474560 28672
14 file processed (6f9fcfaa7d942dea200107857c51c4fbcd7ac5922f090a1b9dc91e0e67e03fa3),
285406
IMAGE_SECTION_HEADER 8192 277220
8192
IMAGE_SECTION_HEADER 286720 1024
8192
IMAGE_SECTION_HEADER 294912 12
15 file processed (b9079fb0fff9f40d7b5544f29d260b1659d8fcf019deadc72ec2c12882203a66),
102679
IMAGE_SECTION_HEADER 4096 204454
4096
IMAGE_SECTION_HE

57871
IMAGE_SECTION_HEADER 4096 146404
4096
IMAGE_SECTION_HEADER 151552 22294
4096
IMAGE_SECTION_HEADER 176128 14784
4096
IMAGE_SECTION_HEADER 192512 329156
4096
IMAGE_SECTION_HEADER 524288 11008
50 file processed (2030f0f9fa95e6e824d12664b48344c6e4fd58e607c96e6300c88a8292d1f743),
521613
IMAGE_SECTION_HEADER 4096 677061
4096
IMAGE_SECTION_HEADER 684032 192338
4096
IMAGE_SECTION_HEADER 876544 29132
4096
IMAGE_SECTION_HEADER 909312 1300
4096
IMAGE_SECTION_HEADER 913408 45240
51 file processed (240387329dee4f03f98a89a2feff9bf30dcba61fcf614cdac24129da54442762),
4768
IMAGE_SECTION_HEADER 4096 1092
4096
IMAGE_SECTION_HEADER 8192 1035
4096
IMAGE_SECTION_HEADER 12288 60
4096
IMAGE_SECTION_HEADER 16384 436
4096
IMAGE_SECTION_HEADER 20480 194
52 file processed (3a411c1f2e55d7e21318a32d1527f8ebd7ab76d873368acbb573e67b89257f5e),
7053
IMAGE_SECTION_HEADER 4096 3089
4096
IMAGE_SECTION_HEADER 8192 65780
4096
IMAGE_SECTION_HEADER 77824 720
4096
IMAGE_SECTION_HEADER 81920 25740
4096
IMAGE_SECTION_HEADE