In [1]:
pip install scikit-image

Note: you may need to restart the kernel to use updated packages.


In [2]:
import cv2
import numpy as np
from skimage.util import view_as_blocks

# Gabor 필터 생성 함수
def create_filters(scales, orientations):
    filters = []
    for scale in range(scales[0], scales[1] + 1):
        for orientation in np.arange(0, np.pi, np.pi / orientations):
            filt_real = cv2.getGaborKernel((scale, scale), 1, orientation, scale, 0, ktype=cv2.CV_32F)
            filt_imag = cv2.getGaborKernel((scale, scale), 1, orientation, scale, 0.5 * np.pi, ktype=cv2.CV_32F)
            filt = filt_real + filt_imag
            filt /= 2.0 * np.pi * scale * scale
            filters.append(filt)
    return filters

# GIST 디스크립터 계산 함수
def gist_descriptor_single_channel(image, scales=(8, 8), orientations=8, blocks=(4, 4)):    # Gabor 필터 생성
    filters = create_filters(scales, orientations)
    
    # 이미지 크기와 블록 크기 계산
    height, width = image.shape[:2]
    block_size = height // blocks[0], width // blocks[1]

    padding_size = blocks[0] * block_size[0] - height, blocks[1] * block_size[1] - width
    
    # 이미지 패딩 (필요한 경우)
    if padding_size != (0, 0):
        image = cv2.copyMakeBorder(image, 0, padding_size[0], 0, padding_size[1], cv2.BORDER_CONSTANT, value=0)
    
    # 이미지를 블록으로 분할
    block_shape = (block_size[0], block_size[1])
    blocks = view_as_blocks(image, block_shape=(block_size[0], block_size[1])).reshape(-1, *block_size, order='F')
    
    # 각 블록의 GIST 특성 추출
    features = []
    for block in blocks:
        feats = []
        for scale in filters:
            for filt in scale:
                filtered = cv2.filter2D(block, cv2.CV_64F, filt)
                feats.append(filtered.mean())
        features.append(feats)
    
    # 전체 GIST 디스크립터로 결합
    return np.concatenate(features)


def gist_descriptor(image, scales=(8, 8), orientations=8, blocks=(4, 4)):
    if len(image.shape) == 3:
    # 각 채널에 대해 GIST 디스크립터 계산
        descriptors = [gist_descriptor_single_channel(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), scales, orientations, blocks)]            # 전체 GIST 디스크립터로 결합
        return np.concatenate(descriptors)
    else:
    # 단일 채널 이미지의 경우 GIST 디스크립터를 한 번만 계산
        return gist_descriptor_single_channel(image, scales, orientations, blocks)

In [3]:
cd /Users/parkhyunjae/Downloads/malware/semi_train

/Users/parkhyunjae/Downloads/malware/semi_train


In [4]:
# 각 악성코드 이미지 폴더에서 350개의 이미지에 대한 gist descriptor를 계산하여 반환
def get_gist_descriptors(root_dir):
    descriptors = []
    for subdir in sorted(os.listdir(root_dir)):
        subdir_path = os.path.join(root_dir, subdir)
        if os.path.isdir(subdir_path):
            print("Processing directory:", subdir_path)
            for i, filename in enumerate(os.listdir(subdir_path)):
                # 파일 경로 생성
                filepath = os.path.join(subdir_path, filename)
                # 이미지 로드
                image = cv2.imread(filepath)
                # 이미지에 대한 GIST 디스크립터 계산
                descriptor = gist_descriptor(image)
                descriptors.append(descriptor)
                
                if i % 10 == 9:
                    print("\tProcessed", i + 1, "images")
    return np.array(descriptors)


In [10]:
root_dir = "semi_train/"
# 각 폴더에서 350개의 이미지에 대한 gist descriptor 계산
descriptors = get_gist_descriptors(root_dir)
print(len(descriptors))
print('GIST Descriptor Shape:', descriptors.shape)

Processing directory: semi_train/Adposhel
	Processed 10 images
Processing directory: semi_train/Agent
Processing directory: semi_train/Dinwod
Processing directory: semi_train/Elex
Processing directory: semi_train/Vilsel
	Processed 10 images
43
GIST Descriptor Shape: (43, 1152)


In [11]:
pip install scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [15]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from xgboost import XGBClassifier

# 데이터와 레이블 설정
X = descriptors

# 레이블 기록
labels_dict = {
    'Adposhel': 0,
    'Agent': 1,
    'Dinwod': 2,
    'Elex': 3,
    'Vilsel': 4
}

# 각 이미지에 맞는 레이블 생성
y = []
for subdir in sorted(os.listdir(root_dir)):
    subdir_path = os.path.join(root_dir, subdir)
    if os.path.isdir(subdir_path):
        for i, filename in enumerate(sorted(os.listdir(subdir_path))):
            y.append(labels_dict[subdir])
y = np.array(y)

# train-test 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 모델 정의
models = {
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'Linear SVM': LinearSVC(),
    'SMO': SVC(kernel='rbf'), # SMO는 일반적으로 서포트 벡터 머신 (SVM)이 rbf 커널을 사용
    'J48': DecisionTreeClassifier() # J48는 scikit-learn에서 Decision Tree에 해당
}

# 성능 지표
scores = {'accuracy': accuracy_score, 'FPR': confusion_matrix, 'precision': precision_score, 'recall': recall_score, 'f1score': f1_score}

# 각 모델에 대해 교차 검증 및 테스트 세트에서 성능 평가
for model_name, model_instance in models.items():
    print(model_name)
    model_instance.fit(X_train_scaled, y_train)
    y_pred = model_instance.predict(X_test_scaled)
    for score_name, score_func in scores.items():
        if score_name == 'FPR':
            cm = score_func(y_test, y_pred)
            fp = cm.sum(axis=0) - np.diag(cm)
            tn = cm.sum() - (cm.sum(axis=1) + fp)
            fpr = np.mean(fp / (fp + tn))
            print(score_name, fpr)
        elif score_name == 'accuracy':
            print(score_name, score_func(y_test, y_pred))
        else:
            print(score_name, score_func(y_test, y_pred, average='weighted'))


Random Forest
accuracy 0.8888888888888888
FPR 0.02222222222222222
precision 0.8888888888888888
recall 0.8888888888888888
f1score 0.8888888888888888
XGBoost


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.7777777777777778
FPR 0.04722222222222222
precision 0.8888888888888888
recall 0.7777777777777778
f1score 0.8148148148148148
Linear SVM
accuracy 0.8888888888888888
FPR 0.03571428571428571
precision 0.8148148148148148
recall 0.8888888888888888
f1score 0.8444444444444444
SMO
accuracy 0.8888888888888888
FPR 0.02222222222222222
precision 0.8888888888888888
recall 0.8888888888888888
f1score 0.8888888888888888
J48
accuracy 0.7777777777777778
FPR 0.07142857142857142
precision 0.7037037037037037
recall 0.7777777777777778
f1score 0.7333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
