<a href="https://colab.research.google.com/github/chlwnsxo00/BigDataSecurity/blob/main/TeamProject_MalwareDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 드라이브 설정

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/') 

In [None]:
cd '/content/drive/Shareddrives/BigDataSecurity'

In [None]:
ls

## 모듈 불러오기

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader
import torch.utils.data
import os
import time


In [None]:
from PIL import ImageFile
from PIL import Image
ImageFile.LOAD_TRUNCATED_IMAGES = True # prevent truncate error

## 데이터셋 불러오기


In [None]:
# From https://gist.github.com/andrewjong/6b02ff237533b3b2c554701fb53d5c4d

class ImageFolderWithPaths(datasets.ImageFolder):
    """ImageFolder variant whose samples also carry their source file path.

    Each item is whatever ``datasets.ImageFolder.__getitem__`` returns, with
    the image's file path appended as one extra trailing element.
    """

    def __getitem__(self, index):
        """Return the parent's sample tuple for ``index`` plus the file path."""
        # Let the parent class produce the sample exactly as it normally would.
        sample = super().__getitem__(index)
        # The first element of each ``self.imgs`` entry is the image file path.
        image_path = self.imgs[index][0]
        return (*sample, image_path)

In [None]:
# Both splits share the same preprocessing: resize to 224x224, then convert
# the image to a tensor. Each split still gets its own Compose instance.
image_transforms = {
    split: transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    for split in ("train", "test")
}

In [None]:
# Training split: images read from ./malware/train with the "train" transform.
train_data = ImageFolderWithPaths(
    root='./malware/train',
    transform=image_transforms['train'],
)
# Training batches of 64 samples, drawn in shuffled order.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Test split: images read from ./malware/val with the "test" transform.
# NOTE(review): the folder is named "val" but the notebook treats it as the
# test set - confirm which role this data is meant to play.
test_data = ImageFolderWithPaths(
    root='./malware/val',
    transform=image_transforms['test'],
)
# Test batches of 64 samples, kept in dataset order.
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=64,
    shuffle=False,
)

In [None]:
# Show the training dataset object as the cell output.
train_data

In [None]:
# Show the test dataset object as the cell output.
test_data

In [None]:
# val_size = int(0.5 * len(test_data))
# test_size = len(test_data) - val_size

# valid_data, test_data = torch.utils.data.random_split(test_data, [val_size, test_size])

In [None]:
classes = train_data.class_to_idx # mapping of class name -> integer label index
classes

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    """Display a 3-D image tensor with matplotlib.

    The tensor's first axis is moved to the last position before plotting,
    i.e. axes are reordered from (0, 1, 2) to (1, 2, 0).
    """
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()


# Pull one batch from the (shuffled) training loader.
dataiter = iter(train_loader)
images, labels, paths = next(dataiter)

batch_size = 16  # NOTE(review): not used in this cell; train_loader uses batches of 64

# Tile the whole batch into a single grid image and display it.
grid = torchvision.utils.make_grid(images)
imshow(grid)

# Print the class name of every image in the batch, separated by spaces.
print()
labels = labels.tolist()
class_names = list(classes.keys())
class_indices = list(classes.values())
print(' '.join(class_names[class_indices.index(label)] for label in labels))

## EDA 및 전처리

### T-SNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# Load the data that t-SNE will be run on (a single batch from the loader)
dataiter = iter(train_loader)  # train_loader is the DataLoader object that loads the data
images, labels, paths = next(dataiter)

In [None]:
# TODO
# Data preprocessing
# Perform any required preprocessing here.
# Example: extract specific columns from the data

# Flatten every image in the batch into one row vector and convert to NumPy.
X = images.view(images.size(0), -1).numpy()

In [None]:
# Create the t-SNE model and fit it, projecting X down to 2 components
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
tsne_result = tsne.fit_transform(X)

In [None]:
# Visualize the t-SNE result: one point per sample, coloured by its label
plt.figure(figsize=(10, 10))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=labels, cmap='viridis')
plt.title('T-SNE Visualization')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.colorbar()
plt.show()

### GIST

In [None]:
pip install python-gist

In [None]:
pip install gists.py

In [None]:
import gist

## 모델 구성

## Machine Learning

In [None]:
# Random Forest baseline trained on flattened pixel values.
#
# RandomForestClassifier and accuracy_score are not imported anywhere else in
# this notebook, so they are imported here; without these imports the cell
# fails with a NameError.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prepare the training data
train_images = []
train_labels = []

for images, labels, paths in train_loader:
    # Flatten each image into a 1-D vector
    images = images.view(images.size(0), -1)
    train_images.append(images.numpy())
    train_labels.append(labels.numpy())

# Stack the per-batch arrays into one (samples, features) matrix / label vector
train_images = np.concatenate(train_images, axis=0)
train_labels = np.concatenate(train_labels, axis=0)

# Prepare the test data
test_images = []
test_labels = []

for images, labels, paths in test_loader:
    # Flatten each image into a 1-D vector
    images = images.view(images.size(0), -1)
    test_images.append(images.numpy())
    test_labels.append(labels.numpy())

test_images = np.concatenate(test_images, axis=0)
test_labels = np.concatenate(test_labels, axis=0)

# Create and fit the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(train_images, train_labels)

# Predict
predictions = model.predict(test_images)

# Evaluate accuracy
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

## Deep Learning

In [None]:
class Net(nn.Module):
    """Small CNN classifier: two conv + max-pool stages, then three linear layers.

    Args:
        num_classes: Size of the output layer (number of scores per image).
        input_size: ``(height, width)`` of the images the network will be fed.
            It is only used to size the first fully connected layer.
    """

    def __init__(self, num_classes=26, input_size=(224, 224)):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 5, 3)  # in_channels, out_channels, kernel_size
        self.pool = nn.MaxPool2d(3, 2)  # kernel_size, stride
        self.conv2 = nn.Conv2d(5, 10, 3)

        # Size fc1 from the actual output of the conv stack instead of a
        # hard-coded constant. For 3x224x224 inputs the flattened feature map
        # holds 10 * 53 * 53 = 28090 values; the previous constant (58320) did
        # not match and made the first forward pass fail with a shape error.
        with torch.no_grad():
            probe = torch.zeros(1, 3, *input_size)
            flat_features = self._features(probe).shape[1]

        self.fc1 = nn.Linear(flat_features, 160)  # in_features, out_features
        self.fc2 = nn.Linear(160, 120)
        self.fc3 = nn.Linear(120, num_classes)

    def _features(self, x):
        """Run both conv/pool stages and flatten all dimensions except batch."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        return torch.flatten(x, 1)

    def forward(self, x):
        """Return the raw class scores, one row of ``num_classes`` values per image."""
        x = self._features(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Use GPU if it's available # in Colab, select GPU under "Change runtime type"

In [None]:
model = Net().to(device)  # build the network and move it to the selected device

In [None]:
criterion = nn.CrossEntropyLoss() # loss function; can be changed
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=0.001) # optimizer; can be changed
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [None]:
def calculate_topk_accuracy(y_pred, y, k = 5):
    """Return the fraction of samples whose best-ranked prediction equals the target.

    Args:
        y_pred: Score tensor with one row per sample and one column per class.
        y: Target class indices, one per sample.
        k: Number of top-scoring classes extracted per sample; must not exceed
            the number of columns in ``y_pred``.

    Returns:
        A one-element float tensor holding ``correct / batch_size``.

    NOTE(review): only the first (highest-scoring) row of the top-k matrix is
    counted, so the value is top-1 accuracy regardless of ``k``.
    """
    with torch.no_grad():
        n_samples = y.shape[0]
        # (batch, k) class indices -> (k, batch); row 0 is the best guess.
        ranked = y_pred.topk(k, 1)[1].t()
        targets = y.view(1, -1).expand_as(ranked)
        hits = ranked.eq(targets)
        best_guess_hits = hits[:1].reshape(-1).float().sum(0, keepdim = True)
        return best_guess_hits / n_samples

In [None]:
# Peek at the first training batch only: print its image tensor and its
# label tensor, then stop iterating.
for idx, x in enumerate(train_loader):
  print(x[0])
  print(x[1])

  break

In [None]:
def train(model, iterator, optimizer, criterion, device):
    """Run one optimisation pass over ``iterator`` and return mean batch metrics.

    Args:
        model: Network to optimise; it is switched to training mode first.
        iterator: Sized iterable of batches whose first two items are the
            inputs and the targets.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Loss, called as ``criterion(predictions, targets)``.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.train()

    loss_sum = 0
    acc_sum = 0

    for batch in iterator:
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_acc = calculate_topk_accuracy(outputs, targets)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    batch_count = len(iterator)
    return loss_sum / batch_count, acc_sum / batch_count

In [None]:
def evaluate(model, iterator, criterion, device):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    Args:
        model: Network to score; it is switched to evaluation mode first.
        iterator: Sized iterable of batches whose first two items are the
            inputs and the targets.
        criterion: Loss, called as ``criterion(predictions, targets)``.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            inputs = batch[0].to(device)
            targets = batch[1].to(device)

            outputs = model(inputs)

            loss_sum += criterion(outputs, targets).item()
            acc_sum += calculate_topk_accuracy(outputs, targets).item()

    batch_count = len(iterator)
    return loss_sum / batch_count, acc_sum / batch_count

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp, in seconds.
        end_time: End timestamp, in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
import time

In [None]:
# Train for a fixed number of epochs, keeping the checkpoint with the lowest
# evaluation loss seen so far.
# NOTE(review): the "valid" metrics below are computed on test_loader, so the
# checkpoint is selected on the same data the notebook calls the test set.
best_valid_loss = float('inf')
EPOCHS = 10
for epoch in range(EPOCHS):
    
    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion, device)
        
    # Save the weights only when the evaluation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')

    end_time = time.monotonic()

    # Elapsed time covers both the training pass and the evaluation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:6.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:6.2f}%')

## 모델 학습

## 모델 평가