# Laboratorio #4 - Familias de Malware

# Parte 1

In [1]:
import os
import pefile
import pandas as pd
import hashlib
import re

## Creación del dataset

In [3]:
MALWARE_DIR = "./MALWR"
malware_files = [os.path.join(MALWARE_DIR, f) for f in os.listdir(MALWARE_DIR) if os.path.isfile(os.path.join(MALWARE_DIR, f))]

print(f"🔍 Se encontraron {len(malware_files)} archivos de malware en el directorio.")

🔍 Se encontraron 41 archivos de malware en el directorio.


In [6]:
def detect_packed(pe):
    packed_sections = [b"UPX0", b"UPX1", b"UPX2"]
    for section in pe.sections:
        if section.Name.strip() in packed_sections:
            return "UPX Detected"
    return "Not Packed"

In [5]:
def get_sha256(file_path):
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

In [7]:
def get_imported_functions(pe):
    imported_functions = []
    if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            for function in entry.imports:
                imported_functions.append(function.name.decode("utf-8") if function.name else "N/A")
    return ", ".join(imported_functions)

In [8]:
def detect_suspicious_sections(pe):
    for section in pe.sections:
        name = section.Name.decode().strip()
        size = section.SizeOfRawData
        
        if name == ".text" and size < 1024:
            return "Small .text (Possible Packing)"
        if name == ".rsrc" and size > 1000000:
            return "Large .rsrc (Possible Malware)"
    return "Normal"

In [9]:
def extract_strings(file_path):
    with open(file_path, "rb") as f:
        data = f.read()
    
    strings = re.findall(b"[ -~]{4,}", data)
    decoded_strings = [s.decode("utf-8", "ignore") for s in strings]
    
    # Filtrar strings sospechosas (URLs, IPs, comandos maliciosos)
    suspicious_keywords = ["http", "ftp", "cmd", "powershell", "wget", "curl", "0.0.0.0"]
    suspicious_strings = [s for s in decoded_strings if any(k in s for k in suspicious_keywords)]
    
    return ", ".join(suspicious_strings[:10])  # Limitar la salida

In [10]:
def extract_pe_info(file_path):
    try:
        pe = pefile.PE(file_path)
        
        # Información del encabezado PE
        pe_info = {
            "Filename": os.path.basename(file_path),
            "SHA256": get_sha256(file_path),
            "TimeDateStamp": pe.FILE_HEADER.TimeDateStamp,
            "Subsystem": pe.OPTIONAL_HEADER.Subsystem,
            "DLLCharacteristics": pe.OPTIONAL_HEADER.DllCharacteristics,
            "NumberOfSections": pe.FILE_HEADER.NumberOfSections,
            "Packed": detect_packed(pe),
            "Imports": get_imported_functions(pe),
            "Suspicious Sections": detect_suspicious_sections(pe),
            "Strings": extract_strings(file_path)
        }
        
        pe.close()
        return pe_info
    
    except Exception as e:
        print(f"⚠️ Error analizando {file_path}: {e}")
        return None

In [17]:

dataset = []
for file in malware_files:
    if not os.path.isfile(file):  # Ignorar archivos inexistentes
        continue
    pe_data = extract_pe_info(file)
    if pe_data:
        dataset.append(pe_data)

df = pd.DataFrame(dataset)

In [18]:
df.to_csv("malware_dataset.csv", index=False)
print("✅ Dataset guardado como 'malware_dataset.csv'")

✅ Dataset guardado como 'malware_dataset.csv'


In [19]:
df.head()

Unnamed: 0,Filename,SHA256,TimeDateStamp,Subsystem,DLLCharacteristics,NumberOfSections,Packed,Imports,Suspicious Sections,Strings
0,JH78C0A33A1B472A8C16123FD696A5CE5EBB,5566759631e5eaa2833057a5a57caa31837ddee6587386...,1242321160,2,0,3,Not Packed,"LoadLibraryA, ExitProcess, GetProcAddress, Vir...",Normal,"1http://w, *(SY)# cmd"
1,NBV_8B75BCBFF174C25A0161F30758509A44,aa5cb0e2aab4b8544d43fdf1f26db234c2039caaddb412...,1242321160,2,0,3,Not Packed,"LoadLibraryA, ExitProcess, GetProcAddress, Vir...",Normal,"1http://w, *(SY)# cmd"
2,POL55_A4F1ECC4D25B33395196B5D51A06790,57ef37c30a4a10b334a745d2f6235b4703ae99cf8e64e1...,1311923431,2,0,3,Not Packed,"CreateFileA, LocalAlloc, Sleep, CreateThread, ...",Normal,"_acmdln, https, http, YYYYYXYYYYYhttp://eads.r..."
3,6FAA4740F99408D4D2DDDD0B09BBDEFD,41e14883b3a545de6c34c1f14d3856d0e070af9aee7d3b...,1242321160,2,0,3,Not Packed,"LoadLibraryA, ExitProcess, GetProcAddress, Vir...",Normal,"1http://w, *(SY)# cmd"
4,A316D5AECA269CA865077E7FFF356E7D,ea405f3a14b77297f8b15348a83b1680e72412adc1653d...,1242321160,2,0,3,Not Packed,"LoadLibraryA, ExitProcess, GetProcAddress, Vir...",Normal,"1http://w, *(SY)# cmd"
