# Laboratorio #4 - Familias de Malware

# Parte 1

In [26]:
import os
import pefile
import pandas as pd
import hashlib
import re
from datetime import datetime, timezone

## Creación del dataset

In [3]:
MALWARE_DIR = "./MALWR"
malware_files = [os.path.join(MALWARE_DIR, f) for f in os.listdir(MALWARE_DIR) if os.path.isfile(os.path.join(MALWARE_DIR, f))]

print(f"🔍 Se encontraron {len(malware_files)} archivos de malware en el directorio.")

🔍 Se encontraron 41 archivos de malware en el directorio.


In [29]:
def convert_timestamp(timestamp):
    try:
        return datetime.fromtimestamp(timestamp, timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
    except:
        return "Invalid Timestamp"

# 📌 Función para calcular el SHA-256 del archivo
def get_sha256(file_path):
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

# 📌 Función para detectar empaquetado (UPX u otros)
def detect_packed(pe):
    packed_sections = [b"UPX0", b"UPX1", b"UPX2"]
    for section in pe.sections:
        if section.Name.strip() in packed_sections:
            return "UPX Detected"
    return "Not Packed"

# 📌 Función para obtener funciones importadas y APIs sospechosas
suspicious_api_calls = {
    "process hollowing": ["CreateProcess", "GetProcAddress", "VirtualAllocEx", "WriteProcessMemory"],
    "create remote thread": ["OpenProcess", "CreateRemoteThread", "WriteProcessMemory"],
    "networking": ["WSAStartup", "socket", "bind", "connect", "recv"],
    "code injection": ["VirtualProtect", "VirtualAlloc", "LoadLibrary"]
}

def get_imported_functions(pe):
    imported_functions = []
    detected_suspicious_apis = set()

    if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            for function in entry.imports:
                func_name = function.name.decode("utf-8") if function.name else "N/A"
                imported_functions.append(func_name)

                # Verificamos si la función pertenece a alguna categoría sospechosa
                for category, apis in suspicious_api_calls.items():
                    if func_name in apis:
                        detected_suspicious_apis.add(func_name)

    return ", ".join(imported_functions), ", ".join(detected_suspicious_apis) if detected_suspicious_apis else "None"

# 📌 Función para detectar secciones sospechosas
def detect_suspicious_sections(pe):
    for section in pe.sections:
        name = section.Name.decode().strip()
        size = section.SizeOfRawData
        
        if name == ".text" and size < 1024:
            return "Small .text (Possible Packing)"
        if name == ".rsrc" and size > 1000000:
            return "Large .rsrc (Possible Malware)"
    return "Normal"

# 📌 Función para extraer strings sospechosas (URLs, IPs, comandos)
def extract_strings(file_path):
    with open(file_path, "rb") as f:
        data = f.read()
    
    strings = re.findall(b"[ -~]{4,}", data)
    decoded_strings = [s.decode("utf-8", "ignore") for s in strings]

    suspicious_keywords = ["http", "ftp", "cmd", "powershell", "wget", "curl", "0.0.0.0"]
    suspicious_strings = [s for s in decoded_strings if any(k in s for k in suspicious_keywords)]
    
    return ", ".join(suspicious_strings[:10])  # Máximo 10 resultados

In [30]:
def extract_pe_info(file_path):
    try:
        pe = pefile.PE(file_path)
        
        # Información del encabezado PE
        pe_info = {
            "Filename": os.path.basename(file_path),
            "SHA256": get_sha256(file_path),
            "TimeDateStamp": convert_timestamp(pe.FILE_HEADER.TimeDateStamp),
            "Subsystem": pe.OPTIONAL_HEADER.Subsystem,
            "DLLCharacteristics": pe.OPTIONAL_HEADER.DllCharacteristics,
            "NumberOfSections": pe.FILE_HEADER.NumberOfSections,
            "Packed": detect_packed(pe),
            "Imports": get_imported_functions(pe),
            "Suspicious Sections": detect_suspicious_sections(pe),
            "Strings": extract_strings(file_path)
        }
        
        pe.close()
        return pe_info
    
    except Exception as e:
        print(f"⚠️ Error analizando {file_path}: {e}")
        return None

In [31]:

dataset = []
for file in malware_files:
    if not os.path.isfile(file):  # Ignorar archivos inexistentes
        continue
    pe_data = extract_pe_info(file)
    if pe_data:
        dataset.append(pe_data)

df = pd.DataFrame(dataset)

In [32]:
df.to_csv("malware_dataset.csv", index=False)
print("✅ Dataset guardado como 'malware_dataset.csv'")

✅ Dataset guardado como 'malware_dataset.csv'


In [33]:
df.head()

Unnamed: 0,Filename,SHA256,TimeDateStamp,Subsystem,DLLCharacteristics,NumberOfSections,Packed,Imports,Suspicious Sections,Strings
0,JH78C0A33A1B472A8C16123FD696A5CE5EBB,5566759631e5eaa2833057a5a57caa31837ddee6587386...,2009-05-14 17:12:40,2,0,3,Not Packed,"(LoadLibraryA, ExitProcess, GetProcAddress, Vi...",Normal,"1http://w, *(SY)# cmd"
1,NBV_8B75BCBFF174C25A0161F30758509A44,aa5cb0e2aab4b8544d43fdf1f26db234c2039caaddb412...,2009-05-14 17:12:40,2,0,3,Not Packed,"(LoadLibraryA, ExitProcess, GetProcAddress, Vi...",Normal,"1http://w, *(SY)# cmd"
2,POL55_A4F1ECC4D25B33395196B5D51A06790,57ef37c30a4a10b334a745d2f6235b4703ae99cf8e64e1...,2011-07-29 07:10:31,2,0,3,Not Packed,"(CreateFileA, LocalAlloc, Sleep, CreateThread,...",Normal,"_acmdln, https, http, YYYYYXYYYYYhttp://eads.r..."
3,6FAA4740F99408D4D2DDDD0B09BBDEFD,41e14883b3a545de6c34c1f14d3856d0e070af9aee7d3b...,2009-05-14 17:12:40,2,0,3,Not Packed,"(LoadLibraryA, ExitProcess, GetProcAddress, Vi...",Normal,"1http://w, *(SY)# cmd"
4,A316D5AECA269CA865077E7FFF356E7D,ea405f3a14b77297f8b15348a83b1680e72412adc1653d...,2009-05-14 17:12:40,2,0,3,Not Packed,"(LoadLibraryA, ExitProcess, GetProcAddress, Vi...",Normal,"1http://w, *(SY)# cmd"


## Exploración y pre-procesamiento de datos

In [37]:
df = pd.read_csv("malware_dataset.csv")

# Mostrar las primeras filas para inspección
df.head()

Unnamed: 0,Filename,SHA256,TimeDateStamp,Subsystem,DLLCharacteristics,NumberOfSections,Packed,Imports,Suspicious Sections,Strings
0,JH78C0A33A1B472A8C16123FD696A5CE5EBB,5566759631e5eaa2833057a5a57caa31837ddee6587386...,2009-05-14 17:12:40,2,0,3,Not Packed,"('LoadLibraryA, ExitProcess, GetProcAddress, V...",Normal,"1http://w, *(SY)# cmd"
1,NBV_8B75BCBFF174C25A0161F30758509A44,aa5cb0e2aab4b8544d43fdf1f26db234c2039caaddb412...,2009-05-14 17:12:40,2,0,3,Not Packed,"('LoadLibraryA, ExitProcess, GetProcAddress, V...",Normal,"1http://w, *(SY)# cmd"
2,POL55_A4F1ECC4D25B33395196B5D51A06790,57ef37c30a4a10b334a745d2f6235b4703ae99cf8e64e1...,2011-07-29 07:10:31,2,0,3,Not Packed,"('CreateFileA, LocalAlloc, Sleep, CreateThread...",Normal,"_acmdln, https, http, YYYYYXYYYYYhttp://eads.r..."
3,6FAA4740F99408D4D2DDDD0B09BBDEFD,41e14883b3a545de6c34c1f14d3856d0e070af9aee7d3b...,2009-05-14 17:12:40,2,0,3,Not Packed,"('LoadLibraryA, ExitProcess, GetProcAddress, V...",Normal,"1http://w, *(SY)# cmd"
4,A316D5AECA269CA865077E7FFF356E7D,ea405f3a14b77297f8b15348a83b1680e72412adc1653d...,2009-05-14 17:12:40,2,0,3,Not Packed,"('LoadLibraryA, ExitProcess, GetProcAddress, V...",Normal,"1http://w, *(SY)# cmd"


In [38]:
# Revisar tipos de datos de cada columna
df.info()

# Contar valores nulos en cada columna
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Filename             40 non-null     object
 1   SHA256               40 non-null     object
 2   TimeDateStamp        40 non-null     object
 3   Subsystem            40 non-null     int64 
 4   DLLCharacteristics   40 non-null     int64 
 5   NumberOfSections     40 non-null     int64 
 6   Packed               40 non-null     object
 7   Imports              40 non-null     object
 8   Suspicious Sections  40 non-null     object
 9   Strings              38 non-null     object
dtypes: int64(3), object(7)
memory usage: 3.3+ KB


Filename               0
SHA256                 0
TimeDateStamp          0
Subsystem              0
DLLCharacteristics     0
NumberOfSections       0
Packed                 0
Imports                0
Suspicious Sections    0
Strings                2
dtype: int64

In [39]:
# Revisar estadísticas de columnas numéricas
df.describe()

# Revisar estadísticas de columnas categóricas
df.describe(include="object")

Unnamed: 0,Filename,SHA256,TimeDateStamp,Packed,Imports,Suspicious Sections,Strings
count,40,40,40,40,40,40,38
unique,40,40,15,1,9,1,12
top,JH78C0A33A1B472A8C16123FD696A5CE5EBB,5566759631e5eaa2833057a5a57caa31837ddee6587386...,2009-05-14 17:12:40,Not Packed,"('LoadLibraryA, ExitProcess, GetProcAddress, V...",Normal,"1http://w, *(SY)# cmd"
freq,1,1,12,40,15,40,15


In [40]:
# Convertir columnas categóricas a valores numéricos
df["Packed"] = df["Packed"].map({"Not Packed": 0, "UPX Detected": 1})

# Convertir 'Suspicious Sections' en valores numéricos
df["Suspicious Sections"] = df["Suspicious Sections"].map({
    "Normal": 0,
    "Small .text (Possible Packing)": 1,
    "Large .rsrc (Possible Malware)": 2
})

# Codificar 'Subsystem' y 'DLLCharacteristics'
df["Subsystem"] = df["Subsystem"].astype("category").cat.codes
df["DLLCharacteristics"] = df["DLLCharacteristics"].astype("category").cat.codes

In [41]:
# Contar el número de imports por archivo
df["Num_Imports"] = df["Imports"].apply(lambda x: len(str(x).split(",")) if pd.notnull(x) else 0)

# Contar el número de strings sospechosas por archivo
df["Num_Strings"] = df["Strings"].apply(lambda x: len(str(x).split(",")) if pd.notnull(x) else 0)

# Eliminar las columnas de texto, ya que ahora tenemos variables numéricas
df.drop(["Imports", "Strings"], axis=1, inplace=True)

In [42]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
columns_to_scale = ["NumberOfSections", "Num_Imports", "Num_Strings"]
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df.head()

Unnamed: 0,Filename,SHA256,TimeDateStamp,Subsystem,DLLCharacteristics,NumberOfSections,Packed,Suspicious Sections,Num_Imports,Num_Strings
0,JH78C0A33A1B472A8C16123FD696A5CE5EBB,5566759631e5eaa2833057a5a57caa31837ddee6587386...,2009-05-14 17:12:40,0,0,0.0,0,0,0.012987,0.4
1,NBV_8B75BCBFF174C25A0161F30758509A44,aa5cb0e2aab4b8544d43fdf1f26db234c2039caaddb412...,2009-05-14 17:12:40,0,0,0.0,0,0,0.012987,0.4
2,POL55_A4F1ECC4D25B33395196B5D51A06790,57ef37c30a4a10b334a745d2f6235b4703ae99cf8e64e1...,2011-07-29 07:10:31,0,0,0.0,0,0,1.0,1.0
3,6FAA4740F99408D4D2DDDD0B09BBDEFD,41e14883b3a545de6c34c1f14d3856d0e070af9aee7d3b...,2009-05-14 17:12:40,0,0,0.0,0,0,0.012987,0.4
4,A316D5AECA269CA865077E7FFF356E7D,ea405f3a14b77297f8b15348a83b1680e72412adc1653d...,2009-05-14 17:12:40,0,0,0.0,0,0,0.012987,0.4


In [43]:
df.to_csv("malware_dataset_preprocessed.csv", index=False)
print("✅ Dataset preprocesado guardado como 'malware_dataset_preprocessed.csv'")

✅ Dataset preprocesado guardado como 'malware_dataset_preprocessed.csv'
