In [13]:
pip install pefile

Note: you may need to restart the kernel to use updated packages.


In [21]:
import pefile
import joblib
import numpy as np

# Locations of the persisted artifacts; both are relative paths, so they are
# resolved against the notebook's current working directory.
MODEL_PATH = 'best_xgb_model.joblib'
SCALER_PATH = 'scaler.joblib'

# Restore the trained classifier and the feature scaler from disk.
# NOTE(review): joblib.load is pickle-based, so only load artifacts that come
# from a trusted source -- confirm the provenance of these two files.
loaded_model = joblib.load(MODEL_PATH)
loaded_scaler = joblib.load(SCALER_PATH)

# Function to extract features from a PE file
def extract_features(file_path):
    """Parse a PE file with ``pefile`` and return its numeric header/section features.

    The returned dict keeps insertion order, and the calling code converts
    ``features.values()`` directly into the model's input row. The order in
    which keys are added below is therefore part of the contract and must not
    be changed.

    Args:
        file_path: Path of the PE file to inspect.

    Returns:
        dict mapping feature name to a numeric value. Section statistics are 0
        when the file has no sections, and a data-directory feature is 0 when
        the file declares fewer directories than the index being read.

    Raises:
        Any exception raised by ``pefile.PE`` for unreadable or malformed
        input propagates unchanged (presumably ``pefile.PEFormatError`` for
        non-PE data -- confirm against the installed pefile version).
    """
    pe = pefile.PE(file_path)
    try:
        dos_header = pe.DOS_HEADER
        file_header = pe.FILE_HEADER
        optional_header = pe.OPTIONAL_HEADER

        # Extract relevant features (you might need to adjust this based on your dataset)
        features = {
            # DOS header
            'e_magic': dos_header.e_magic,
            'e_cblp': dos_header.e_cblp,
            'e_cp': dos_header.e_cp,
            'e_crlc': dos_header.e_crlc,
            'e_cparhdr': dos_header.e_cparhdr,
            'e_minalloc': dos_header.e_minalloc,
            'e_maxalloc': dos_header.e_maxalloc,
            'e_ss': dos_header.e_ss,
            'e_sp': dos_header.e_sp,
            'e_csum': dos_header.e_csum,
            'e_ip': dos_header.e_ip,
            'e_cs': dos_header.e_cs,
            'e_lfarlc': dos_header.e_lfarlc,
            'e_ovno': dos_header.e_ovno,
            'e_oemid': dos_header.e_oemid,
            'e_oeminfo': dos_header.e_oeminfo,
            'e_lfanew': dos_header.e_lfanew,
            # File (COFF) header
            'Machine': file_header.Machine,
            'NumberOfSections': file_header.NumberOfSections,
            'TimeDateStamp': file_header.TimeDateStamp,
            'PointerToSymbolTable': file_header.PointerToSymbolTable,
            'NumberOfSymbols': file_header.NumberOfSymbols,
            'SizeOfOptionalHeader': file_header.SizeOfOptionalHeader,
            'Characteristics': file_header.Characteristics,
            # Optional header
            'Magic': optional_header.Magic,
            'MajorLinkerVersion': optional_header.MajorLinkerVersion,
            'MinorLinkerVersion': optional_header.MinorLinkerVersion,
            'SizeOfCode': optional_header.SizeOfCode,
            'SizeOfInitializedData': optional_header.SizeOfInitializedData,
            'SizeOfUninitializedData': optional_header.SizeOfUninitializedData,
            'AddressOfEntryPoint': optional_header.AddressOfEntryPoint,
            'BaseOfCode': optional_header.BaseOfCode,
            'ImageBase': optional_header.ImageBase,
            'SectionAlignment': optional_header.SectionAlignment,
            'FileAlignment': optional_header.FileAlignment,
            'MajorOperatingSystemVersion': optional_header.MajorOperatingSystemVersion,
            'MinorOperatingSystemVersion': optional_header.MinorOperatingSystemVersion,
            'MajorImageVersion': optional_header.MajorImageVersion,
            'MinorImageVersion': optional_header.MinorImageVersion,
            'MajorSubsystemVersion': optional_header.MajorSubsystemVersion,
            'MinorSubsystemVersion': optional_header.MinorSubsystemVersion,
            'SizeOfHeaders': optional_header.SizeOfHeaders,
            'CheckSum': optional_header.CheckSum,
            'SizeOfImage': optional_header.SizeOfImage,
            'Subsystem': optional_header.Subsystem,
            'DllCharacteristics': optional_header.DllCharacteristics,
            'SizeOfStackReserve': optional_header.SizeOfStackReserve,
            'SizeOfStackCommit': optional_header.SizeOfStackCommit,
            'SizeOfHeapReserve': optional_header.SizeOfHeapReserve,
            'SizeOfHeapCommit': optional_header.SizeOfHeapCommit,
            'LoaderFlags': optional_header.LoaderFlags,
            'NumberOfRvaAndSizes': optional_header.NumberOfRvaAndSizes,
            # NOTE(review): these two are still hard-coded placeholders; they
            # need the same definition that was used to build the training set.
            'SuspiciousImportFunctions': 0,  # Replace with actual value
            'SuspiciousNameSection': 0,  # Replace with actual value
        }

        # Section statistics. Each per-section column is collected once, and
        # min/max fall back to 0 so a PE with no sections does not raise
        # ValueError.
        sections = pe.sections
        entropies = [s.get_entropy() for s in sections]
        raw_sizes = [s.SizeOfRawData for s in sections]
        virtual_sizes = [s.Misc_VirtualSize for s in sections]
        virtual_addresses = [s.VirtualAddress for s in sections]
        raw_pointers = [s.PointerToRawData for s in sections]
        characteristics = [s.Characteristics for s in sections]

        features['SectionsLength'] = len(sections)
        features['SectionMinEntropy'] = min(entropies, default=0)
        features['SectionMaxEntropy'] = max(entropies, default=0)
        features['SectionMinRawsize'] = min(raw_sizes, default=0)
        features['SectionMaxRawsize'] = max(raw_sizes, default=0)
        features['SectionMinVirtualsize'] = min(virtual_sizes, default=0)
        features['SectionMaxVirtualsize'] = max(virtual_sizes, default=0)
        # "Physical" is read from SizeOfRawData, i.e. the same source as the
        # Rawsize pair above; kept as-is so the columns match the original.
        features['SectionMinPhysical'] = min(raw_sizes, default=0)
        features['SectionMaxPhysical'] = max(raw_sizes, default=0)
        features['SectionMinVirtual'] = min(virtual_addresses, default=0)
        features['SectionMaxVirtual'] = max(virtual_addresses, default=0)
        features['SectionMinPointerData'] = min(raw_pointers, default=0)
        features['SectionMaxPointerData'] = max(raw_pointers, default=0)
        features['SectionMinChar'] = min(characteristics, default=0)
        features['SectionMaxChar'] = max(characteristics, default=0)
        # Assuming the first section is the main section
        features['SectionMainChar'] = characteristics[0] if characteristics else 0

        # Directory entries: (feature name, DATA_DIRECTORY index, field read).
        # Indices follow the original code: 0 export, 1 import, 2 resource,
        # 3 exception, 4 security.
        directories = optional_header.DATA_DIRECTORY
        directory_features = (
            ('DirectoryEntryImport', 1, 'VirtualAddress'),
            ('DirectoryEntryImportSize', 1, 'Size'),
            ('DirectoryEntryExport', 0, 'VirtualAddress'),
            ('ImageDirectoryEntryExport', 0, 'VirtualAddress'),
            ('ImageDirectoryEntryImport', 1, 'VirtualAddress'),
            ('ImageDirectoryEntryResource', 2, 'VirtualAddress'),
            ('ImageDirectoryEntryException', 3, 'VirtualAddress'),
            ('ImageDirectoryEntrySecurity', 4, 'VirtualAddress'),
        )
        for feature_name, index, field in directory_features:
            # The list can be shorter than the indices read here; report 0
            # instead of raising IndexError.
            if index < len(directories):
                features[feature_name] = getattr(directories[index], field)
            else:
                features[feature_name] = 0

        return features
    finally:
        # Always release the file, including when extraction fails midway.
        pe.close()

# Path to the file you want to check
file_to_check = 'malwares/1.exe'

# Extract features from the file. This is done once: parsing the PE again
# would only repeat the same work and print every feature a second time.
file_features = extract_features(file_to_check)

# Print the features
print("Extracted Features:")
for feature_name, feature_value in file_features.items():
    print(f"{feature_name}: {feature_value}")

# Ensure the feature array has the same shape as the training data.
# NOTE(review): only the *number* of features is checked here; the values are
# passed positionally, so the dict's key order presumably has to match the
# column order used at training time -- confirm against the training notebook.
if len(file_features) == len(loaded_model.feature_importances_):
    # Convert the features to a single-row 2-D float array (1, n_features)
    file_features_array = np.array(list(file_features.values()), dtype=float).reshape(1, -1)

    # Scale the features using the loaded scaler
    scaled_features = loaded_scaler.transform(file_features_array)

    # Use the trained model to predict if the file is malware or not
    prediction = loaded_model.predict(scaled_features)

    print("\nPrediction:")
    # Label 1 is reported as malware, anything else as benign.
    if prediction[0] == 1:
        print("The file is predicted to be malware.")
    else:
        print("The file is predicted to be benign.")
else:
    print("Feature extraction error: Ensure the extracted features match the training data.")

e_magic: 23117
e_cblp: 144
e_cp: 3
e_crlc: 0
e_cparhdr: 4
e_minalloc: 0
e_maxalloc: 65535
e_ss: 0
e_sp: 184
e_csum: 0
e_ip: 0
e_cs: 0
e_lfarlc: 64
e_ovno: 0
e_oemid: 0
e_oeminfo: 0
e_lfanew: 256
Machine: 332
NumberOfSections: 5
TimeDateStamp: 1359380869
PointerToSymbolTable: 0
NumberOfSymbols: 0
SizeOfOptionalHeader: 224
Characteristics: 271
Magic: 267
MajorLinkerVersion: 6
MinorLinkerVersion: 0
SizeOfCode: 82432
SizeOfInitializedData: 23072
SizeOfUninitializedData: 0
AddressOfEntryPoint: 73984
BaseOfCode: 704
ImageBase: 4194304
SectionAlignment: 16
FileAlignment: 16
MajorOperatingSystemVersion: 4
MinorOperatingSystemVersion: 0
MajorImageVersion: 0
MinorImageVersion: 0
MajorSubsystemVersion: 4
MinorSubsystemVersion: 0
SizeOfHeaders: 704
CheckSum: 0
SizeOfImage: 106208
Subsystem: 2
DllCharacteristics: 0
SizeOfStackReserve: 1048576
SizeOfStackCommit: 4096
SizeOfHeapReserve: 1048576
SizeOfHeapCommit: 4096
LoaderFlags: 0
NumberOfRvaAndSizes: 16
SuspiciousImportFunctions: 0
SuspiciousNameSe