In [1]:
import os
import pefile
import pandas as pd
import hashlib
import requests


In [6]:


def extract_features(file_path):
    """Parse a PE file and return a flat dict of header-level features.

    Parameters
    ----------
    file_path : str
        Path to the PE file to analyse.

    Returns
    -------
    dict or None
        A dict with one entry per feature (file name, section/import/export
        counts, and selected OPTIONAL_HEADER / FILE_HEADER fields), in a fixed
        key order. Returns ``None`` if the file could not be parsed or any
        field could not be read; the error is printed rather than raised so
        that one bad file does not abort a batch run.

    Notes
    -----
    ``number_of_imports`` is the length of ``pe.DIRECTORY_ENTRY_IMPORT``
    (presumably one entry per imported DLL, not per imported function —
    verify against the pefile documentation).
    """
    pe = None
    try:
        pe = pefile.PE(file_path)
        optional_header = pe.OPTIONAL_HEADER
        file_header = pe.FILE_HEADER

        # The import/export directories are only present as attributes when
        # the file actually has them, so fall back to 0 when absent.
        if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
            import_count = len(pe.DIRECTORY_ENTRY_IMPORT)
        else:
            import_count = 0
        if hasattr(pe, 'DIRECTORY_ENTRY_EXPORT'):
            export_count = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        else:
            export_count = 0

        # Key order is preserved on purpose: it determines the column order
        # of any DataFrame/CSV built from these dicts.
        features = {
            "file_name": os.path.basename(file_path),
            "number_of_sections": len(pe.sections),
            "number_of_imports": import_count,
            "number_of_exports": export_count,
            "entry_point": optional_header.AddressOfEntryPoint,
            "image_base": optional_header.ImageBase,
            "file_size": os.path.getsize(file_path),
            "timestamp": file_header.TimeDateStamp,
            "number_of_rva_and_sizes": optional_header.NumberOfRvaAndSizes,
            "check_sum": optional_header.CheckSum,
            "dll_characteristics": optional_header.DllCharacteristics,
            "size_of_code": optional_header.SizeOfCode,
            "size_of_initialized_data": optional_header.SizeOfInitializedData,
            "size_of_uninitialized_data": optional_header.SizeOfUninitializedData,
            "section_alignment": optional_header.SectionAlignment,
            "file_alignment": optional_header.FileAlignment,
            "size_of_image": optional_header.SizeOfImage,
            "size_of_headers": optional_header.SizeOfHeaders,
            "characteristics": file_header.Characteristics,
            "machine": file_header.Machine,
            "number_of_symbols": file_header.NumberOfSymbols
        }

        return features
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller skip
        # this file.
        print(f"Error processing file {file_path}: {e}")
        return None
    finally:
        # Release the underlying file resources whether parsing succeeded or
        # failed; otherwise every processed file stays open for the lifetime
        # of the kernel.
        if pe is not None:
            pe.close()



In [7]:
def get_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte blocks so the
    whole file never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

In [8]:
def get_virustotal_label(file_hash, api_key, threshold=1, timeout=30):
    """Look up a file hash on VirusTotal and map the result to a label.

    Parameters
    ----------
    file_hash : str
        Hash of the file, sent as the ``resource`` query parameter.
    api_key : str
        VirusTotal API key, sent as the ``apikey`` query parameter.
    threshold : int, optional
        The file is labelled ``"malware"`` only when the reported number of
        positives is strictly greater than this value. Defaults to 1, which
        is an arbitrary cut-off.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot hang the run indefinitely.

    Returns
    -------
    str
        ``"malware"`` or ``"benign"``.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status, instead of the
        error body being silently interpreted as a "benign" verdict.

    Notes
    -----
    A response without a ``positives`` field is treated as 0 positives and is
    therefore labelled ``"benign"``. NOTE(review): presumably hashes unknown
    to VirusTotal fall into this case — confirm, as such files are not
    actually known to be benign.
    """
    # Let requests build and URL-encode the query string rather than
    # interpolating the values into the URL by hand.
    response = requests.get(
        "https://www.virustotal.com/vtapi/v2/file/report",
        params={"apikey": api_key, "resource": file_hash},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    return "malware" if data.get('positives', 0) > threshold else "benign"


In [9]:
import requests


# Pipeline entry point: extract features from each PE file, label each one via VirusTotal, and write the results to a CSV file.
def main(directory="./", api_key=None, output_csv="pe_features_labeled-exe.csv"):
    """Extract features from every ``.exe`` in a directory, label them, save a CSV.

    Parameters
    ----------
    directory : str, optional
        Directory scanned (non-recursively) for files whose name ends in
        ``.exe``, ignoring case. Defaults to the current directory.
    api_key : str, optional
        VirusTotal API key. When omitted, it is read from the ``VT_API_KEY``
        environment variable so the secret never has to live in the notebook.
    output_csv : str, optional
        Path of the CSV file to write.

    Raises
    ------
    RuntimeError
        If no API key is supplied and ``VT_API_KEY`` is not set.
    """
    # Resolve the credential first so a missing key fails fast, before any
    # file parsing or network traffic.
    if api_key is None:
        api_key = os.environ.get("VT_API_KEY")
    if not api_key:
        raise RuntimeError(
            "VirusTotal API key not provided: pass api_key or set the "
            "VT_API_KEY environment variable"
        )

    # Sorted so the row order of the output is reproducible across runs.
    files = [
        os.path.join(directory, f)
        for f in sorted(os.listdir(directory))
        if f.lower().endswith('.exe')
    ]

    all_features = []
    for file in files:
        features = extract_features(file)
        # Files that could not be parsed yield None and are skipped.
        if features:
            features['hash'] = get_file_hash(file)
            all_features.append(features)

    # Labelling is a separate pass so that all local parsing has finished
    # before any network request is made.
    for features in all_features:
        features['label'] = get_virustotal_label(features['hash'], api_key)

    df = pd.DataFrame(all_features)
    df.to_csv(output_csv, index=False)

# Run the feature-extraction and labelling pipeline when this script/cell is executed directly.
if __name__ == "__main__":
    main()

In [10]:

# 1. **`file_name`**: The name of the file. It's a basic identifier but not typically used for analysis.
   
#    _Example_: `"example.exe"`

# 2. **`number_of_sections`**: The count of sections in the PE file. Malware might have an unusual number of sections.

#    _Example_: A typical PE file might have sections like `.text`, `.data`, `.rdata`, etc. An unusually high number might be suspicious.

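# 3. **`average_entropy`**: Entropy is a measure of randomness or complexity. High entropy can indicate encryption or packing, common in malware to evade detection. Note: `extract_features` above does not currently compute this feature, so it does not appear in the output CSV.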

#    _Example_: An entropy value close to 8 might suggest encryption.

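# 4. **`number_of_imports`**: The number of entries in the PE import directory. As computed here (`len(pe.DIRECTORY_ENTRY_IMPORT)`), this counts import-directory entries (imported DLLs) rather than individual imported functions. Malware often uses certain API calls that are rare in benign software.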

#    _Example_: A high number of system-level operations might be a red flag.

# 5. **`number_of_exports`**: The number of exported functions. Unusual export patterns can be indicative of malicious intent.

#    _Example_: Malware designed as a DLL might export suspicious functions.

# 6. **`entry_point`**: The memory address where the execution starts. Malware might alter this to disrupt execution flow.

#    _Example_: An entry point outside standard sections like `.text` could be suspicious.

# 7. **`image_base`**: The preferred address of the first byte of the image when loaded into memory. Malware might use non-standard image bases.

#    _Example_: A standard image base is `0x00400000` for 32-bit and `0x140000000` for 64-bit executables.

# 8. **`file_size`**: The size of the file in bytes. Extremely large or small sizes can be noteworthy in analysis.

#    _Example_: A very small `.exe` file might be a wrapper or downloader for other malicious payloads.

# 9. **`timestamp`**: The creation time of the file according to the PE header. Fake or unusual timestamps can be a tactic used by malware authors.

#    _Example_: A timestamp that predates the known existence of certain APIs or technologies used in the file could indicate tampering.

# 10. **`number_of_rva_and_sizes`**: Count of the data directory entries in the optional header. Anomalies here might suggest tampering.

#     _Example_: A number different from 16 (the standard for Windows executables) might be unusual.

# 11. **`check_sum`**: A checksum of the image. Can be used to detect corruption or modifications.

#     _Example_: A mismatch between the calculated and stored checksum can indicate that the file has been altered.

# 12. **`dll_characteristics`**: Flags that indicate certain characteristics of the DLL. Some flags might be more common in malicious files.

#     _Example_: The presence of `IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE` suggests the use of ASLR, a security feature.

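# 13. **`size_of_code`**: Size of the code (text) section, or the sum of all code sections if there is more than one (the `SizeOfCode` field of the optional header).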

#     _Example_: An abnormally large code section might contain packed or obfuscated code.

# 14. **`size_of_initialized_data`**: Size of the initialized data section.

#     _Example_: Malware might have unusually sized data sections to hide payloads.

# 15. **`size_of_uninitialized_data`**: Size of the uninitialized data section.

# 16. **`section_alignment`**: Alignment of sections in memory. Non-standard alignments can be a sign of packing or obfuscation.

# 17. **`file_alignment`**: Alignment of sections in the file. Similar to section alignment, non-standard values can indicate tampering.

# 18. **`size_of_image`**: The size of the image in memory, including all headers and sections.

# 19. **`size_of_headers`**: The combined size of the PE headers and the section table, indicating how much space the headers take.

# 20. **`characteristics`**: Flags that define the characteristics of the image. For example, whether it's a system file, a DLL, etc.

#     _Example_: The `IMAGE_FILE_DLL` flag indicates the file is a DLL.

# 21. **`machine`**: The architecture type for which the file is compiled.

#     _Example_: Common values are `0x14c` (Intel 386) and `0x8664` (x64).

# 22. **`number_of_symbols`**: Number of symbols in the symbol table, mainly used in debugging.

# Each of these features provides insights into the PE file's structure and behavior, which can be crucial for malware analysis. 
# By comparing these features against known benign and malicious characteristics, it's possible to classify files as potentially malicious or benign.