In [1]:
import yara
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
import os
import re
import subprocess

SAMPLE_PATH = '/data/samples'
SIG_FILE = 'network_detect.yara'
SOURCE_FILE = 'network.go'

# Testing golang arch/platform compilation on Yara signature efficacy

In [2]:
# this calls the go binary to get all the supported architectures and platforms
# you should get 30-50 depending on version you are running
def get_arch_plat():
    p = subprocess.Popen (['go',  'tool', 'dist', 'list'], stdout=subprocess.PIPE)
    return [
        x.decode('utf-8').split('/')
        for x in p.stdout.read().split()
    ]
    
arch_plats = get_arch_plat()
print(f"Got {len(arch_plats)} plats") 

Got 46 plats


In [3]:
# This loops over the available arch/plats and builds the local file network.go
# network.go contains a number of strings and identifiers that malware analysts would normally signature on

# GOARCH=arm64 go build -ldflags "-s -w" -o test.exe network.go  
for (plat, arch) in arch_plats:
    filename = f"network_{plat}_{arch}.out"
    print(f"making {filename}")
    myenv = os.environ.copy()
    myenv['GOOS'] = plat
    myenv['GOARCH'] = arch
    p = subprocess.run(
        ['go', 'build', '-ldflags=-buildid=', '-o', f"/data/samples/{filename}", f"/data/{SOURCE_FILE}"],
        env=myenv,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    #print(p.stderr)

making network_aix_ppc64.out
making network_android_386.out
making network_android_amd64.out
making network_android_arm.out
making network_android_arm64.out
making network_darwin_amd64.out
making network_darwin_arm64.out
making network_dragonfly_amd64.out
making network_freebsd_386.out
making network_freebsd_amd64.out
making network_freebsd_arm.out
making network_freebsd_arm64.out
making network_illumos_amd64.out
making network_ios_amd64.out
making network_ios_arm64.out
making network_js_wasm.out
making network_linux_386.out
making network_linux_amd64.out
making network_linux_arm.out
making network_linux_arm64.out
making network_linux_loong64.out
making network_linux_mips.out
making network_linux_mips64.out
making network_linux_mips64le.out
making network_linux_mipsle.out
making network_linux_ppc64.out
making network_linux_ppc64le.out
making network_linux_riscv64.out
making network_linux_s390x.out
making network_netbsd_386.out
making network_netbsd_amd64.out
making network_netbsd_arm.o

In [4]:
# Here we go through every file, run yara and capture a dictionary of results for later analysis

filename_parser = re.compile(r'network\_(\w+)\_(\w+)\..+', re.S)

def process_file(yara, filename):
    m = filename_parser.match(filename)
    arch = m.group(2)
    plat = m.group(1)
    return {
        'filename': filename,
        'yara': y.match(f"{SAMPLE_PATH}/{filename}"),
        'os': plat,
        'arch': arch,
    }     

y = yara.compile(SIG_FILE)
files = os.listdir(SAMPLE_PATH)
print(f"Found {len(files)} samples to scan")

# then convert the results to a dataframe for easier viewing and analysis
df = pd.DataFrame([
    process_file(y, x)
    for x in files
])
df.head()

Found 41 samples to scan


Unnamed: 0,filename,yara,os,arch
0,network_linux_riscv64.out,"[network_detect_partial_GET, network_detect_fu...",linux,riscv64
1,network_linux_loong64.out,"[network_detect_partial_GET, network_detect_fu...",linux,loong64
2,network_android_arm64.out,"[network_detect_partial_GET, network_detect_fu...",android,arm64
3,network_illumos_amd64.out,"[network_detect_partial_GET, network_detect_c2...",illumos,amd64
4,network_darwin_amd64.out,"[network_detect_partial_GET, network_detect_c2...",darwin,amd64


In [5]:
# here we take each detection from the yara column and one-hot encode them to easily tell
# which strings are detected and which are not
mlb = MultiLabelBinarizer(sparse_output=True)

# let's one-hot encode the yara column for easier analysis
df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('yara')),
                index=df.index,
                columns=mlb.classes_))

In [6]:
# helper functions for pretty printing results
# colors the cells diff colors based on detection
def yara_detection(v):
    if v == 1:
        return "Detected"
    if v == 0:
        return "Not Detected"
    return v

def make_pretty(styler):
    styler.set_caption("Yara Detection Matrix")
    styler.format(yara_detection)
    styler.background_gradient(axis=None, vmin=0, vmax=1, cmap="YlGnBu")
    return styler

df.style.pipe(make_pretty)

Unnamed: 0,filename,os,arch,detect_go_binary_buildid,network_detect_c2_domain,network_detect_c2_ipv6,network_detect_error_message_string,network_detect_full_GET,network_detect_full_GET_fullword,network_detect_magic,network_detect_partial_GET
0,network_linux_riscv64.out,linux,riscv64,Detected,Detected,Detected,Detected,Detected,Not Detected,Not Detected,Detected
1,network_linux_loong64.out,linux,loong64,Detected,Detected,Detected,Detected,Detected,Not Detected,Not Detected,Detected
2,network_android_arm64.out,android,arm64,Detected,Detected,Detected,Detected,Detected,Not Detected,Not Detected,Detected
3,network_illumos_amd64.out,illumos,amd64,Detected,Detected,Detected,Detected,Not Detected,Not Detected,Detected,Detected
4,network_darwin_amd64.out,darwin,amd64,Detected,Detected,Detected,Detected,Not Detected,Not Detected,Detected,Detected
5,network_windows_386.out,windows,386,Detected,Detected,Detected,Detected,Detected,Not Detected,Detected,Detected
6,network_freebsd_386.out,freebsd,386,Detected,Detected,Detected,Detected,Detected,Not Detected,Detected,Detected
7,network_aix_ppc64.out,aix,ppc64,Detected,Detected,Detected,Detected,Detected,Not Detected,Not Detected,Detected
8,network_plan9_386.out,plan9,386,Detected,Detected,Detected,Detected,Detected,Not Detected,Detected,Detected
9,network_openbsd_386.out,openbsd,386,Detected,Detected,Detected,Detected,Detected,Not Detected,Detected,Detected


In [7]:
# finally we can do some manual "hunting" for the magic string that is only detected on x86
# we can see from the output here that for arm/risc/ppc the value is little endian, and for x86 it's big endian

for file in files:
    with open(f"{SAMPLE_PATH}/{file}", "rb") as f:
        data = f.read()
    if data.find(b'\x19\x80\x14\x06') > 0:
        print(f"Found big endian magic in {file}")
    if data.find(b'\x06\x14\x80\x19') > 0 or data.find(b'\x14\x06\x19\x80'): # byte and word swap'd for arch
        print(f"Found little endian magic in {file}")        

Found little endian magic in network_linux_riscv64.out
Found little endian magic in network_linux_loong64.out
Found little endian magic in network_android_arm64.out
Found big endian magic in network_illumos_amd64.out
Found little endian magic in network_illumos_amd64.out
Found big endian magic in network_darwin_amd64.out
Found little endian magic in network_darwin_amd64.out
Found big endian magic in network_windows_386.out
Found little endian magic in network_windows_386.out
Found big endian magic in network_freebsd_386.out
Found little endian magic in network_freebsd_386.out
Found little endian magic in network_aix_ppc64.out
Found big endian magic in network_plan9_386.out
Found little endian magic in network_plan9_386.out
Found big endian magic in network_openbsd_386.out
Found little endian magic in network_openbsd_386.out
Found big endian magic in network_netbsd_386.out
Found little endian magic in network_netbsd_386.out
Found little endian magic in network_openbsd_mips64.out
Found l