### Load the dataset


In [65]:
import os
import numpy as np
import pandas as pd


def read_binary_files(directory, label):
    data = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if file_path.endswith(".code") and os.path.isfile(file_path):
            with open(file_path, "rb") as file:
                byte_array = np.frombuffer(file.read(), dtype=np.uint8)
                data.append(
                    {
                        "filename": filename,
                        "byte_array": byte_array,
                        "instruction_set": label,
                    }
                )
    return pd.DataFrame(data)


mips_df = read_binary_files(
    "../../dataset/ISAdetect/ISAdetect_full_dataset/mips", label="mips"
)
mipsel_df = read_binary_files(
    "../../dataset/ISAdetect/ISAdetect_full_dataset/mipsel", label="mipsel"
)

data_df = pd.concat([mips_df, mipsel_df])

data_df.head()

Unnamed: 0,filename,byte_array,instruction_set
0,bb985a7161e0d0017cfc29a4026733eb.code,"[60, 28, 0, 4, 39, 156, 227, 240, 3, 153, 224,...",mips
1,8a74d331e33f5e88434b39f143a0b955.code,"[60, 28, 0, 2, 39, 156, 144, 12, 3, 153, 224, ...",mips
2,82afd73ec69466140b30c0376797285d.code,"[60, 28, 0, 2, 39, 156, 86, 72, 3, 153, 224, 3...",mips
3,f3161e2ab0199b09ce4f1aeae5ce3571.code,"[60, 28, 0, 2, 39, 156, 85, 52, 3, 153, 224, 3...",mips
4,91b889d38cb43b9430457599c0cca0f3.code,"[60, 28, 0, 3, 39, 156, 41, 252, 3, 153, 224, ...",mips


### Add byte value frequency features


In [2]:
# def count_normalized_bytes(byte_array):
#     counts = np.bincount(byte_array, minlength=256)
#     return counts / counts.sum()


# byte_features = data_df["byte_array"].apply(count_normalized_bytes)
# byte_columns = [f"byte_{i}" for i in range(256)]
# byte_df = pd.DataFrame(byte_features.tolist(), columns=byte_columns)
# byte_df.index = data_df.index
# df_with_byte_features = pd.concat([data_df, byte_df], axis=1)

# df_with_byte_features.head()

### Add heuristic features for endianness detection


In [66]:
def add_endian_features(df):
    def count_patterns(byte_array):
        arr = np.array(byte_array)
        pairs = np.column_stack((arr[:-1], arr[1:]))

        count_0001 = np.sum((pairs == [0x00, 0x01]).all(axis=1))
        count_0100 = np.sum((pairs == [0x01, 0x00]).all(axis=1))
        count_fffe = np.sum((pairs == [0xFF, 0xFE]).all(axis=1))
        count_feff = np.sum((pairs == [0xFE, 0xFF]).all(axis=1))

        return count_0001, count_0100, count_fffe, count_feff

    counts = df["byte_array"].apply(count_patterns)

    df["count_0001"], df["count_0100"], df["count_fffe"], df["count_feff"] = zip(
        *counts
    )

    return df


df_with_features = add_endian_features(data_df)
df_with_features.head()

Unnamed: 0,filename,byte_array,instruction_set,count_0001,count_0100,count_fffe,count_feff
0,bb985a7161e0d0017cfc29a4026733eb.code,"[60, 28, 0, 4, 39, 156, 227, 240, 3, 153, 224,...",mips,313,47,22,0
1,8a74d331e33f5e88434b39f143a0b955.code,"[60, 28, 0, 2, 39, 156, 144, 12, 3, 153, 224, ...",mips,24,4,1,0
2,82afd73ec69466140b30c0376797285d.code,"[60, 28, 0, 2, 39, 156, 86, 72, 3, 153, 224, 3...",mips,466,113,12,0
3,f3161e2ab0199b09ce4f1aeae5ce3571.code,"[60, 28, 0, 2, 39, 156, 85, 52, 3, 153, 224, 3...",mips,478,119,12,0
4,91b889d38cb43b9430457599c0cca0f3.code,"[60, 28, 0, 3, 39, 156, 41, 252, 3, 153, 224, ...",mips,253,15,0,0


### Prepare Torch tensors


In [186]:
import torch

X = torch.tensor(
    df_with_features[["count_0001", "count_0100", "count_fffe", "count_feff"]].values,
    dtype=torch.float,
)

# 0 for mips, 1 for mipsel
y = torch.tensor(
    df_with_features[["instruction_set"]].map(lambda x: 0 if x == "mips" else 1).values,
    dtype=torch.float,
)

X.size()

torch.Size([7340, 4])

### Split into train and test


In [188]:
from sklearn.preprocessing import StandardScaler

train_size = int(len(X) * 0.8)
indices = torch.randperm(len(X))

X_train = X[indices[:train_size]]
y_train = y[indices[:train_size]]

X_test = X[indices[train_size:]]
y_test = y[indices[train_size:]]

# Normalize X
scaler = StandardScaler()
X_train = torch.tensor(scaler.fit_transform(X_train), dtype=torch.float)
X_test = torch.tensor(scaler.transform(X_test), dtype=torch.float)

### Create Pytorch model


In [190]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        out = self.linear(x)
        out = self.sigmoid(out)
        return out


model = LogisticRegression(4, 1)
loss = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
epochs = 10000

# Training loop
for epoch in range(epochs):
    # Forward pass
    y_pred = model(X_train)

    # Compute loss
    l = loss(y_pred, y_train)

    # Backward pass
    l.backward()

    # Update weights
    optimizer.step()
    optimizer.zero_grad()

    # Print loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch + 1}: loss {l.item()}")

y_test_pred = model(X_test)

model.eval()
with torch.no_grad():
    y_test_pred = model(X_test)
    y_test_pred_binary = (y_test_pred > 0.5).float()

    # Calculate accuracy
    accuracy = (y_test_pred_binary == y_test).float().mean()
    print(f"Accuracy: {accuracy.item():.4f}")

Epoch 1000: loss 0.20394288003444672
Epoch 2000: loss 0.14490991830825806
Epoch 3000: loss 0.11138653755187988
Epoch 4000: loss 0.08832225948572159
Epoch 5000: loss 0.07115404307842255
Epoch 6000: loss 0.05791636183857918
Epoch 7000: loss 0.04757949709892273
Epoch 8000: loss 0.03950663283467293
Epoch 9000: loss 0.03323638066649437
Epoch 10000: loss 0.02840394154191017
Accuracy: 0.9966
