### Load the dataset


In [3]:
import os
import numpy as np
import pandas as pd


def read_binary_files(directory, endiannes, instruction_set):
    data = []
    for filename in os.listdir(directory):
        if ".code" in filename:
            continue
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, "rb") as file:
                byte_array = np.frombuffer(file.read(), dtype=np.uint8)
                data.append(
                    {
                        "filename": filename,
                        "byte_array": byte_array,
                        "instruction_set": instruction_set,
                        "endianness": endiannes
                    }
                )
        if(len(data) >= 500):
            break
    return pd.DataFrame(data)

dfs = []
datasetdir = "../dataset/isa-detect/full-binaries"
for architecture in os.listdir(datasetdir):
    architecture_path = os.path.join(datasetdir, architecture)
    with open(os.path.join(architecture_path, f"{architecture}.json"), "rb") as json:
        frame = pd.read_json(json)
        dfs.append(read_binary_files(architecture_path, frame["endianness"][1], architecture))
    print(f"done {architecture}")
# mips_df = read_binary_files("../dataset/mips-binaries", label="mips")
# mipsel_df = read_binary_files("../dataset/mipsel-binaries", label="mipsel")

data_df = pd.concat(dfs)

data_df.head()

done sh4
done hppa
done sparc
done armel
done armhf
done s390x
done mips64el
done powerpc
done riscv64
done sparc64
done alpha
done arm64
done powerpcspe
done m68k
done mipsel
done i386
done amd64
done x32
done mips
done ppc64
done s390
done ppc64el
done ia64


Unnamed: 0,filename,byte_array,instruction_set,endianness
0,f1a7d1f5cdc5585cf36d15f44d529000,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little
1,fd52f26edf5fb6cf4f1702349d00edce,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little
2,45e05efd7b4fb113155cada98b1b76b9,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little
3,ee81f48ef9bc6d3e158b39e129373277,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little
4,7850e9fb64a382027d36203d95ca96f9,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little


### Add byte value frequency features


In [8]:
# def count_normalized_bytes(byte_array):
#     counts = np.bincount(byte_array, minlength=256)
#     return counts / counts.sum()


# byte_features = data_df["byte_array"].apply(count_normalized_bytes)
# byte_columns = [f"byte_{i}" for i in range(256)]
# byte_df = pd.DataFrame(byte_features.tolist(), columns=byte_columns)
# byte_df.index = data_df.index
# df_with_byte_features = pd.concat([data_df, byte_df], axis=1)

# df_with_byte_features.head()

### Add heuristic features for endianness detection


In [4]:
def add_endian_features(df):
    def count_patterns(byte_array):
        arr = np.array(byte_array)
        pairs = np.column_stack((arr[:-1], arr[1:]))

        count_0001 = np.sum((pairs == [0x00, 0x01]).all(axis=1))
        count_0100 = np.sum((pairs == [0x01, 0x00]).all(axis=1))
        count_fffe = np.sum((pairs == [0xFF, 0xFE]).all(axis=1))
        count_feff = np.sum((pairs == [0xFE, 0xFF]).all(axis=1))

        return count_0001, count_0100, count_fffe, count_feff

    counts = df["byte_array"].apply(count_patterns)

    df["count_0001"], df["count_0100"], df["count_fffe"], df["count_feff"] = zip(
        *counts
    )

    return df


df_with_features = add_endian_features(data_df)
df_with_features.head()

Unnamed: 0,filename,byte_array,instruction_set,endianness,count_0001,count_0100,count_fffe,count_feff
0,f1a7d1f5cdc5585cf36d15f44d529000,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little,53,185,0,77
1,fd52f26edf5fb6cf4f1702349d00edce,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little,57,104,0,3
2,45e05efd7b4fb113155cada98b1b76b9,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little,65,186,0,31
3,ee81f48ef9bc6d3e158b39e129373277,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little,38,123,0,35
4,7850e9fb64a382027d36203d95ca96f9,"[127, 69, 76, 70, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0...",sh4,little,94,241,0,92


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Assume df_with_features has columns for instruction_set, endianness, and the count features

# Endianness features
endianness_features = ['count_0001', 'count_0100', 'count_fffe', 'count_feff']

# Prepare the features (X) and target variable (y)
X = df_with_features[endianness_features]
y = df_with_features['endianness']
architectures = df_with_features['instruction_set']

# Function to train and evaluate a model for a single architecture
def train_evaluate_architecture(X, y, architecture_name):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train the model
    model = LogisticRegression(random_state=42)
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\nArchitecture: {architecture_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return model, scaler

# Dictionary to store models and scalers for each architecture
architecture_models = {}

# Train and evaluate a model for each unique architecture
for arch in architectures.unique():
    arch_mask = architectures == arch
    X_arch = X[arch_mask]
    y_arch = y[arch_mask]
    
    model, scaler = train_evaluate_architecture(X_arch, y_arch, arch)
    architecture_models[arch] = (model, scaler)

# Function to predict endianness for a new sample
def predict_endianness(sample, architecture):
    if architecture not in architecture_models:
        raise ValueError(f"No model available for architecture: {architecture}")
    
    model, scaler = architecture_models[architecture]
    sample_scaled = scaler.transform([sample])
    prediction = model.predict(sample_scaled)
    probability = model.predict_proba(sample_scaled)
    
    return prediction[0], probability[0]

# Example usage
print("\nExample predictions:")
for arch in architectures.unique():
    sample = X[architectures == arch].iloc[0]  # Get first sample of each architecture
    prediction, probability = predict_endianness(sample, arch)
    print(f"Architecture: {arch}")
    print(f"Predicted endianness: {prediction}")
    print(f"Probabilities: {probability}")
    print()

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 'little'