### Load the dataset


In [60]:
import os
import numpy as np
import pandas as pd


def read_binary_files(directory, label):
    data = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, "rb") as file:
                byte_array = np.frombuffer(file.read(), dtype=np.uint8)
                data.append(
                    {
                        "filename": filename,
                        "byte_array": byte_array,
                        "instruction_set": label,
                    }
                )
    return pd.DataFrame(data)


mips_df = read_binary_files("../dataset/mips-binaries", label="mips")
mipsel_df = read_binary_files("../dataset/mipsel-binaries", label="mipsel")

data_df = pd.concat([mips_df, mipsel_df])

data_df.head()

Unnamed: 0,filename,byte_array,instruction_set
0,cpan,"[35, 33, 47, 117, 115, 114, 47, 98, 105, 110, ...",mips
1,apt-sortpkgs,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips
2,write,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips
3,systemd-detect-virt,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips
4,localectl,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips


### Add byte value frequency features


In [61]:
# def count_normalized_bytes(byte_array):
#     counts = np.bincount(byte_array, minlength=256)
#     return counts / counts.sum()


# byte_features = data_df["byte_array"].apply(count_normalized_bytes)
# byte_columns = [f"byte_{i}" for i in range(256)]
# byte_df = pd.DataFrame(byte_features.tolist(), columns=byte_columns)
# byte_df.index = data_df.index
# df_with_byte_features = pd.concat([data_df, byte_df], axis=1)

# df_with_byte_features.head()

### Add heuristic features for endianness detection


In [62]:
def add_endian_features(df):
    def count_patterns(byte_array):
        arr = np.array(byte_array)
        pairs = np.column_stack((arr[:-1], arr[1:]))

        count_0001 = np.sum((pairs == [0x00, 0x01]).all(axis=1))
        count_0100 = np.sum((pairs == [0x01, 0x00]).all(axis=1))
        count_fffe = np.sum((pairs == [0xFF, 0xFE]).all(axis=1))
        count_feff = np.sum((pairs == [0xFE, 0xFF]).all(axis=1))

        return count_0001, count_0100, count_fffe, count_feff

    counts = df["byte_array"].apply(count_patterns)

    df["count_0001"], df["count_0100"], df["count_fffe"], df["count_feff"] = zip(
        *counts
    )

    return df


df_with_features = add_endian_features(data_df)
df_with_features.head()

Unnamed: 0,filename,byte_array,instruction_set,count_0001,count_0100,count_fffe,count_feff
0,cpan,"[35, 33, 47, 117, 115, 114, 47, 98, 105, 110, ...",mips,0,0,0,0
1,apt-sortpkgs,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips,198,86,4,0
2,write,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips,186,55,2,0
3,systemd-detect-virt,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips,159,56,2,0
4,localectl,"[127, 69, 76, 70, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0...",mips,247,78,8,0


In [63]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Prepare the features (X) and target variable (y)
X = df_with_features[
    # [f"byte_{i}" for i in range(256)] +
    ["count_0001", "count_0100", "count_fffe", "count_feff"]
]
y = (df_with_features["instruction_set"] == "mipsel").astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["MIPS", "MIPSEL"]))

Accuracy: 0.8578

Classification Report:
              precision    recall  f1-score   support

        MIPS       1.00      0.69      0.82        95
      MIPSEL       0.79      1.00      0.88       109

    accuracy                           0.86       204
   macro avg       0.89      0.85      0.85       204
weighted avg       0.89      0.86      0.85       204

