In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.combine import SMOTEENN
from skrebate import MultiSURF


## NSL-KDD Dataset Pipeline

In [None]:
# Load NSL-KDD and discard incomplete rows. A new frame is bound instead of mutating
# in place, so re-running this cell always starts from the file contents.
nsl_df = pd.read_csv("KDDTrain+.txt").dropna()

# NOTE(review): read_csv() treats the first line as the header — confirm this copy of
# KDDTrain+.txt really has a header row that contains "label".
if "label" not in nsl_df.columns:
    raise KeyError(
        'Column "label" not found in KDDTrain+.txt; check that the file has a header row.'
    )

# Binarize the target from the RAW label values, *before* any label encoding.
# LabelEncoder numbers classes in sorted order, so after encoding "code 0" is merely the
# alphabetically first class name; testing `x == 0` on encoded labels therefore does not
# reliably pick out benign traffic.
nsl_label_raw = nsl_df["label"]
if pd.api.types.is_numeric_dtype(nsl_label_raw):
    # Already numeric: keep the original convention (0 = benign, anything else = attack).
    y_nsl = (nsl_label_raw != 0).astype(int)
else:
    # assumes benign records are tagged "normal" (the usual NSL-KDD naming) — TODO confirm
    y_nsl = (nsl_label_raw.astype(str).str.strip().str.lower() != "normal").astype(int)

if y_nsl.nunique() < 2:
    raise ValueError("NSL-KDD target has a single class after binarization; check the benign tag.")

# Encode categorical *features* only; the raw label column stays untouched in nsl_df.
X_nsl = nsl_df.drop("label", axis=1)
categorical_cols = X_nsl.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X_nsl[col] = le.fit_transform(X_nsl[col])

X_train_nsl, X_test_nsl, y_train_nsl, y_test_nsl = train_test_split(
    X_nsl, y_nsl, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_nsl = scaler.fit_transform(X_train_nsl)
X_test_nsl = scaler.transform(X_test_nsl)

# Feature selection. y_train_nsl is a pandas Series whose index is no longer 0..n-1 after
# the shuffled split, so MultiSURF is given the plain values instead.
# NOTE(review): scikit-rebate's documented usage passes NumPy arrays — confirm for the
# installed version.
fs = MultiSURF()
X_train_nsl_fs = fs.fit_transform(X_train_nsl, np.asarray(y_train_nsl))
X_test_nsl_fs = fs.transform(X_test_nsl)

# Rebalance the training split only; the test split keeps its original class ratio.
smote_enn = SMOTEENN(random_state=42)
X_nsl_resampled, y_nsl_resampled = smote_enn.fit_resample(X_train_nsl_fs, y_train_nsl)

rf_nsl = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE: the timer wraps fit(), so the value printed as "Latency" is training time in ms.
start_time = time.time()
rf_nsl.fit(X_nsl_resampled, y_nsl_resampled)
latency_nsl = (time.time() - start_time) * 1000

preds_nsl = rf_nsl.predict(X_test_nsl_fs)
print("NSL-KDD Results:")
print("Accuracy:", round(accuracy_score(y_test_nsl, preds_nsl) * 100, 2), "%")
print("F1 Score:", round(f1_score(y_test_nsl, preds_nsl), 2))
print("Latency:", round(latency_nsl, 2), "ms")


## CSE-CIC-IDS2018 Dataset Pipeline

In [None]:
# Load CSE-CIC-IDS2018 and discard unusable rows.
# dropna() only removes NaN; +/-inf values would survive it and StandardScaler rejects
# non-finite input, so infinities are converted to NaN first.
# NOTE(review): whether this CSV actually contains infinite values is not visible here.
cic_df = (
    pd.read_csv("CSE-CIC-IDS2018.csv")
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Binarize the target from the RAW label values, *before* any label encoding.
# LabelEncoder numbers classes in sorted order, so relying on "encoded code 0 == benign"
# only works if the benign class happens to sort first.
cic_label_raw = cic_df["Label"]
if pd.api.types.is_numeric_dtype(cic_label_raw):
    # Already numeric: keep the original convention (0 = benign, anything else = attack).
    y_cic = (cic_label_raw != 0).astype(int)
else:
    # assumes benign flows are tagged "Benign" (any letter case) — TODO confirm
    y_cic = (cic_label_raw.astype(str).str.strip().str.lower() != "benign").astype(int)

if y_cic.nunique() < 2:
    raise ValueError("CSE-CIC-IDS2018 target has a single class after binarization; check the benign tag.")

# Encode categorical *features* only; the raw label column stays untouched in cic_df.
X_cic = cic_df.drop("Label", axis=1)
categorical_cols = X_cic.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X_cic[col] = le.fit_transform(X_cic[col])

X_train_cic, X_test_cic, y_train_cic, y_test_cic = train_test_split(
    X_cic, y_cic, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_cic = scaler.fit_transform(X_train_cic)
X_test_cic = scaler.transform(X_test_cic)

# Feature selection. y_train_cic is a pandas Series whose index is no longer 0..n-1 after
# the shuffled split, so MultiSURF is given the plain values instead.
# NOTE(review): scikit-rebate's documented usage passes NumPy arrays — confirm for the
# installed version.
fs_cic = MultiSURF()
X_train_cic_fs = fs_cic.fit_transform(X_train_cic, np.asarray(y_train_cic))
X_test_cic_fs = fs_cic.transform(X_test_cic)

# Rebalance the training split only; the test split keeps its original class ratio.
smote_enn = SMOTEENN(random_state=42)
X_cic_resampled, y_cic_resampled = smote_enn.fit_resample(X_train_cic_fs, y_train_cic)

rf_cic = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE: the timer wraps fit(), so the value printed as "Latency" is training time in ms.
start_time = time.time()
rf_cic.fit(X_cic_resampled, y_cic_resampled)
latency_cic = (time.time() - start_time) * 1000

preds_cic = rf_cic.predict(X_test_cic_fs)
print("CSE-CIC-IDS2018 Results:")
print("Accuracy:", round(accuracy_score(y_test_cic, preds_cic) * 100, 2), "%")
print("F1 Score:", round(f1_score(y_test_cic, preds_cic), 2))
print("Latency:", round(latency_cic, 2), "ms")


## Custom Curated Dataset Pipeline

In [None]:
# Read the three source tables. In each one the first column is renamed to "apk" so the
# tables share a common join key.
permissions_df = pd.read_csv(
    "Permissions vector table 1,12,000 Apps, 1% perms (2).csv"
).pipe(lambda table: table.rename(columns={table.columns[0]: "apk"}))

hardware_df = pd.read_csv(
    "Hardware_comp_vector_table.csv"
).pipe(lambda table: table.rename(columns={table.columns[0]: "apk"}))

intents_df = pd.read_csv(
    "Intents 56000normal_56000mal_with_app_names_0_and_1_type.csv"
).pipe(lambda table: table.rename(columns={table.columns[0]: "apk"}))



def clean_apk_names(df):
    """Normalize the ``apk`` column so that tables can be joined on it.

    Each value is cast to ``str``, stripped of surrounding whitespace, has every
    double-quote character and every ``"0000- "`` / ``"0000-"`` occurrence removed,
    and is stripped again.

    The second strip matters: whitespace that sat inside the quotes or directly
    after the prefix is only exposed once those are removed, and would otherwise
    survive and prevent equal names from matching in a join.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``apk`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    names = df["apk"].astype(str).str.strip()
    # Order matters: remove the longer "0000- " before the bare "0000-".
    for token in ('"', "0000- ", "0000-"):
        # regex=False: the tokens are literals, independent of the pandas default.
        names = names.str.replace(token, "", regex=False)
    df["apk"] = names.str.strip()
    return df

# Normalize the join key in every table before merging.
permissions_df = clean_apk_names(permissions_df)
hardware_df = clean_apk_names(hardware_df)
intents_df = clean_apk_names(intents_df)


# Use inner join to keep only common APKs
merged_df = (
    permissions_df
    .merge(hardware_df, on="apk", how="inner")
    .merge(intents_df, on="apk", how="inner")
)

# If the row count exceeds the number of unique APKs, some key was duplicated in a source
# table and the joins multiplied its rows.
print("Shape of merged dataset:", merged_df.shape)
print("Number of unique APKs:", merged_df["apk"].nunique())


merged_df.to_csv("Custom_Curated_Android_Dataset.csv", index=False)


# Encode the remaining categorical columns. The "apk" join key is skipped: it is an
# identifier, and an arbitrary integer code for an app name is not a meaningful feature.
categorical_cols = (
    merged_df.select_dtypes(include=['object']).columns.drop("apk", errors="ignore")
)
for col in categorical_cols:
    le = LabelEncoder()
    merged_df[col] = le.fit_transform(merged_df[col])

# NOTE(review): if more than one source table carries its own "Label" column, merge()
# suffixes the duplicates (e.g. Label_x / Label_y) and they would remain in X_custom as
# features — confirm the merged schema.
# NOTE(review): if "Label" was non-numeric it has just been label-encoded, in which case
# code 0 is the alphabetically first class — confirm that this is the benign class.
X_custom = merged_df.drop(["Label", "apk"], axis=1)
y_custom = (merged_df["Label"] != 0).astype(int)

X_train_cus, X_test_cus, y_train_cus, y_test_cus = train_test_split(
    X_custom, y_custom, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_cus = scaler.fit_transform(X_train_cus)
X_test_cus = scaler.transform(X_test_cus)

# Feature selection. y_train_cus is a pandas Series whose index is no longer 0..n-1 after
# the shuffled split, so MultiSURF is given the plain values instead.
# NOTE(review): scikit-rebate's documented usage passes NumPy arrays — confirm for the
# installed version.
fs_cus = MultiSURF()
X_train_cus_fs = fs_cus.fit_transform(X_train_cus, np.asarray(y_train_cus))
X_test_cus_fs = fs_cus.transform(X_test_cus)

# Rebalance the training split only; the test split keeps its original class ratio.
smote_enn = SMOTEENN(random_state=42)
X_cus_resampled, y_cus_resampled = smote_enn.fit_resample(X_train_cus_fs, y_train_cus)

rf_cus = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE: the timer wraps fit(), so the value printed as "Latency" is training time in ms.
start_time = time.time()
rf_cus.fit(X_cus_resampled, y_cus_resampled)
latency_cus = (time.time() - start_time) * 1000

preds_cus = rf_cus.predict(X_test_cus_fs)
print("Custom Dataset Results:")
print("Accuracy:", round(accuracy_score(y_test_cus, preds_cus) * 100, 2), "%")
print("F1 Score:", round(f1_score(y_test_cus, preds_cus), 2))
print("Latency:", round(latency_cus, 2), "ms")
