In [None]:
# ============================
#  High-Accuracy Attack-Type Classifier for TON-IoT
# ============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from catboost import CatBoostClassifier

# ---- 1. Load dataset ----
df = pd.read_csv("windows7_dataset.csv")

# ---- 2. Clean data (drop irrelevant columns) ----
cols_to_drop = ["timestamp", "id", "src_ip", "dst_ip", "label_binary"]  # change depending on dataset
df = df.drop([c for c in cols_to_drop if c in df.columns], axis=1)

# ---- 3. Split features / label ----
X = df.drop("type", axis=1)
y = df["type"]

# ---- 4. Train-test split ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# ---- 5. High-accuracy CatBoost model ----
model = CatBoostClassifier(
    iterations=2000,
    depth=10,
    learning_rate=0.03,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    class_weights='Balanced',
    verbose=False
)

model.fit(X_train, y_train)

# ---- 6. Evaluation ----
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [7]:
# ============================
# High Accuracy Attack-Type Classifier (LightGBM)
# ============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import re
df = pd.read_csv("windows7_dataset.csv")

# ---- Clean columns ----
cols_to_drop = ["timestamp", "id", "src_ip", "dst_ip", "label_binary"]
df = df.drop([c for c in cols_to_drop if c in df.columns], axis=1)
def clean_column_name(col):
    # حذف کاراکترهای غیرمجاز
    col = re.sub(r"[^A-Za-z0-9_]", "_", col)
    # جلوگیری از چندتا خط زیر همزمان
    col = re.sub(r"_+", "_", col)
    # حذف _ در ابتدا یا انتها
    col = col.strip("_")
    return col

df.columns = [clean_column_name(c) for c in df.columns]

# ---- Load Dataset ----


# ---- Split X / y ----
X = df.drop("label", axis=1)
y = df["label"]
# Clean object columns so that they become numeric
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = (
            X[col]
            .astype(str)
            .str.replace("%", "", regex=False)
            .str.replace(",", "", regex=False)
            .str.strip()
            .replace(["nan", "None", "?", ""], pd.NA)
        )

        # Convert to numeric (invalid parsing → NaN)
        X[col] = pd.to_numeric(X[col], errors="coerce")

# Replace NaN with median
X = X.fillna(X.median(numeric_only=True))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ---- LightGBM model ----
model = lgb.LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=120,
    class_weight='balanced',
    objective='multiclass',
)

model.fit(X_train, y_train)

# ---- Evaluation ----
y_pred = model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  df = pd.read_csv("windows7_dataset.csv")


LightGBMError: Number of classes should be specified and greater than 1 for multiclass training