In [1]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# Load and prepare data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Convert object columns to categorical
for col in df.select_dtypes(include="object"):
    df[col] = df[col].astype("category")

X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Show class labels
print("Unique classes:", y.unique())

# Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Set up LightGBM model for multiclass
num_classes = y.nunique()

model = LGBMClassifier(
    objective="multiclass",
    num_class=num_classes,
    max_depth=10,
    n_estimators=1000
)

# Fit with early stopping callback
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="multi_logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Unique classes: [0 1 2 3 4 5 6 7 8 9]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.998528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22837
[LightGBM] [Info] Number of data points in the train set: 15915263, number of used features: 19
[LightGBM] [Info] Start training from score -1.774311
[LightGBM] [Info] Start training from score -6.903321
[LightGBM] [Info] Start training from score -8.009027
[LightGBM] [Info] Start training from score -2.219580
[LightGBM] [Info] Start training from score -1.833900
[LightGBM] [Info] Start training from score -2.193644
[LightGBM] [Info] Start training from score -2.086688
[LightGBM] [Info] Start training from score -2.096901
[LightGBM] [Info] Start training from score -2.391472
[LightGBM] [Info] Start training from score -2.201109
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's multi_logloss: 1.6

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and label
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Define preprocessing (one-hot encode categorical features)
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough"  # numeric columns stay as is
)

# Create LinearSVC pipeline
svm_model = make_pipeline(
    preprocessor,
    LinearSVC(max_iter=10000, dual=False)  # dual=False is faster for large dense datasets
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit
svm_model.fit(X_train, y_train)

# Predict
y_pred = svm_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       0.31      0.60      0.41   1156814
           1       0.89      0.98      0.93      6851
           2       0.99      1.00      0.99      2268
           3       0.61      0.81      0.69    741116
           4       0.31      0.57      0.40   1089894
           5       0.28      0.15      0.19    760588
           6       0.33      0.15      0.21    846447
           7       0.30      0.29      0.29    837847
           8       0.43      0.00      0.00    624071
           9       0.30      0.02      0.03    754932

    accuracy                           0.35   6820828
   macro avg       0.48      0.46      0.42   6820828
weighted avg       0.35      0.35      0.30   6820828

Matthews Correlation Coefficient (MCC): 0.2542


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, matthews_corrcoef
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
from scipy.sparse import hstack
from scipy.stats import mode
import faiss

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Feature / label split
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"].astype("category").cat.codes

# Identify categorical and numerical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

# Fill missing values
X = X.fillna("missing")
X[num_cols] = X[num_cols].fillna(0)

# Hash categorical features to sparse matrix
X_cat_dicts = X[cat_cols].astype(str).to_dict(orient="records")
hasher = FeatureHasher(n_features=1024, input_type='dict')
X_cat_hashed_sparse = hasher.transform(X_cat_dicts)

# Scale numerical features
X_num_scaled = StandardScaler().fit_transform(X[num_cols]).astype("float32")
X_num_sparse = sparse.csr_matrix(X_num_scaled)

# Combine features
X_combined_sparse = hstack([X_cat_hashed_sparse, X_num_sparse]).tocsr()

# Print estimated memory if made dense
approx_gb = X_combined_sparse.shape[0] * X_combined_sparse.shape[1] * 4 / 1024**3
print(f"Estimated size if dense: {approx_gb:.2f} GB")

# Reduce dimensions to a manageable size
svd = TruncatedSVD(n_components=128, random_state=42)
X_reduced = svd.fit_transform(X_combined_sparse).astype("float32")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.3, stratify=y, random_state=42
)

# FAISS
d = X_train.shape[1]
index = faiss.IndexFlatL2(d)
index.add(X_train)

# Search
k = 5
_, indices = index.search(X_test, k)

# Predict
y_pred = mode(y_train.to_numpy()[indices], axis=1).mode.flatten()

# Evaluation
print(classification_report(y_test, y_pred))
print("MCC:", round(matthews_corrcoef(y_test, y_pred), 4))


: 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Load dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Separate features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", make_pipeline(
            SimpleImputer(strategy="constant", fill_value="missing"),
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        ), cat_cols),
        ("num", SimpleImputer(strategy="mean"), num_cols)
    ]
) 

# Random Forest model pipeline
rf_model = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=100,           # Increase for better accuracy, decrease for speed
        max_depth=20,               # Set to None for full growth
        n_jobs=-1,                  # Use all CPU cores
        class_weight="balanced",   # Useful for imbalanced classes
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit model
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       0.74      0.49      0.59   1156814
           1       0.33      0.98      0.50      6851
           2       1.00      1.00      1.00      2268
           3       0.29      0.93      0.44    741116
           4       0.64      0.57      0.60   1089894
           5       0.53      0.51      0.52    760588
           6       0.64      0.48      0.55    846447
           7       0.77      0.36      0.49    837847
           8       0.58      0.52      0.55    624071
           9       0.71      0.33      0.45    754932

    accuracy                           0.52   6820828
   macro avg       0.62      0.62      0.57   6820828
weighted avg       0.62      0.52      0.53   6820828

Matthews Correlation Coefficient (MCC): 0.4742


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, matthews_corrcoef

# Load dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Separate features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# ❗ Convert all object columns to pandas 'category' dtype BEFORE train-test split
for col in X.select_dtypes(include="object").columns:
    X[col] = X[col].astype("category")
assert not any(X.dtypes == "object"), "Object columns remain!"

print(X.dtypes) 

# Fill missing in categorical columns with a special label
for col in X.select_dtypes(include="category").columns:
    X[col] = X[col].cat.add_categories(["__missing__"]).fillna("__missing__")

print(X.isna().sum())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define model
model = HistGradientBoostingClassifier(
    max_iter=100,
    max_depth=10,
    learning_rate=0.1,
    early_stopping=True,
    random_state=42
)
print("Final dtypes before training:\n", X_train.dtypes)
print("Sample row:\n", X_train.head(1).to_dict())

# Fit
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


timestep                  float64
cpm_id                   category
vehicle_id               category
detected_vehicle_id      category
object_type              category
detected_object_type     category
label_detected              int64
x_detected                float64
y_detected                float64
speed_detected            float64
heading_detected          float64
acceleration_detected     float64
is_self                     int64
sensor_id                category
sensor_type              category
sensor_range                int64
weather_conditions       category
visibility_range            int64
detection_confidence      float64
dtype: object
timestep                 0
cpm_id                   0
vehicle_id               0
detected_vehicle_id      0
object_type              0
detected_object_type     0
label_detected           0
x_detected               0
y_detected               0
speed_detected           0
heading_detected         0
acceleration_detected    0
is_self         

ValueError: could not convert string to float: 'CPM_truck43_586.0'

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and label
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Define preprocessing (OneHotEncoding)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# Create SGDClassifier pipeline
sgd_model = make_pipeline(
    preprocessor,
    SGDClassifier(
        loss="hinge",        # Linear SVM
        max_iter=1000,
        tol=1e-3,
        class_weight="balanced",  # optional if class imbalance
        random_state=42,
        n_jobs=-1
    )
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


# Fit
sgd_model.fit(X_train, y_train)

# Predict
y_pred = sgd_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       0.45      0.00      0.00   1156814
           1       0.09      0.69      0.16      6851
           2       0.13      0.95      0.23      2268
           3       0.26      0.42      0.32    741116
           4       0.54      0.04      0.08   1089894
           5       0.34      0.03      0.06    760588
           6       0.52      0.03      0.06    846447
           7       0.43      0.03      0.06    837847
           8       0.11      0.87      0.19    624071
           9       0.27      0.05      0.09    754932

    accuracy                           0.15   6820828
   macro avg       0.31      0.31      0.12   6820828
weighted avg       0.39      0.15      0.09   6820828

Matthews Correlation Coefficient (MCC): 0.0914


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Preprocessing with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# RidgeClassifier pipeline
ridge_model = make_pipeline(
    preprocessor,
    RidgeClassifier(
        alpha=1.0,
        class_weight="balanced",  # good for imbalanced classes
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit model
ridge_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef
from xgboost import XGBClassifier

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Preprocessing: OneHotEncode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# XGBoost classifier pipeline
xgb_model = make_pipeline(
    preprocessor,
    XGBClassifier(
        objective="multi:softmax",         # or "multi:softprob" if you want probabilities
        num_class=len(y.unique()),
        eval_metric="mlogloss",
        max_depth=10,
        n_estimators=300,
        learning_rate=0.1,
        use_label_encoder=False,
        tree_method="hist",                # use "gpu_hist" if running on GPU
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Train model
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.49      0.72      0.58   1156814
           1       0.82      0.93      0.87      6851
           2       1.00      0.99      1.00      2268
           3       0.41      0.65      0.50    741116
           4       0.41      0.76      0.53   1089894
           5       0.61      0.24      0.35    760588
           6       0.55      0.47      0.51    846447
           7       0.61      0.34      0.44    837847
           8       0.69      0.24      0.36    624071
           9       0.70      0.19      0.29    754932

    accuracy                           0.48   6820828
   macro avg       0.63      0.55      0.54   6820828
weighted avg       0.55      0.48      0.46   6820828

Matthews Correlation Coefficient (MCC): 0.4115


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
from catboost import CatBoostClassifier, Pool

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Detect categorical columns (CatBoost accepts column names or indices)
cat_cols = X.select_dtypes(include="object").columns.tolist()
X[cat_cols] = X[cat_cols].fillna("missing").astype(str)  # Avoid NaNs + force string

# Fill missing values in categorical columns with a string
X[cat_cols] = X[cat_cols].fillna("missing")
# Optional: fill missing numeric values
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
X[num_cols] = X[num_cols].fillna(0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define CatBoost model
catboost_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=10,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=100,
    auto_class_weights="Balanced"
)

# Train the model (using Pool to specify categorical features)
catboost_model.fit(
    Pool(X_train, y_train, cat_features=cat_cols),
    eval_set=Pool(X_test, y_test, cat_features=cat_cols),
    use_best_model=True
)

# Predict
y_pred = catboost_model.predict(X_test)
y_pred = y_pred.flatten()  # Ensure it is 1D

# Evaluation
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


: 

In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
import pandas as pd

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # Fill missing, then ordinal encode categoricals
        ("cat", make_pipeline(
            SimpleImputer(strategy="constant", fill_value="missing"),
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        ), cat_cols),
        # Fill missing in numeric columns
        ("num", SimpleImputer(strategy="mean"), num_cols)
    ]
)

# Final model pipeline
nb_model = make_pipeline(
    preprocessor,
    GaussianNB()
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit and evaluate
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.18      0.64      0.28   1156814
           1       0.30      0.24      0.27      6851
           2       1.00      0.92      0.96      2268
           3       0.17      0.03      0.06    741116
           4       0.17      0.27      0.21   1089894
           5       0.00      0.00      0.00    760588
           6       0.20      0.09      0.13    846447
           7       0.21      0.10      0.14    837847
           8       0.21      0.00      0.01    624071
           9       0.00      0.00      0.00    754932

    accuracy                           0.18   6820828
   macro avg       0.25      0.23      0.21   6820828
weighted avg       0.15      0.18      0.12   6820828

Matthews Correlation Coefficient (MCC): 0.03


In [None]:
#TabNet
#pip install pytorch-tabnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, matthews_corrcoef
from pytorch_tabnet.tab_model import TabNetClassifier

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Target variable
y = df["cls_detected"]
X = df.drop("cls_detected", axis=1)

# Handle missing values
X = X.fillna("missing")

# Encode categorical columns and collect their indices
cat_cols = X.select_dtypes(include="object").columns.tolist()
cat_idxs = [X.columns.get_loc(col) for col in cat_cols]
cat_dims = []

# LabelEncode categorical columns
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    cat_dims.append(len(le.classes_))

# Encode target variable
y = LabelEncoder().fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.3, stratify=y, random_state=42
)

# TabNet model
tabnet_model = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,  # or use int or list like [3, 3, ...]
    n_d=32,
    n_a=32,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    seed=42,
    verbose=1
)

# Fit
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_test, y_test)],
    eval_name=["val"],
    eval_metric=["accuracy"],
    max_epochs=200,
    patience=20,
    batch_size=16384,
    virtual_batch_size=512
)

# Predict
y_pred = tabnet_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


In [None]:
#fastai
#pip install fastai
import pandas as pd
from fastai.tabular.all import *

# Load your dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Fill missing values
df = df.fillna("missing")

# Define target and categorical/numeric columns
target = 'cls_detected'
cat_names = df.select_dtypes(include='object').columns.tolist()
cont_names = df.select_dtypes(exclude='object').drop(columns=[target]).columns.tolist()

# Split train/valid sets (random 70/30 split)
splits = RandomSplitter(seed=42)(range_of(df))

# Create DataLoaders
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=target,
    splits=splits
)

dls = to.dataloaders(bs=1024)

# Build and train the model
learn = tabular_learner(
    dls,
    metrics=[accuracy, RocAuc()],
    layers=[200, 100],
    emb_drop=0.1
)

learn.fit_one_cycle(5)

# Evaluate
preds, targs = learn.get_preds()
interp = ClassificationInterpretation.from_learner(learn)
interp.print_classification_report()


In [None]:
#deepctr/deepfm
#pip install deepctr[pytorch]
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
from deepctr_torch.models import DeepFM  # or use WideDeep
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch

# Load your data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Target
target = 'cls_detected'

# Fill missing values
df = df.fillna("missing")

# Encode categorical features
cat_features = df.select_dtypes(include='object').columns.tolist()
for col in cat_features:
    df[col] = df[col].astype(str).astype("category").cat.codes

# Encode target
df[target] = df[target].astype("category").cat.codes

# Identify feature types
sparse_features = cat_features
dense_features = [f for f in df.columns if f not in sparse_features + [target]]

# Feature columns config
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
] + [
    DenseFeat(feat, 1,)
    for feat in dense_features
]

# Get feature names
feature_names = get_feature_names(fixlen_feature_columns)

# Split train/test
train, test = train_test_split(df, test_size=0.3, stratify=df[target], random_state=42)

# Build model input
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Labels
y_train = train[target].values
y_test = test[target].values

# Define DeepFM (can replace with WideDeep or xDeepFM)
model = DeepFM(
    linear_feature_columns=fixlen_feature_columns,
    dnn_feature_columns=fixlen_feature_columns,
    task='multiclass',
    l2_reg_embedding=1e-5,
    dnn_hidden_units=(256, 128),
    dnn_dropout=0.3,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    seed=42
)

# Train
model.fit(train_model_input, y_train, batch_size=1024, epochs=10, verbose=2, validation_split=0.1)

# Predict
y_pred = model.predict(test_model_input, batch_size=1024)
y_pred_labels = np.argmax(y_pred, axis=1)

# Evaluate
print(classification_report(y_test, y_pred_labels))
print("MCC:", round(matthews_corrcoef(y_test, y_pred_labels), 4))
