In [1]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# Load and prepare data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Convert object columns to categorical
for col in df.select_dtypes(include="object"):
    df[col] = df[col].astype("category")

X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Show class labels
print("Unique classes:", y.unique())

# Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Set up LightGBM model for multiclass
num_classes = y.nunique()

model = LGBMClassifier(
    objective="multiclass",
    num_class=num_classes,
    max_depth=10,
    n_estimators=1000
)

# Fit with early stopping callback
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="multi_logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Unique classes: [0 1 2 3 4 5 6 7 8 9]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.998528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22837
[LightGBM] [Info] Number of data points in the train set: 15915263, number of used features: 19
[LightGBM] [Info] Start training from score -1.774311
[LightGBM] [Info] Start training from score -6.903321
[LightGBM] [Info] Start training from score -8.009027
[LightGBM] [Info] Start training from score -2.219580
[LightGBM] [Info] Start training from score -1.833900
[LightGBM] [Info] Start training from score -2.193644
[LightGBM] [Info] Start training from score -2.086688
[LightGBM] [Info] Start training from score -2.096901
[LightGBM] [Info] Start training from score -2.391472
[LightGBM] [Info] Start training from score -2.201109
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's multi_logloss: 1.6

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and label
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Define preprocessing (one-hot encode categorical features)
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough"  # numeric columns stay as is
)

# Create LinearSVC pipeline
svm_model = make_pipeline(
    preprocessor,
    LinearSVC(max_iter=10000, dual=False)  # dual=False is faster for large dense datasets
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit
svm_model.fit(X_train, y_train)

# Predict
y_pred = svm_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       0.31      0.60      0.41   1156814
           1       0.89      0.98      0.93      6851
           2       0.99      1.00      0.99      2268
           3       0.61      0.81      0.69    741116
           4       0.31      0.57      0.40   1089894
           5       0.28      0.15      0.19    760588
           6       0.33      0.15      0.21    846447
           7       0.30      0.29      0.29    837847
           8       0.43      0.00      0.00    624071
           9       0.30      0.02      0.03    754932

    accuracy                           0.35   6820828
   macro avg       0.48      0.46      0.42   6820828
weighted avg       0.35      0.35      0.30   6820828

Matthews Correlation Coefficient (MCC): 0.2542


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, matthews_corrcoef
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
from scipy.sparse import hstack
from scipy.stats import mode
import faiss

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Feature / label split
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"].astype("category").cat.codes

# Identify categorical and numerical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

# Fill missing values
X = X.fillna("missing")
X[num_cols] = X[num_cols].fillna(0)

# Hash categorical features to sparse matrix
X_cat_dicts = X[cat_cols].astype(str).to_dict(orient="records")
hasher = FeatureHasher(n_features=1024, input_type='dict')
X_cat_hashed_sparse = hasher.transform(X_cat_dicts)

# Scale numerical features
X_num_scaled = StandardScaler().fit_transform(X[num_cols]).astype("float32")
X_num_sparse = sparse.csr_matrix(X_num_scaled)

# Combine features
X_combined_sparse = hstack([X_cat_hashed_sparse, X_num_sparse]).tocsr()

# Print estimated memory if made dense
approx_gb = X_combined_sparse.shape[0] * X_combined_sparse.shape[1] * 4 / 1024**3
print(f"Estimated size if dense: {approx_gb:.2f} GB")

# Reduce dimensions to a manageable size
svd = TruncatedSVD(n_components=128, random_state=42)
X_reduced = svd.fit_transform(X_combined_sparse).astype("float32")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.3, stratify=y, random_state=42
)

# FAISS
d = X_train.shape[1]
index = faiss.IndexFlatL2(d)
index.add(X_train)

# Search
k = 5
_, indices = index.search(X_test, k)

# Predict
y_pred = mode(y_train.to_numpy()[indices], axis=1).mode.flatten()

# Evaluation
print(classification_report(y_test, y_pred))
print("MCC:", round(matthews_corrcoef(y_test, y_pred), 4))


: 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Load dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Separate features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", make_pipeline(
            SimpleImputer(strategy="constant", fill_value="missing"),
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        ), cat_cols),
        ("num", SimpleImputer(strategy="mean"), num_cols)
    ]
) 

# Random Forest model pipeline
rf_model = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=100,           # Increase for better accuracy, decrease for speed
        max_depth=20,               # Set to None for full growth
        n_jobs=-1,                  # Use all CPU cores
        class_weight="balanced",   # Useful for imbalanced classes
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit model
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       0.74      0.49      0.59   1156814
           1       0.33      0.98      0.50      6851
           2       1.00      1.00      1.00      2268
           3       0.29      0.93      0.44    741116
           4       0.64      0.57      0.60   1089894
           5       0.53      0.51      0.52    760588
           6       0.64      0.48      0.55    846447
           7       0.77      0.36      0.49    837847
           8       0.58      0.52      0.55    624071
           9       0.71      0.33      0.45    754932

    accuracy                           0.52   6820828
   macro avg       0.62      0.62      0.57   6820828
weighted avg       0.62      0.52      0.53   6820828

Matthews Correlation Coefficient (MCC): 0.4742


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, matthews_corrcoef

# Load dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Separate features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# ❗ Convert all object columns to pandas 'category' dtype BEFORE train-test split
for col in X.select_dtypes(include="object").columns:
    X[col] = X[col].astype("category")
assert not any(X.dtypes == "object"), "Object columns remain!"

print(X.dtypes) 

# Fill missing in categorical columns with a special label
for col in X.select_dtypes(include="category").columns:
    X[col] = X[col].cat.add_categories(["__missing__"]).fillna("__missing__")

print(X.isna().sum())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define model
model = HistGradientBoostingClassifier(
    max_iter=100,
    max_depth=10,
    learning_rate=0.1,
    early_stopping=True,
    random_state=42
)
print("Final dtypes before training:\n", X_train.dtypes)
print("Sample row:\n", X_train.head(1).to_dict())

# Fit
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


timestep                  float64
cpm_id                   category
vehicle_id               category
detected_vehicle_id      category
object_type              category
detected_object_type     category
label_detected              int64
x_detected                float64
y_detected                float64
speed_detected            float64
heading_detected          float64
acceleration_detected     float64
is_self                     int64
sensor_id                category
sensor_type              category
sensor_range                int64
weather_conditions       category
visibility_range            int64
detection_confidence      float64
dtype: object
timestep                 0
cpm_id                   0
vehicle_id               0
detected_vehicle_id      0
object_type              0
detected_object_type     0
label_detected           0
x_detected               0
y_detected               0
speed_detected           0
heading_detected         0
acceleration_detected    0
is_self         

ValueError: could not convert string to float: 'CPM_truck43_586.0'

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and label
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Define preprocessing (OneHotEncoding)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# Create SGDClassifier pipeline
sgd_model = make_pipeline(
    preprocessor,
    SGDClassifier(
        loss="hinge",        # Linear SVM
        max_iter=1000,
        tol=1e-3,
        class_weight="balanced",  # optional if class imbalance
        random_state=42,
        n_jobs=-1
    )
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


# Fit
sgd_model.fit(X_train, y_train)

# Predict
y_pred = sgd_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       0.45      0.00      0.00   1156814
           1       0.09      0.69      0.16      6851
           2       0.13      0.95      0.23      2268
           3       0.26      0.42      0.32    741116
           4       0.54      0.04      0.08   1089894
           5       0.34      0.03      0.06    760588
           6       0.52      0.03      0.06    846447
           7       0.43      0.03      0.06    837847
           8       0.11      0.87      0.19    624071
           9       0.27      0.05      0.09    754932

    accuracy                           0.15   6820828
   macro avg       0.31      0.31      0.12   6820828
weighted avg       0.39      0.15      0.09   6820828

Matthews Correlation Coefficient (MCC): 0.0914


In [None]:
#RidgeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Preprocessing with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# RidgeClassifier pipeline
ridge_model = make_pipeline(
    preprocessor,
    RidgeClassifier(
        alpha=1.0,
        class_weight="balanced",  # good for imbalanced classes
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit model
ridge_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1156814
           1       0.35      1.00      0.52      6851
           2       0.91      1.00      0.95      2268
           3       1.00      1.00      1.00    741116
           4       1.00      1.00      1.00   1089894
           5       1.00      1.00      1.00    760588
           6       1.00      1.00      1.00    846447
           7       1.00      1.00      1.00    837847
           8       1.00      1.00      1.00    624071
           9       1.00      1.00      1.00    754932

    accuracy                           1.00   6820828
   macro avg       0.93      1.00      0.95   6820828
weighted avg       1.00      1.00      1.00   6820828

Matthews Correlation Coefficient (MCC): 0.9977


In [1]:
#RidgeClassifier - No Node Id
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Split features and target
# timestep,cpm_id,vehicle_id,detected_vehicle_id,object_type,detected_object_type,cls_detected,label_detected,x_detected,y_detected,speed_detected,heading_detected,acceleration_detected,is_self,sensor_id,sensor_type,sensor_range,weather_conditions,visibility_range,detection_confidence
X = df.drop(["cls_detected","vehicle_id","detected_vehicle_id"], axis=1)
y = df["cls_detected"]

# Identify categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Preprocessing with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# RidgeClassifier pipeline
ridge_model = make_pipeline(
    preprocessor,
    RidgeClassifier(
        alpha=1.0,
        class_weight="balanced",  # good for imbalanced classes
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit model
ridge_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1156814
           1       0.34      0.98      0.51      6851
           2       0.27      1.00      0.43      2268
           3       1.00      1.00      1.00    741116
           4       1.00      1.00      1.00   1089894
           5       1.00      1.00      1.00    760588
           6       1.00      1.00      1.00    846447
           7       1.00      1.00      1.00    837847
           8       1.00      1.00      1.00    624071
           9       1.00      1.00      1.00    754932

    accuracy                           1.00   6820828
   macro avg       0.86      1.00      0.89   6820828
weighted avg       1.00      1.00      1.00   6820828

Matthews Correlation Coefficient (MCC): 0.9964


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, matthews_corrcoef
from xgboost import XGBClassifier

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Preprocessing: OneHotEncode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

# XGBoost classifier pipeline
xgb_model = make_pipeline(
    preprocessor,
    XGBClassifier(
        objective="multi:softmax",         # or "multi:softprob" if you want probabilities
        num_class=len(y.unique()),
        eval_metric="mlogloss",
        max_depth=10,
        n_estimators=300,
        learning_rate=0.1,
        use_label_encoder=False,
        tree_method="hist",                # use "gpu_hist" if running on GPU
        random_state=42
    )
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Train model
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.49      0.72      0.58   1156814
           1       0.82      0.93      0.87      6851
           2       1.00      0.99      1.00      2268
           3       0.41      0.65      0.50    741116
           4       0.41      0.76      0.53   1089894
           5       0.61      0.24      0.35    760588
           6       0.55      0.47      0.51    846447
           7       0.61      0.34      0.44    837847
           8       0.69      0.24      0.36    624071
           9       0.70      0.19      0.29    754932

    accuracy                           0.48   6820828
   macro avg       0.63      0.55      0.54   6820828
weighted avg       0.55      0.48      0.46   6820828

Matthews Correlation Coefficient (MCC): 0.4115


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
from catboost import CatBoostClassifier, Pool

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Features and target
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Detect categorical columns (CatBoost accepts column names or indices)
cat_cols = X.select_dtypes(include="object").columns.tolist()
X[cat_cols] = X[cat_cols].fillna("missing").astype(str)  # Avoid NaNs + force string

# Fill missing values in categorical columns with a string
X[cat_cols] = X[cat_cols].fillna("missing")
# Optional: fill missing numeric values
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
X[num_cols] = X[num_cols].fillna(0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define CatBoost model
catboost_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=10,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=100,
    auto_class_weights="Balanced"
)

# Train the model (using Pool to specify categorical features)
catboost_model.fit(
    Pool(X_train, y_train, cat_features=cat_cols),
    eval_set=Pool(X_test, y_test, cat_features=cat_cols),
    use_best_model=True
)

# Predict
y_pred = catboost_model.predict(X_test)
y_pred = y_pred.flatten()  # Ensure it is 1D

# Evaluation
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


: 

In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
import pandas as pd

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")
X = df.drop("cls_detected", axis=1)
y = df["cls_detected"]

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # Fill missing, then ordinal encode categoricals
        ("cat", make_pipeline(
            SimpleImputer(strategy="constant", fill_value="missing"),
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        ), cat_cols),
        # Fill missing in numeric columns
        ("num", SimpleImputer(strategy="mean"), num_cols)
    ]
)

# Final model pipeline
nb_model = make_pipeline(
    preprocessor,
    GaussianNB()
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit and evaluate
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.18      0.64      0.28   1156814
           1       0.30      0.24      0.27      6851
           2       1.00      0.92      0.96      2268
           3       0.17      0.03      0.06    741116
           4       0.17      0.27      0.21   1089894
           5       0.00      0.00      0.00    760588
           6       0.20      0.09      0.13    846447
           7       0.21      0.10      0.14    837847
           8       0.21      0.00      0.01    624071
           9       0.00      0.00      0.00    754932

    accuracy                           0.18   6820828
   macro avg       0.25      0.23      0.21   6820828
weighted avg       0.15      0.18      0.12   6820828

Matthews Correlation Coefficient (MCC): 0.03


In [1]:
#TabNet
#pip install pytorch-tabnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, matthews_corrcoef
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Load data
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Target variable
y = df["cls_detected"]
X = df.drop("cls_detected", axis=1)

# Handle missing values
X = X.fillna("missing")

# Encode categorical columns and collect their indices
cat_cols = X.select_dtypes(include="object").columns.tolist()
cat_idxs = [X.columns.get_loc(col) for col in cat_cols]
cat_dims = []

# LabelEncode categorical columns
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    cat_dims.append(len(le.classes_))

# Encode target variable
y = LabelEncoder().fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.3, stratify=y, random_state=42
)

# TabNet model
tabnet_model = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,  # or use int or list like [3, 3, ...]
    n_d=32,
    n_a=32,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    seed=42,
    verbose=1
)

# Fit
tabnet_model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_test, y_test)],
    eval_name=["val"],
    eval_metric=["accuracy"],
    max_epochs=10,
    patience=1,
    batch_size=16384,
    virtual_batch_size=512
)

# Predict
y_pred = tabnet_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Matthews Correlation Coefficient (MCC):", round(matthews_corrcoef(y_test, y_pred), 4))




epoch 0  | loss: 1.46554 | val_accuracy: 0.71183 |  0:07:09s
epoch 1  | loss: 0.79507 | val_accuracy: 0.71679 |  0:13:45s
epoch 2  | loss: 0.791   | val_accuracy: 0.75038 |  0:21:23s
epoch 3  | loss: 0.72347 | val_accuracy: 0.78978 |  0:28:25s
epoch 4  | loss: 0.68141 | val_accuracy: 0.81268 |  0:35:32s
epoch 5  | loss: 0.6061  | val_accuracy: 0.82589 |  0:42:51s
epoch 6  | loss: 0.58179 | val_accuracy: 0.85227 |  0:50:49s
epoch 7  | loss: 0.54063 | val_accuracy: 0.84328 |  0:59:21s

Early stopping occurred at epoch 7 with best_epoch = 6 and best_val_accuracy = 0.85227




              precision    recall  f1-score   support

           0       0.89      0.86      0.87   1156814
           1       0.61      0.62      0.62      6851
           2       1.00      0.89      0.94      2268
           3       0.88      0.84      0.86    741116
           4       0.79      0.90      0.84   1089894
           5       0.88      0.79      0.83    760588
           6       0.89      0.87      0.88    846447
           7       0.82      0.87      0.85    837847
           8       0.80      0.81      0.81    624071
           9       0.88      0.84      0.86    754932

    accuracy                           0.85   6820828
   macro avg       0.85      0.83      0.84   6820828
weighted avg       0.85      0.85      0.85   6820828

Matthews Correlation Coefficient (MCC): 0.8304


In [1]:
#fastai
#pip install fastai
import pandas as pd
from fastai.tabular.all import *

# Load your dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Fill missing values
df = df.fillna("missing")

# Define target and categorical/numeric columns
target = 'cls_detected'
cat_names = df.select_dtypes(include='object').columns.tolist()
cont_names = df.select_dtypes(exclude='object').drop(columns=[target]).columns.tolist()

# Split train/valid sets (random 70/30 split)
splits = RandomSplitter(seed=42)(range_of(df))

# Create DataLoaders
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=target,
    splits=splits
)

dls = to.dataloaders(bs=1024)

# Build and train the model
learn = tabular_learner(
    dls,
    layers=[200, 100],
    metrics=[accuracy, RocAuc()],
    config={'ps': 0.1}  # Correct way to pass dropout
)

learn.fit_one_cycle(5)

# Evaluate
preds, targs = learn.get_preds()
interp = ClassificationInterpretation.from_learner(learn)
interp.print_classification_report()


epoch,train_loss,valid_loss,accuracy,roc_auc_score,time


OutOfMemoryError: CUDA out of memory. Tried to allocate 476.00 MiB (GPU 0; 1.95 GiB total capacity; 1.39 GiB already allocated; 451.62 MiB free; 1.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [1]:
#fastai no GPU
#pip install fastai
import pandas as pd
from fastai.tabular.all import *

# Load your dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")

# Fill missing values
df = df.fillna("missing")

# Define target and categorical/numeric columns
target = 'cls_detected'
cat_names = df.select_dtypes(include='object').columns.tolist()
cont_names = df.select_dtypes(exclude='object').drop(columns=[target]).columns.tolist()

# Split train/valid sets (random 70/30 split)
splits = RandomSplitter(seed=42)(range_of(df))

# Create DataLoaders
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=target,
    splits=splits
)

dls = to.dataloaders(bs=1024)

# Build and train the model
learn = tabular_learner(
    dls,
    layers=[200, 100],
    metrics=[accuracy, RocAuc()],
    config={'ps': 0.1}  # Correct way to pass dropout
)

learn.model.to("cpu")
learn.dls.device = torch.device("cpu")

learn.fit_one_cycle(5)

# Evaluate
preds, targs = learn.get_preds()
interp = ClassificationInterpretation.from_learner(learn)
interp.print_classification_report()


epoch,train_loss,valid_loss,accuracy,roc_auc_score,time


KeyboardInterrupt: 

In [1]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, precision_score, accuracy_score, matthews_corrcoef
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models.deepfm import DeepFM
from torch.utils.data import DataLoader, TensorDataset

# Load dataset
df = pd.read_csv("burst_adma_with_cpm_multi_sensors_cls/burst_adma_with_cpm_multi_sensors999_cls_all.csv")
df = df.fillna("missing")

# Setup
target = 'cls_detected'
sparse_features = df.select_dtypes(include='object').columns.tolist()
dense_features = df.select_dtypes(exclude='object').drop(columns=[target]).columns.tolist()

# Label encode categoricals
for col in sparse_features:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Encode target as int
df[target] = LabelEncoder().fit_transform(df[target])

# Split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Feature columns
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(), 4) for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]
feature_names = get_feature_names(fixlen_feature_columns)
num_classes = df[target].nunique()

# Inputs
train_input = {name: train[name].values for name in feature_names}
test_input = {name: test[name].values for name in feature_names}
X_train = [torch.tensor(train_input[name]) for name in feature_names]
X_test = [torch.tensor(test_input[name]) for name in feature_names]
y_train = torch.tensor(train[target].values).long()
y_test = torch.tensor(test[target].values).long()

# Custom DeepFM model for multiclass
class DeepFMMultiClass(nn.Module):
    def __init__(self, linear_feature_columns, dnn_feature_columns, num_classes, dnn_hidden_units=(128, 64)):
        super().__init__()
        self.dnn_hidden_units = dnn_hidden_units
        self.model = DeepFM(
            linear_feature_columns, dnn_feature_columns,
            task='binary', device='cpu',
            dnn_hidden_units=self.dnn_hidden_units
        )
        self.dnn = self.model.dnn
        self.linear_model = self.model.linear_model
        self.embedding_dict = self.model.embedding_dict
        self.feature_index = self.model.feature_index
        self.bn = nn.BatchNorm1d(self.dnn_hidden_units[-1])  # ✅ Now it works
        self.output = nn.Linear(self.dnn_hidden_units[-1], num_classes)


    def forward(self, X):
        return self.model(X)

# Instantiate model
model = DeepFMMultiClass(fixlen_feature_columns, fixlen_feature_columns, num_classes=num_classes)

# Optimizer & loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Dataloader
train_ds = TensorDataset(*X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)

# Training loop
model.train()
for epoch in range(5):
    total_loss = 0
    for batch in train_dl:
        *x_batch, y_batch = batch
        x_batch = [x if x.dtype == torch.float32 else x.long() for x in x_batch]

        # zip inputs to feature_names
        x_dict = {name: x.view(-1) if x.ndim == 2 and x.shape[1] == 1 else x for name, x in zip(feature_names, x_batch)}

        preds = model(x_dict)

        loss = criterion(preds, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    x_test_batch = [x.long() if x.dtype == torch.int64 else x.float() for x in X_test]
    logits = model(x_test_batch)
    y_pred = torch.argmax(logits, dim=1).numpy()
    y_true = y_test.numpy()

from sklearn.metrics import classification_report

print("\nPer-Class Classification Report:")
print(classification_report(y_true, y_pred, digits=4))

# Metrics
print(f"Accuracy:         {accuracy_score(y_true, y_pred):.4f}")
print(f"Macro Precision:  {precision_score(y_true, y_pred, average='macro'):.4f}")
print(f"Micro Precision:  {precision_score(y_true, y_pred, average='micro'):.4f}")
print(f"Macro F1-score:   {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"Micro F1-score:   {f1_score(y_true, y_pred, average='micro'):.4f}")
print(f"Matthews Corrcoef:{matthews_corrcoef(y_true, y_pred):.4f}")


2025-06-17 16:07:09.494855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history


TypeError: unhashable type: 'slice'