In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')


In [None]:
train = pd.read_csv('/kaggle/input/firsttry-train/training_set.csv')
test = pd.read_csv('/kaggle/input/firsttry-train/test.csv')

In [None]:

print(train.info())
print(train.head())

print(train.isnull().sum())
print(train["traffic_label"].value_counts(normalize=True) * 100)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 44 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Id                 175341 non-null  int64  
 1   time_span          175341 non-null  float64
 2   proto_label        175341 non-null  object 
 3   svc_type           175341 non-null  object 
 4   conn_state         175341 non-null  object 
 5   src_pkts           175341 non-null  int64  
 6   dst_pkts           175341 non-null  int64  
 7   src_bytes          175341 non-null  int64  
 8   dst_bytes          175341 non-null  int64  
 9   conn_rate          175341 non-null  float64
 10  src_ttl            175341 non-null  int64  
 11  dst_ttl            175341 non-null  int64  
 12  src_load           175341 non-null  float64
 13  dst_load           175341 non-null  float64
 14  src_loss           175341 non-null  int64  
 15  dst_loss           175341 non-null  int64  
 16  sr

In [None]:
test_ids = test["Id"].copy()
train.drop(columns=['Id'], inplace=True)
test.drop(columns=['Id'], inplace=True)
y = train['traffic_label']
X = train.drop(columns=['traffic_label'])

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['proto_label', 'svc_type', 'conn_state']

for col in categorical_cols:
    le = LabelEncoder()

    X[col] = le.fit_transform(X[col])
    test[col] = test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

    X[col] = X[col].astype(int)
    test[col] = test[col].astype(int)

print("Updated Data Types:\n", X.dtypes)


Updated Data Types:
 time_span            float64
proto_label            int64
svc_type               int64
conn_state             int64
src_pkts               int64
dst_pkts               int64
src_bytes              int64
dst_bytes              int64
conn_rate            float64
src_ttl                int64
dst_ttl                int64
src_load             float64
dst_load             float64
src_loss               int64
dst_loss               int64
src_intpkt           float64
dst_intpkt           float64
src_jitter           float64
dst_jitter           float64
src_win                int64
src_tcp_base           int64
dst_tcp_base           int64
dst_win                int64
tcp_rtt              float64
tcp_synack           float64
tcp_ackdata          float64
src_mean               int64
dst_mean               int64
trans_level            int64
resp_body_len          int64
ct_srv_srcX            int64
ct_st_ttlX             int64
ct_dst_ltmX            int64
ct_src_dport_ltmX     

In [None]:
print(X.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 42 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   time_span          175341 non-null  float64
 1   proto_label        175341 non-null  int64  
 2   svc_type           175341 non-null  int64  
 3   conn_state         175341 non-null  int64  
 4   src_pkts           175341 non-null  int64  
 5   dst_pkts           175341 non-null  int64  
 6   src_bytes          175341 non-null  int64  
 7   dst_bytes          175341 non-null  int64  
 8   conn_rate          175341 non-null  float64
 9   src_ttl            175341 non-null  int64  
 10  dst_ttl            175341 non-null  int64  
 11  src_load           175341 non-null  float64
 12  dst_load           175341 non-null  float64
 13  src_loss           175341 non-null  int64  
 14  dst_loss           175341 non-null  int64  
 15  src_intpkt         175341 non-null  float64
 16  ds

In [None]:
from sklearn.preprocessing import LabelEncoder

le_traffic = LabelEncoder()
y = le_traffic.fit_transform(train["traffic_label"])

print("Label Encoding Mapping:")
print(dict(zip(le_traffic.classes_, le_traffic.transform(le_traffic.classes_))))

print("Before Encoding:", train["traffic_label"].unique()[:10])
print("After Encoding:", y[:10])


Label Encoding Mapping:
{'Analysis': 0, 'Backdoor': 1, 'DoS': 2, 'Exploits': 3, 'Fuzzers': 4, 'Generic': 5, 'Normal': 6, 'Reconnaissance': 7, 'Shellcode': 8, 'Worms': 9}
Before Encoding: ['Normal' 'Backdoor' 'Analysis' 'Fuzzers' 'Shellcode' 'Reconnaissance'
 'Exploits' 'DoS' 'Worms' 'Generic']
After Encoding: [6 6 6 6 6 6 6 6 6 6]


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)


In [None]:
print("Remaining object columns in X_train:", X_train.select_dtypes(include=['object']).columns.tolist())
print("Remaining object columns in X_val:", X_val.select_dtypes(include=['object']).columns.tolist())


Remaining object columns in X_train: []
Remaining object columns in X_val: []


In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
#best 79.93 aaya
base_models = [
    ('xgb', XGBClassifier(
        objective='multi:softprob',
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )),
 ('lgbm', LGBMClassifier(
        objective='multiclass',
        random_state=42
    )),
    ('extratrees', ExtraTreesClassifier(
        n_estimators=100,
        random_state=42
    )),
    ('mlp', MLPClassifier(
        hidden_layer_sizes=(64, 32),
        alpha=0.01,
        learning_rate_init=0.001,
        max_iter=500,
        random_state=42
    )),
    ('hgb', HistGradientBoostingClassifier(
        learning_rate=0.03,
        max_iter=300,
        max_depth=6,
        random_state=42
    )),
    ('catboost', CatBoostClassifier(
        loss_function='MultiClass',
        random_state=42,
        verbose=0
    ))
]

meta_model = LogisticRegression(max_iter=1000, multi_class='ovr')
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=2
)

In [None]:
stack.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6143
[LightGBM] [Info] Number of data points in the train set: 157806, number of used features: 42
[LightGBM] [Info] Start training from score -4.473580
[LightGBM] [Info] Start training from score -4.609654
[LightGBM] [Info] Start training from score -2.660023
[LightGBM] [Info] Start training from score -1.658404
[LightGBM] [Info] Start training from score -2.266222
[LightGBM] [Info] Start training from score -1.477847
[LightGBM] [Info] Start training from score -1.141375
[LightGBM] [Info] Start training from score -2.816199
[LightGBM] [Info] Start training from score -5.041564
[LightGBM] [Info] Start training from score -7.206948
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005879 seconds.
Yo

In [None]:
y_val_pred = stack.predict(X_val)

In [None]:
from sklearn.metrics import f1_score

final_f1_score = f1_score(y_val, y_val_pred, average='weighted')
print("Final Weighted F1-Score:", final_f1_score)



Final Weighted F1-Score: 0.8271828797785613


In [None]:
test_preds = stack.predict(test)


In [None]:
test_preds_decoded = le_traffic.inverse_transform(test_preds)

submission = pd.DataFrame({"Id": test_ids, "traffic_label": test_preds_decoded})
submission.to_csv("submission.csv", index=False)


In [None]:
from IPython.display import FileLink
FileLink("submission.csv")

In [None]:

import pandas as pd

label_counts = pd.Series(test_preds_decoded).value_counts()

print(label_counts)

Normal            30069
Generic           18421
Exploits          13082
Fuzzers           11072
DoS                3420
Reconnaissance     3049
Analysis           1679
Backdoor            907
Shellcode           617
Worms                16
Name: count, dtype: int64
