In [57]:
import pandas as pd
import os

df = pd.read_csv(os.path.join(os.getcwd(), 'uci_malware_detection.csv'))
X = df.drop('Label', axis=1)
y = df['Label']

df.head(), df['Label'].value_counts()

(           Label  F_1  F_2  F_3  F_4  F_5  F_6  F_7  F_8  F_9  ...  F_522  \
 0  non-malicious    1    0    1    0    1    0    1    0    1  ...      0   
 1  non-malicious    1    0    1    0    1    0    1    0    1  ...      0   
 2  non-malicious    1    0    1    0    1    0    1    0    1  ...      0   
 3  non-malicious    1    0    1    0    1    0    1    0    1  ...      0   
 4  non-malicious    1    0    1    0    1    0    1    0    1  ...      0   
 
    F_523  F_524  F_525  F_526  F_527  F_528  F_529  F_530  F_531  
 0      0      0      0      0      0      0      0      0      0  
 1      0      0      0      0      0      0      0      0      0  
 2      0      0      0      0      0      0      0      0      0  
 3      0      0      0      0      0      0      0      0      0  
 4      0      0      0      0      0      0      0      0      0  
 
 [5 rows x 532 columns],
 Label
 malicious        301
 non-malicious     72
 Name: count, dtype: int64)

In [90]:
# logistic

from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter


from sklearn.model_selection import cross_validate

scale = X.select_dtypes(include=['int64', 'Float64']).columns

preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), scale)
])

pipeline_logistic = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('logistic', LogisticRegression(random_state=42, multi_class='auto', max_iter=1000))
])

pipeline_logistic.fit(X, y)
smote_step = pipeline_logistic.named_steps['smote']
X_rexampled, y_resampled = smote_step.fit_resample(preprocessor.fit_transform(X), y)

Distribution of classes before ADASYN: Counter({1: 301, 0: 72})
after ADASYN: Counter({0: 301, 1: 301}) 



In [40]:
# rf

from sklearn.ensemble import RandomForestClassifier

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', ADASYN(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

rf_cross = cross_validate(pipeline_rf, X, y, scoring = scoring)

rf_mean = {key: value.mean() for key, value in rf_cross.items()}

for metric, mean in rf_mean.items():
    print(f'{metric}: {mean:.4f}')

fit_time: 0.5270
score_time: 0.0409
test_accuracy: 0.9920
test_precision_macro: 0.9901
test_recall_macro: 0.9845
test_f1_macro: 0.9871


In [74]:
# rf

from xgboost import XGBClassifier

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', ADASYN(random_state=42)),
    ('xbg', XGBClassifier(random_state=42))
])
y, _ = pd.factorize(y)
xgb_cross = cross_validate(pipeline_xgb, X, y, scoring = scoring)

xgb_mean = {key: value.mean() for key, value in xgb_cross.items()}

for metric, mean in xgb_mean.items():
    print(f'{metric}: {mean:.4f}')

fit_time: 0.5401
score_time: 0.0340
test_accuracy: 0.9920
test_precision_macro: 0.9906
test_recall_macro: 0.9840
test_f1_macro: 0.9866


In [76]:
from catboost import CatBoostClassifier

pipeline_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', ADASYN(random_state=42)),
    ('xbg', CatBoostClassifier(random_state=42))
])
# y, _ = pd.factorize(y)
cat_cross = cross_validate(pipeline_cat, X, y, scoring = scoring)

cat_mean = {key: value.mean() for key, value in cat_cross.items()}

for metric, mean in cat_mean.items():
    print(f'{metric}: {mean:.4f}')

Learning rate set to 0.007551
0:	learn: 0.6756573	total: 56.1ms	remaining: 56s
1:	learn: 0.6573516	total: 99.3ms	remaining: 49.6s
2:	learn: 0.6389056	total: 140ms	remaining: 46.5s
3:	learn: 0.6214674	total: 175ms	remaining: 43.7s
4:	learn: 0.6046172	total: 218ms	remaining: 43.3s
5:	learn: 0.5893257	total: 258ms	remaining: 42.7s
6:	learn: 0.5729385	total: 299ms	remaining: 42.5s
7:	learn: 0.5566486	total: 338ms	remaining: 41.9s
8:	learn: 0.5440777	total: 373ms	remaining: 41s
9:	learn: 0.5289392	total: 423ms	remaining: 41.9s
10:	learn: 0.5143547	total: 472ms	remaining: 42.5s
11:	learn: 0.5012590	total: 523ms	remaining: 43.1s
12:	learn: 0.4882578	total: 567ms	remaining: 43s
13:	learn: 0.4755800	total: 616ms	remaining: 43.4s
14:	learn: 0.4623877	total: 664ms	remaining: 43.6s
15:	learn: 0.4530367	total: 717ms	remaining: 44.1s
16:	learn: 0.4407571	total: 760ms	remaining: 43.9s
17:	learn: 0.4300570	total: 815ms	remaining: 44.4s
18:	learn: 0.4174051	total: 875ms	remaining: 45.2s
19:	learn: 0.40

In [83]:
from sklearn.ensemble import VotingClassifier

preprocess_voting = ColumnTransformer([
    ('scaler', StandardScaler(), scale)
])

ensemble = VotingClassifier([
    ('log', LogisticRegression(random_state=42, multi_class='auto', max_iter=1000)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('cat', CatBoostClassifier(random_state=42))
])

vc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('adasyn', ADASYN(random_state=42)),
    ('ensemble', ensemble)
])

vc_cross = cross_validate(vc_pipeline, X, y, scoring=scoring)

vc_mean = {key: value.mean() for key, value in vc_cross.items()}

for metric, mean in vc_mean.items():
    print(f'{metric}: {mean:.4f}')

Learning rate set to 0.007551
0:	learn: 0.6756573	total: 40.5ms	remaining: 40.5s
1:	learn: 0.6573516	total: 68.8ms	remaining: 34.3s
2:	learn: 0.6389056	total: 97.8ms	remaining: 32.5s
3:	learn: 0.6214674	total: 125ms	remaining: 31.1s
4:	learn: 0.6046172	total: 150ms	remaining: 29.8s
5:	learn: 0.5893257	total: 175ms	remaining: 29.1s
6:	learn: 0.5729385	total: 214ms	remaining: 30.3s
7:	learn: 0.5566486	total: 247ms	remaining: 30.7s
8:	learn: 0.5440777	total: 278ms	remaining: 30.6s
9:	learn: 0.5289392	total: 313ms	remaining: 31s
10:	learn: 0.5143547	total: 344ms	remaining: 30.9s
11:	learn: 0.5012590	total: 374ms	remaining: 30.8s
12:	learn: 0.4882578	total: 400ms	remaining: 30.4s
13:	learn: 0.4755800	total: 423ms	remaining: 29.8s
14:	learn: 0.4623877	total: 450ms	remaining: 29.5s
15:	learn: 0.4530367	total: 474ms	remaining: 29.2s
16:	learn: 0.4407571	total: 507ms	remaining: 29.3s
17:	learn: 0.4300570	total: 539ms	remaining: 29.4s
18:	learn: 0.4174051	total: 569ms	remaining: 29.4s
19:	learn: