In [1]:
import pandas as pd
from pycaret.classification import setup, compare_models, tune_model, finalize_model, predict_model

# Load Linux dataset
file_path = 'dataset/system-logs/multiple-system-log-dataset/preprocessed-data/Linux_preprocessed.csv'
df = pd.read_csv(file_path)

# Check class distribution
print("Class distribution in Linux:")
print(df['error'].value_counts())

# Setup the data in PyCaret without automatic imbalance handling
setup_data = setup(data=df, target='error', session_id=42, fold=10, fix_imbalance=False, verbose=True)

# Compare and evaluate models
best_model = compare_models()
tuned_best_model = tune_model(best_model, optimize='AUC', n_iter=30)
final_model = finalize_model(tuned_best_model)

# Displaying the final model
print("Final tuned model performance for Linux:")
predict_model(final_model)


Class distribution in Linux:
0    25152
1      415
Name: error, dtype: int64


Unnamed: 0,Description,Value
0,Session id,42
1,Target,error
2,Target type,Binary
3,Original data shape,"(25567, 5)"
4,Transformed data shape,"(25567, 5)"
5,Transformed train set shape,"(17896, 5)"
6,Transformed test set shape,"(7671, 5)"
7,Numeric features,1
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.9887,0.0,0.5621,0.7041,0.6173,0.6117,0.6196,0.019
knn,K Neighbors Classifier,0.9877,0.0,0.2414,1.0,0.3847,0.3809,0.4839,0.091
dt,Decision Tree Classifier,0.9875,0.0,0.231,1.0,0.3703,0.3666,0.4722,0.021
svm,SVM - Linear Kernel,0.9875,0.9107,0.231,1.0,0.3703,0.3666,0.4722,0.018
ridge,Ridge Classifier,0.9875,0.9104,0.231,1.0,0.3703,0.3666,0.4722,0.019
rf,Random Forest Classifier,0.9875,0.0,0.231,1.0,0.3703,0.3666,0.4722,0.037
ada,Ada Boost Classifier,0.9875,0.6155,0.231,1.0,0.3703,0.3666,0.4722,0.019
gbc,Gradient Boosting Classifier,0.9875,0.6155,0.231,1.0,0.3703,0.3666,0.4722,0.037
lda,Linear Discriminant Analysis,0.9875,0.894,0.231,1.0,0.3703,0.3666,0.4722,0.019
et,Extra Trees Classifier,0.9875,0.0,0.231,1.0,0.3703,0.3666,0.4722,0.033


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9899,0.0,0.5517,0.7619,0.64,0.635,0.6436
1,0.9866,0.0,0.4828,0.6087,0.5385,0.5318,0.5354
2,0.9888,0.0,0.6552,0.6552,0.6552,0.6495,0.6495
3,0.986,0.0,0.5172,0.5769,0.5455,0.5384,0.5392
4,0.9905,0.0,0.6897,0.7143,0.7018,0.6969,0.697
5,0.9883,0.0,0.5862,0.6538,0.6182,0.6122,0.6132
6,0.9883,0.0,0.5172,0.6818,0.5882,0.5824,0.5881
7,0.9905,0.0,0.4483,0.9286,0.6047,0.6004,0.6416
8,0.9922,0.0,0.5862,0.8947,0.7083,0.7045,0.7208
9,0.9888,0.0,0.5517,0.6957,0.6154,0.6098,0.614


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 28 candidates, totalling 280 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Final tuned model performance for Linux:


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.9867,0.866,0.52,0.6075,0.5603,0.5536,0.5554


Unnamed: 0,timestamp,tokens,warning,Label,error,prediction_label,prediction_score
17753,Dec 3 09:46:33,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
17482,Dec 2 23:35:43,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
9773,Nov 20 05:40:50,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
705,Jun 11 04:09:53,"['combo', 'su(pam_unix)[5961]:', 'session', 'c...",0,Linux,0,0,1.0
12292,Nov 22 01:25:26,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
...,...,...,...,...,...,...,...
13327,Nov 22 22:20:23,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
12133,Nov 21 18:00:20,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
1311,Jun 24 18:55:18,"['combo', 'ftpd[28574]:', 'connection', 'from'...",0,Linux,0,0,1.0
17224,Dec 1 20:40:48,"['combo', 'kernel:', 'Out', 'of', 'Memory:', '...",0,Linux,0,0,1.0
