In [1]:
import pandas as pd
from pycaret.classification import setup, compare_models, tune_model, finalize_model, predict_model

# Load Mac dataset
file_path = 'dataset/system-logs/multiple-system-log-dataset/preprocessed-data/Mac_preprocessed.csv'
df = pd.read_csv(file_path)

# Check class distribution
print("Class distribution in Mac:")
print(df['error'].value_counts())

# Setup the data in PyCaret without automatic imbalance handling
setup_data = setup(data=df, target='error', session_id=42, fold=10, fix_imbalance=False, verbose=True)

# Compare and evaluate models
best_model = compare_models()
tuned_best_model = tune_model(best_model, optimize='AUC', n_iter=30)
final_model = finalize_model(tuned_best_model)

# Displaying the final model
print("Final tuned model performance for Mac:")
predict_model(final_model)


Class distribution in Mac:
0    90540
1    16661
Name: error, dtype: int64


Unnamed: 0,Description,Value
0,Session id,42
1,Target,error
2,Target type,Binary
3,Original data shape,"(107201, 5)"
4,Transformed data shape,"(107201, 5)"
5,Transformed train set shape,"(75040, 5)"
6,Transformed test set shape,"(32161, 5)"
7,Numeric features,1
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.94,0.9714,0.6147,0.9993,0.7611,0.7291,0.7573,0.198
knn,K Neighbors Classifier,0.9398,0.0,0.6126,0.9999,0.7597,0.7275,0.7561,0.141
et,Extra Trees Classifier,0.9216,0.0,0.4956,0.9999,0.6549,0.6177,0.6697,0.121
lda,Linear Discriminant Analysis,0.9211,0.9626,0.4975,0.9901,0.6622,0.623,0.6705,0.062
ridge,Ridge Classifier,0.9195,0.9626,0.4868,0.9904,0.6527,0.6131,0.6628,0.061
dt,Decision Tree Classifier,0.9033,0.0,0.3775,1.0,0.548,0.506,0.5819,0.061
svm,SVM - Linear Kernel,0.9033,0.9731,0.3775,1.0,0.548,0.506,0.5819,0.063
rf,Random Forest Classifier,0.9033,0.0,0.3775,1.0,0.548,0.506,0.5819,0.137
ada,Ada Boost Classifier,0.9033,0.6888,0.3775,1.0,0.548,0.506,0.5819,0.068
gbc,Gradient Boosting Classifier,0.9033,0.7341,0.3775,1.0,0.548,0.506,0.5819,0.159


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9391,0.97,0.6081,1.0,0.7563,0.7238,0.7531
1,0.9398,0.9719,0.6123,1.0,0.7596,0.7274,0.756
2,0.9414,0.9738,0.6226,1.0,0.7674,0.736,0.763
3,0.9411,0.9785,0.6218,0.9986,0.7664,0.7348,0.7618
4,0.9394,0.9689,0.6098,1.0,0.7576,0.7252,0.7543
5,0.9426,0.9712,0.6304,1.0,0.7733,0.7423,0.7683
6,0.9392,0.9732,0.6089,1.0,0.7569,0.7245,0.7537
7,0.9384,0.9753,0.6041,1.0,0.7532,0.7205,0.7504
8,0.9388,0.9693,0.6067,1.0,0.7552,0.7226,0.7521
9,0.9396,0.9738,0.6118,1.0,0.7592,0.7269,0.7556


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 30 candidates, totalling 300 fits
Final tuned model performance for Mac:


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9409,0.9746,0.6198,1.0,0.7653,0.7336,0.7611


Unnamed: 0,timestamp,tokens,warning,Label,error,prediction_label,prediction_score
59629,Jul 4 23:22:09,"['calvisitor-10-105-162-105', 'Microsoft', 'Wo...",0,Mac,1,1,0.9606
71127,Jul 5 18:20:12,"['calvisitor-10-105-162-98', 'secd[276]:', 'SO...",0,Mac,1,0,0.9922
76356,Jul 6 08:44:23,"['calvisitor-10-105-163-253', 'QQ[10018]:', 'F...",0,Mac,0,0,0.9905
30645,Jul 3 16:07:32,"['calvisitor-10-105-160-237', 'kernel[0]:', 'I...",0,Mac,0,0,1.0000
103985,Jul 8 03:32:51,"['calvisitor-10-105-162-228', 'QQ[10018]:', 'b...",0,Mac,0,0,0.9927
...,...,...,...,...,...,...,...
68885,Jul 5 16:12:49,"['authorMacBook-Pro', 'kernel[0]:', 'hibernate...",0,Mac,0,0,0.9944
9620,Jul 1 19:43:22,"['calvisitor-10-105-160-95', 'kernel[0]:', 'Ap...",0,Mac,0,0,1.0000
48573,Jul 4 11:42:12,"['calvisitor-10-105-162-105', 'kernel[0]:', 'A...",0,Mac,0,0,0.9970
78196,Jul 6 12:00:52,"['authorMacBook-Pro', 'kernel[0]:', 'USBMSC', ...",0,Mac,0,0,1.0000
