In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Map each operating system name to the CSV file holding its preprocessed logs.
# All four files live in the same directory and share one naming scheme, so
# the table is derived from the system names.
datasets = {
    system: f'dataset/system-logs/multiple-system-log-dataset/preprocessed-data/{system}_preprocessed.csv'
    for system in ('Android', 'Linux', 'Mac', 'Windows')
}

# Candidate classifiers to compare, keyed by display name.
# Every estimator that draws random numbers is seeded with the same value the
# rest of the script uses (42), so the model ranking is reproducible from run
# to run. LogisticRegression (default solver), MultinomialNB and
# KNeighborsClassifier are left at their defaults.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    # probability=True is required for predict_proba; the internal
    # calibration it triggers is randomized, hence the seed.
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
}

# Sampler-aware pipeline: unlike sklearn's Pipeline it accepts SMOTE as a step
# and applies it only while fitting, never when predicting or scoring.
from imblearn.pipeline import Pipeline as ImbPipeline

# Hyper-parameter grids, keyed by the same display names as `models`.
# Built once because it does not depend on the dataset being processed.
param_grid = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10]},
    'Random Forest': {'n_estimators': [50, 100, 200]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200]},
    'Support Vector Machine': {'C': [0.1, 1, 10]},
    'Decision Tree': {'max_depth': [None, 10, 20, 30]},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7, 9]},
    'AdaBoost': {'n_estimators': [50, 100, 200]},
    'Extra Trees': {'n_estimators': [50, 100, 200]},
    'Multinomial Naive Bayes': {'alpha': [0.1, 0.5, 1.0, 2.0]}
}


def _build_pipeline(estimator, numerical_cols, categorical_cols):
    """Return a preprocess -> SMOTE -> estimator pipeline.

    Bundling the three steps means the scaler, the encoder and the
    oversampler are re-fitted on the training part of every CV fold, so no
    information from validation or test rows reaches the model.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])
    return ImbPipeline(steps=[
        ('preprocess', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('model', estimator)
    ])


# For each system: pick the best classifier by cross-validated ROC AUC, tune
# it, and report its performance on an untouched hold-out split.
for system_name, file_path in datasets.items():
    df = pd.read_csv(file_path)

    # Check class distribution
    print(f"Class distribution in {system_name}:")
    print(df['error'].value_counts())

    # Split data into features and target
    X = df.drop('error', axis=1)
    y = df['error']

    # Identify categorical and numerical columns
    categorical_cols = X.select_dtypes(include=['object']).columns
    numerical_cols = X.select_dtypes(include=['number']).columns

    # Hold out the test set BEFORE any fitting or resampling so that it keeps
    # the real class balance and contains no synthetic SMOTE rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Compare models
    best_name = None
    best_model = None
    best_score = float('-inf')
    for name, model in models.items():
        candidate = _build_pipeline(model, numerical_cols, categorical_cols)
        try:
            scores = cross_val_score(candidate, X_train, y_train, cv=10, scoring='roc_auc')
        except ValueError as exc:
            # Raised when a model cannot be fitted on any fold (for example
            # MultinomialNB, which rejects the negative values produced by
            # StandardScaler). Skip that model instead of aborting the run.
            print(f"{name} skipped: {exc}")
            continue

        mean_auc = scores.mean()
        print(f"{name} AUC: {mean_auc} (+/- {scores.std()})")
        if pd.isna(mean_auc):
            # Failed folds are scored as NaN; such a model must not win.
            continue
        if mean_auc > best_score:
            best_score = mean_auc
            best_name = name
            best_model = model

    if best_name is None:
        print(f"No model could be evaluated for {system_name}; skipping.")
        continue

    # Tune the best model. The grid is looked up by display name (the key
    # used in both `models` and `param_grid`), and each parameter is prefixed
    # with the pipeline step name so GridSearchCV routes it to the estimator.
    search_space = {
        f"model__{param}": values
        for param, values in param_grid.get(best_name, {}).items()
    }
    grid_search = GridSearchCV(
        _build_pipeline(best_model, numerical_cols, categorical_cols),
        search_space, cv=10, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    final_model = grid_search.best_estimator_

    # Evaluate the final model on the raw hold-out rows; the pipeline applies
    # the fitted preprocessing and skips SMOTE at prediction time.
    y_pred = final_model.predict(X_test)
    # Column 1 is the probability of the second class in `classes_` order.
    y_pred_proba = final_model.predict_proba(X_test)[:, 1]
    print(f"Final tuned model performance for {system_name}:")
    print(classification_report(y_test, y_pred))
    print(f"AUC: {roc_auc_score(y_test, y_pred_proba)}")


Class distribution in Android:
0    1400773
1     154232
Name: error, dtype: int64


NameError: name 'SMote' is not defined