In [1]:
from sklearn.ensemble import VotingClassifier
from operator import itemgetter
import json
import importlib
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

class MiniAutoML:
    """Select the best classifier from a JSON list of candidate configurations.

    Every candidate is wrapped in a pipeline together with a shared
    preprocessing step, scored with 5-fold cross-validated balanced accuracy,
    and the top scorer is refitted on the full training data. Optionally a
    soft-voting ensemble of the five best candidates competes with the best
    single model.
    """

    def __init__(self, models_path, n_jobs=1, voting=False):
        """Load the candidate configurations and initialise the search state.

        Args:
            models_path: Path to a JSON file. ``fit`` iterates over its
                content and reads the ``name``, ``class`` and ``params``
                keys of every entry.
            n_jobs: Forwarded to ``cross_val_score`` and ``VotingClassifier``.
            voting: When true, and at least five candidates were scored,
                ``fit`` also evaluates a soft-voting ensemble of the top five.
        """
        # JSON is read as UTF-8 so that non-ASCII model names do not depend on
        # the platform's default encoding.
        with open(models_path, 'r', encoding='utf-8') as f:
            self.models_config = json.load(f)
        # Pipeline fitted on the full training data; None until fit() succeeds.
        self.best_model_pipeline = None
        # Mean cross-validated balanced accuracy of the selected pipeline.
        self.best_score = -1
        # One dict per successfully scored candidate: name, score, clf.
        self.results = []
        self.n_jobs = n_jobs
        self.voting = voting

    def _get_clf_class(self, class_string):
        """Resolve a dotted path such as ``package.module.ClassName`` to the class.

        Raises:
            ValueError: If ``class_string`` contains no dot.
            ImportError: If the module part cannot be imported.
            AttributeError: If the module has no attribute with that name.
        """
        module_name, class_name = class_string.rsplit('.', 1)
        module = importlib.import_module(module_name)
        return getattr(module, class_name)

    @staticmethod
    def _sanitize_params(class_path, params):
        """Return a copy of ``params`` with known-incompatible settings adjusted.

        A missing or ``None`` ``params`` is treated as an empty dict. The
        input mapping is never modified.
        """
        cleaned = dict(params or {})

        # NOTE(review): these rewrites presumably adapt configurations written
        # for older scikit-learn releases - confirm against the installed version.
        if 'HistGradientBoosting' in class_path and cleaned.get('loss') == 'auto':
            cleaned['loss'] = 'log_loss'

        tree_based_models = ('RandomForest', 'ExtraTrees', 'DecisionTree')
        if any(tree_model in class_path for tree_model in tree_based_models):
            cleaned.pop('min_impurity_split', None)
            cleaned.pop('tol', None)
            if cleaned.get('max_features') == 'auto':
                cleaned['max_features'] = 'sqrt'

        return cleaned

    @staticmethod
    def _make_pipeline(preprocessor, clf):
        """Chain the shared preprocessor with ``clf`` under the step name 'classifier'."""
        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', clf)
        ])

    def _create_preprocessing_pipeline(self, X):
        """Build the column-wise preprocessing step from the dtypes found in ``X``.

        ``int64``/``float64`` columns are mean-imputed and standardised;
        ``object``/``bool``/``category`` columns are imputed with the most
        frequent value and one-hot encoded (unknown categories are ignored at
        transform time). Columns of any other dtype are not assigned to either
        transformer.
        """
        numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X.select_dtypes(include=['object', 'bool', 'category']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        return ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

    def fit(self, X, y):
        """Score every configured candidate and refit the winner on ``(X, y)``.

        Candidates that cannot be imported, instantiated or cross-validated
        are reported on stdout and skipped. The best single model is always
        selected; the voting ensemble replaces it only when ``self.voting`` is
        set, at least five candidates were scored, and the ensemble's mean
        score is strictly higher.

        Raises:
            ValueError: If no candidate could be scored.
        """
        preprocessor = self._create_preprocessing_pipeline(X)
        self.results = []
        total = len(self.models_config)

        for i, model_entry in enumerate(self.models_config):
            name = model_entry['name']

            try:
                # Import and instantiation are inside the try so that a single
                # broken configuration entry does not abort the whole search.
                clf_class = self._get_clf_class(model_entry['class'])
                params = self._sanitize_params(model_entry['class'], model_entry.get('params'))
                clf = clf_class(**params)

                # error_score='raise' routes fold failures to the handler below
                # instead of letting them become NaN scores that would corrupt
                # the ranking.
                scores = cross_val_score(
                    self._make_pipeline(preprocessor, clf), X, y,
                    cv=5, scoring='balanced_accuracy', n_jobs=self.n_jobs,
                    error_score='raise',
                )
                mean_score = np.mean(scores)

                self.results.append({
                    'name': name,
                    'score': mean_score,
                    'clf': clf
                })

                print(f" {i+1}/{total} Model: {name} | Średnie Balanced Accuracy: {mean_score:.4f}")

            except Exception as e:
                print(f"Błąd przy modelu {name}: {e}")

        if not self.results:
            raise ValueError("Nie udało się wytrenować żadnego modelu.")

        # Always start from the best single model, so a pipeline exists even
        # when voting is disabled or fewer than five candidates were scored.
        self.results.sort(key=itemgetter('score'), reverse=True)
        best_single = self.results[0]
        self.best_model_pipeline = self._make_pipeline(preprocessor, best_single['clf'])
        self.best_score = best_single['score']

        if self.voting and len(self.results) >= 5:
            estimators = [(res['name'], res['clf']) for res in self.results[:5]]
            voting_clf = VotingClassifier(estimators=estimators, voting='soft', n_jobs=self.n_jobs)
            voting_pipeline = self._make_pipeline(preprocessor, voting_clf)

            print("\n Porównanie najlepszego dotychczas modelu z VotingClassifier z najlepszych 5 modeli:")

            try:
                v_scores = cross_val_score(
                    voting_pipeline, X, y,
                    cv=5, scoring='balanced_accuracy', n_jobs=self.n_jobs,
                    error_score='raise',
                )
                v_score = np.mean(v_scores)

                print(f" Wynik Voting: {v_score:.4f}")
                print(f" Wynik Najlepszego Pojedynczego Modelu ({best_single['name']}): {best_single['score']:.4f}")

                if v_score > best_single['score']:
                    print("Wybrano VotingClassifier.")
                    self.best_model_pipeline = voting_pipeline
                    self.best_score = v_score
                else:
                    print(f"Pozostawiono pojedynczy model: {best_single['name']}")

            except Exception as e:
                # The best single model chosen above stays selected.
                print(f" Błąd przy ocenie Votinga: {e}. Wybieram najlepszy model.")

        self.best_model_pipeline.fit(X, y)

    def predict(self, X):
        """Predict labels for ``X`` with the selected pipeline.

        Raises:
            ValueError: If ``fit`` has not produced a pipeline yet.
        """
        if self.best_model_pipeline is None:
            raise ValueError("Model nie został jeszcze dopasowany.")
        return self.best_model_pipeline.predict(X)

    def predict_proba(self, X):
        """Return the selected pipeline's ``predict_proba`` output for ``X``.

        Raises:
            ValueError: If ``fit`` has not produced a pipeline yet, or the
                selected classifier has no ``predict_proba`` attribute.
        """
        if self.best_model_pipeline is None:
            raise ValueError("Model nie został jeszcze dopasowany.")
        if not hasattr(self.best_model_pipeline.named_steps['classifier'], 'predict_proba'):
            raise ValueError(f"Wybrany model {self.best_model_pipeline.named_steps['classifier'].__class__.__name__} nie obsługuje predict_proba.")
        return self.best_model_pipeline.predict_proba(X)

In [2]:
# Load the features and flatten the target file into a 1-D label array
# (presumably y.csv holds a single label column - verify against the data).
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv').to_numpy().ravel()

# Hold out 20% of the rows for the final evaluation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Search the candidates listed in models.json using all available cores, with
# the voting ensemble enabled.
selector = MiniAutoML(models_path='models.json', n_jobs=-1, voting=True)
selector.fit(X_train, y_train)

 1/34 Model: kr_vs_kp_HistGradientBoostingClassifier | Średnie Balanced Accuracy: 0.5308
 2/34 Model: credit_approval_HistGradientBoostingClassifier | Średnie Balanced Accuracy: 0.5606
 3/34 Model: credit_g_RandomForestClassifier | Średnie Balanced Accuracy: 0.5540
 4/34 Model: diabetes_RandomForestClassifier | Średnie Balanced Accuracy: 0.5496
 5/34 Model: spambase_RandomForestClassifier | Średnie Balanced Accuracy: 0.5544
 6/34 Model: tic_tac_toe_RandomForestClassifier | Średnie Balanced Accuracy: 0.5536
 7/34 Model: electricity_HistGradientBoostingClassifier | Średnie Balanced Accuracy: 0.5552
 8/34 Model: sick_RandomForestClassifier | Średnie Balanced Accuracy: 0.5369
 9/34 Model: pc4_RandomForestClassifier | Średnie Balanced Accuracy: 0.5517
 10/34 Model: pc3_RandomForestClassifier | Średnie Balanced Accuracy: 0.5639
 11/34 Model: jm1_HistGradientBoostingClassifier | Średnie Balanced Accuracy: 0.5538
 12/34 Model: kc2_RandomForestClassifier | Średnie Balanced Accuracy: 0.5593
 13/

In [4]:
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

# Predict once and reuse the result; the previous version ran the same
# prediction twice and discarded the first output.
y_pred = selector.predict(X_test)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Dokładność na zbiorze testowym: {balanced_accuracy:.4f}")

probs = selector.predict_proba(X_test)
# Column 1 is used as the positive-class probability - this assumes a binary
# target; TODO confirm.
probs_positive = probs[:, 1]
auc_score = roc_auc_score(y_test, probs_positive)
print(f"ROC AUC Score: {auc_score:.4f}")

Dokładność na zbiorze testowym: 0.5624
ROC AUC Score: 0.5803
