In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np


from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler,FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer

from sklearn.metrics import recall_score, accuracy_score, precision_score, roc_auc_score, roc_curve, balanced_accuracy_score, mean_squared_error, f1_score

In [8]:
def weighted_mean(x: np.array, theta: float) -> float:
    """Return the mean of *x* penalised by its spread.

    The result is ``mean(x) - theta * std(x)``, so a larger ``theta`` makes
    unstable (high-variance) score vectors rank lower.

    Args:
        x: Scores to aggregate (e.g. per-fold cross-validation scores).
        theta: Weight of the standard-deviation penalty.

    Returns:
        The penalised mean.
    """
    center = np.mean(x)
    spread = np.std(x)
    return center - theta * spread

In [9]:
def top10(topk, all_models):
    """Encode membership of each model in *topk* as a 0/1 vector.

    The output has one entry per element of ``all_models`` (in the same
    order): 1 when ``m in topk`` holds, otherwise 0.

    Args:
        topk: Container tested with ``in``. When it is a string, the test is
            a substring match.
        all_models: Iterable of model names defining the output positions.

    Returns:
        A numpy array of 0s and 1s with ``len(all_models)`` entries.
    """
    flags = [int(model in topk) for model in all_models]
    return np.array(flags)

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import importlib
from sklearn.ensemble import VotingClassifier, StackingClassifier
class MiniAutoML:
    """Small AutoML wrapper that picks one model out of a configured portfolio.

    The portfolio is read from a JSON file (a list of entries with ``name``,
    ``class`` as a dotted import path, and optional ``params``). ``fit`` uses a
    KNN meta-learner trained on ``meta_dataset.csv`` to shortlist five
    candidates, cross-validates them, builds voting and stacking ensembles from
    the three best, and keeps the highest-scoring estimator for prediction.
    """

    def __init__(self, models_config, theta=0.25):
        """Load the model portfolio and build one pipeline per entry.

        Args:
            models_config: Path to the JSON file describing the models.
            theta: Weight of the standard-deviation penalty applied to the
                cross-validation scores (passed to ``weighted_mean``).
        """
        self.models_config = models_config
        self.theta = theta
        # List of (name, pipeline, class_name) tuples.
        self.models = []
        # Estimator selected by fit(); None until fit() has run.
        self.models_best = None
        # Column labels that get log-transformed, decided in fit().
        self.skew_pos_data = None
        self.skew_neg_data = None
        # Per-column training maxima used to reflect negatively skewed columns.
        self.skew_neg_max = None
        self.models_load()
        self.models_build()

    def models_load(self):
        """Read the JSON model configuration into ``self.cfg``."""
        # Function-scope import: the visible file never imports json.
        import json

        with open(self.models_config, 'r', encoding='utf-8') as file:
            self.cfg = json.load(file)

    def models_build(self):
        """Instantiate every configured model inside a preprocessing pipeline."""
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        preprocessor = ColumnTransformer([
            ('numerical', numerical_transformer, make_column_selector(dtype_include=np.number)),
            # Same categorical dtypes that fit() excludes from the skew
            # analysis; selecting only ``object`` would drop ``category``
            # columns from the model input.
            ('categorical', categorical_transformer,
             make_column_selector(dtype_include=['object', 'category']))
        ])

        for m in self.cfg:
            name = m.get('name')
            module_name, class_name = m.get('class').rsplit(".", 1)
            module = importlib.import_module(module_name)
            Class = getattr(module, class_name)
            params = m.get('params', {})
            model = Pipeline([
                ('pre', preprocessor),
                (name, Class(**params))
            ])
            self.models.append((name, model, class_name))

        # Remember how many entries are configured base models so that fit()
        # can discard ensembles appended by an earlier fit() call.
        self._n_base_models = len(self.models)

    def _transform_skewed(self, X):
        """Return a copy of *X* with the skewed columns log-transformed.

        Negatively skewed columns are reflected around the maximum seen during
        training before ``log1p``; positively skewed columns get a plain
        ``log1p``. The input object is never modified.
        """
        X_new = pd.DataFrame(X).copy()
        if self.skew_neg_data:
            cols = self.skew_neg_data
            reflected = self.skew_neg_max - X_new[cols]
            # Values above the training maximum would make the argument of
            # log1p negative, so clip the reflected value at zero.
            X_new[cols] = np.log1p(reflected.clip(lower=0))
        if self.skew_pos_data:
            cols = self.skew_pos_data
            X_new[cols] = np.log1p(X_new[cols])
        return X_new

    def fit(self, X_train, y_train):
        """Select and train the best model for the given training data.

        Reads ``meta_dataset.csv`` from the working directory, prints the
        shortlisted model names and the cross-validation scores, and stores
        the winning estimator in ``self.models_best``.

        Args:
            X_train: Training features (anything ``pd.DataFrame`` accepts).
            y_train: Training labels (anything ``pd.Series`` accepts).
        """
        model_names = [m.get('name') for m in self.cfg]
        # Drop ensembles left over from a previous fit() call.
        self.models = self.models[:getattr(self, '_n_base_models', len(self.models))]

        # ----------------------- META-LEARNER ---------------------------------
        meta_df = pd.read_csv("meta_dataset.csv")
        y = np.vstack([top10(t, model_names) for t in meta_df["top10_models"]])
        X = meta_df.drop(columns=["top10_models"])
        knn = Pipeline([
            ('scaler', StandardScaler()),
            ('knn', KNeighborsClassifier(n_neighbors=3))
        ])
        knn.fit(X, y)

        # Work on a copy so the caller's data is never modified.
        X_train = pd.DataFrame(X_train).copy()
        y_train = pd.Series(y_train)

        # Decide once, on the untouched data, which numeric columns are
        # significantly skewed; the same lists feed the meta-features and
        # the log transform.
        skewness = X_train.skew(numeric_only=True)
        self.skew_pos_data = list(skewness.index[skewness > 1])
        self.skew_neg_data = list(skewness.index[skewness < -1])
        self.skew_neg_max = X_train[self.skew_neg_data].max()

        # Share of the second distinct label; assumes at least two classes.
        p_second = np.mean(y_train == y_train.unique()[1])

        X_new = pd.DataFrame([{
            'n_features': X_train.shape[1],
            "n_samples": X_train.shape[0],
            "class_imbalance": min(1 - p_second, p_second),
            'n_skew_pos': len(self.skew_pos_data),
            'n_skew_neg': len(self.skew_neg_data)
        }])

        # One probability per configured model: P(model belongs to the top list).
        knn_step = knn.named_steps["knn"]
        probs = []
        for i, output_proba in enumerate(knn.predict_proba(X_new)):
            classes = knn_step.classes_[i]
            if len(classes) == 2:
                prob = output_proba[0, 1]
            else:
                # Only one label was seen for this output in the meta-dataset.
                prob = 1.0 if classes[0] == 1 else 0.0
            probs.append(prob)

        probs = np.array(probs)
        top5_idx = np.argsort(probs)[-5:][::-1]
        top5_models_new = [model_names[i] for i in top5_idx]
        print(top5_models_new)

        # Log-transform the significantly skewed features.
        X_train_new = self._transform_skewed(X_train)

        score = []
        for name, model, class_name in self.models:
            # With many variables logistic regression takes too long to evaluate.
            too_slow = X_train.shape[1] > 20 and class_name == "LogisticRegression"
            if too_slow or name not in top5_models_new:
                score.append(0)
                continue
            scr = cross_val_score(model, X_train_new, y_train, cv=3, scoring='balanced_accuracy')
            # Stability matters too: theta controls how strongly the spread of
            # the fold scores lowers the final score.
            model_score = weighted_mean(scr, self.theta)
            score.append(model_score)
            print(f"{name}: score = {model_score:.4f}")
        score_np = np.array(score)
        idx = np.argsort(score_np)[-3:][::-1]
        est = [(self.models[i][0], self.models[i][1]) for i in idx]

        # --------------------------- VOTING -----------------------------------
        Voting = Pipeline([
            ('voting', VotingClassifier(estimators=est))
        ])
        self.models.append(('voting', Voting, 'VotingClassifier'))
        scr = cross_val_score(Voting, X_train_new, y_train, cv=3, scoring='balanced_accuracy')
        score.append(weighted_mean(scr, self.theta))
        print(f"Voting: score = {weighted_mean(scr, self.theta):.4f}")

        # --------------------------- STACKING ---------------------------------
        Stacking = Pipeline([
            ('stacking', StackingClassifier(estimators=est))
        ])
        self.models.append(('stacking', Stacking, 'StackingClassifier'))
        scr = cross_val_score(Stacking, X_train_new, y_train, cv=3, scoring='balanced_accuracy')
        score.append(weighted_mean(scr, self.theta))
        print(f"Stacking: score = {weighted_mean(scr, self.theta):.4f}")

        # NOTE(review): every model is fitted here, including ones skipped
        # above as too slow; only models_best is needed for prediction.
        for name, model, _ in self.models:
            model.fit(X_train_new, y_train)
        name, self.models_best, _ = self.models[np.argmax(score)]

    def predict(self, X_test):
        """Predict labels for *X_test* with the model selected by ``fit``."""
        return self.models_best.predict(self._transform_skewed(X_test))

    def predict_proba(self, X_test):
        """Predict class probabilities for *X_test* with the selected model."""
        return self.models_best.predict_proba(self._transform_skewed(X_test))