In [1]:
import h2o
import pandas as pd
from Data import Data

In [None]:
# Data handling
import datetime
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from autogluon.tabular import TabularPredictor, TabularDataset
from autogluon.core.metrics import mean_absolute_error
import autogluon.core as ag

import numpy as np

# Data science
from sklearn.preprocessing import MinMaxScaler


class Data:
    """Solar-production data of one location plus AutoML training helpers.

    Construction reads the parquet files under ``{data_root}/{location}``,
    concatenates the observed and estimated feature tables, attaches the
    ``pv_measurement`` target, clips negative targets to zero, fixes the
    column dtypes and splits off a tuning window.

    Attributes:
        target: ``pv_measurement`` column of ``train_targets.parquet``.
        train: observed + estimated features, flagged by ``observed_or_estimated``.
        frame: ``train`` plus the ``pv_measurement`` label column.
        frame_without_tuning_data: rows of ``frame`` outside the tuning window.
        tune_data: rows of ``frame`` inside the tuning window.
        test: features of ``X_test_estimated.parquet``.
        location: location identifier passed to the constructor.
        data_root: directory holding the location folders and ``test.csv``.
        X_scaler, Y_scaler, frame_scaler: per-instance ``MinMaxScaler`` objects.
    """

    # Name of the label column attached to ``frame``.
    LABEL_COLUMN = "pv_measurement"

    target: pd.Series = None
    train: pd.DataFrame = None
    frame: pd.DataFrame = None
    frame_without_tuning_data: pd.DataFrame = None
    tune_data: pd.DataFrame = None
    test: pd.DataFrame = None
    location: str = None
    data_root: str = "../.."

    # The scalers are created in ``__init__``. Instantiating them here would
    # make every ``Data`` object share (and overwrite) the same fitted state.
    X_scaler: "MinMaxScaler" = None
    Y_scaler: "MinMaxScaler" = None
    frame_scaler: "MinMaxScaler" = None

    def __init__(self, location: str, data_root: str = "../.."):
        """Load and prepare all tables of ``location``.

        Args:
            location: name of the location folder (the notebooks use "A", "B", "C").
            data_root: directory containing the location folders and ``test.csv``.
        """
        self.location = location
        self.data_root = data_root

        # One independent scaler set per location.
        self.X_scaler = MinMaxScaler()
        self.Y_scaler = MinMaxScaler()
        self.frame_scaler = MinMaxScaler()

        location_dir = f"{data_root}/{location}"
        self.target = pd.read_parquet(f"{location_dir}/train_targets.parquet")[self.LABEL_COLUMN]
        self.test = pd.read_parquet(f"{location_dir}/X_test_estimated.parquet")
        self.test["observed_or_estimated"] = 1
        observed = pd.read_parquet(f"{location_dir}/X_train_observed.parquet")
        observed["observed_or_estimated"] = 0
        estimated = pd.read_parquet(f"{location_dir}/X_train_estimated.parquet")
        estimated["observed_or_estimated"] = 1

        # ignore_index=True already yields a fresh 0..n-1 index, so no helper
        # "index" column has to be created and dropped again afterwards.
        self.train = pd.concat([observed, estimated], ignore_index=True)

        self.frame = self.train.copy()
        # NOTE(review): this pairs feature row i with target row i (alignment on
        # the integer index, not on timestamps) - confirm that both files share
        # row order and sampling rate.
        self.frame[self.LABEL_COLUMN] = self.target

        self.set_neg_to_zero()
        self.set_dtypes()
        self.exstract_tuning_data()
        # Scaling is currently disabled; call fit_scalers() / transform_frame()
        # explicitly if scaled data is needed.

    @staticmethod
    def _drop_unlabelled(frame: pd.DataFrame, label: str = "pv_measurement") -> pd.DataFrame:
        """Return the rows of ``frame`` whose ``label`` value is not missing."""
        return frame.loc[frame[label].notna()]

    def exstract_tuning_data(self, tuning_start_date="2021-05-01", tuning_end_date="2021-08-31"):
        """Split ``frame`` into ``tune_data`` and ``frame_without_tuning_data``.

        Rows with ``tuning_start_date <= date_forecast <= tuning_end_date`` go
        to ``tune_data``; all other rows go to ``frame_without_tuning_data``.

        NOTE(review): a date-only end bound compares as midnight, so rows later
        on the end date fall outside the tuning window - confirm this is intended.
        """
        dates = self.frame["date_forecast"]
        in_tuning_window = (dates >= tuning_start_date) & (dates <= tuning_end_date)

        # Explicit copies so later column assignments do not warn or alias.
        self.tune_data = self.frame.loc[in_tuning_window].copy()
        self.frame_without_tuning_data = self.frame.loc[~in_tuning_window].copy()

    def set_dtypes(self):
        """Cast "idx" columns to category and the date columns to datetime.

        Applies to both ``frame`` and ``test``; the categorical columns are
        those of ``frame`` whose name contains "idx".
        """
        categorical_columns = [c for c in self.frame.columns if "idx" in c]
        self.frame[categorical_columns] = self.frame[categorical_columns].astype("category")
        self.test[categorical_columns] = self.test[categorical_columns].astype("category")
        for column in ("date_forecast", "date_calc"):
            self.frame[column] = pd.to_datetime(self.frame[column])
            self.test[column] = pd.to_datetime(self.test[column])

    def transform_frame(self):
        """Replace ``frame`` by the output of ``frame_scaler.fit_transform``."""
        self.frame = self.frame_scaler.fit_transform(self.frame.copy())

    def fit_transform_test(self):
        """Return ``test`` transformed by ``X_scaler``.

        Despite the name nothing is fitted here; ``fit_scalers`` must have been
        called beforehand.
        """
        return self.X_scaler.transform(self.test)

    def fit_scalers(self):
        """Fit ``Y_scaler`` on ``target`` and ``X_scaler`` on ``train``.

        NOTE(review): ``train`` still holds the date columns at this point -
        confirm the scaler accepts them before re-enabling scaling.
        """
        self.Y_scaler.fit(self.target.to_numpy().reshape(-1, 1))
        self.X_scaler.fit(self.train)

    def drop_consequtives(self, consecutive_threshold=24):
        """Drop rows belonging to long runs of repeated ``pv_measurement`` values.

        A new run starts wherever a value differs from the value two rows
        earlier; rows of runs with more than ``consecutive_threshold`` non-null
        values are removed from ``frame``.

        NOTE(review): the comparison uses ``shift(2)`` rather than ``shift(1)``;
        kept as is - confirm that the two-row lag is intended.
        """
        values = self.frame[self.LABEL_COLUMN]
        run_id = (values != values.shift(2)).cumsum()
        # Counting on the single column avoids aggregating every feature column.
        run_length = values.groupby(run_id).transform("count")
        self.frame = self.frame.loc[run_length <= consecutive_threshold].copy()

    def set_neg_to_zero(self):
        """Clip negative ``pv_measurement`` values in ``frame`` to zero.

        Missing values stay missing. The previous ``max(0, x)`` form returned 0
        for NaN, which turned unlabelled rows into zero-production samples.
        """
        self.frame[self.LABEL_COLUMN] = self.frame[self.LABEL_COLUMN].clip(lower=0)

    def drop_index(self):
        """Remove a helper "index" column from ``train`` and ``frame`` if present.

        Safe to call repeatedly: a missing column is ignored.
        """
        self.train = self.train.drop(columns=["index"], errors="ignore")
        self.frame = self.frame.drop(columns=["index"], errors="ignore")

    def _load_submission_skeleton(self) -> pd.DataFrame:
        """Return the ``date_forecast`` rows that ``test.csv`` lists for this location."""
        skeleton = pd.read_csv(f"{self.data_root}/test.csv")
        return (
            skeleton.loc[skeleton["location"] == self.location]
            .drop(columns=["id", "location", "prediction"])
            .rename(columns={"time": "date_forecast"})
            .assign(date_forecast=lambda df: pd.to_datetime(df["date_forecast"]))
        )

    def fit_predictions_to_submission_length(self, predictions: np.ndarray):
        """Select the predictions whose timestamp appears in the submission file.

        Args:
            predictions: one prediction per row of ``test``, in row order.

        Returns:
            Series of predictions in the row order of this location's entries
            in ``test.csv``; timestamps without a match in ``test`` yield NaN.
        """
        submission_skeleton = self._load_submission_skeleton()

        test = self.test.copy()
        test["date_forecast"] = pd.to_datetime(test["date_forecast"])
        test["predictions"] = predictions

        test_shortened = pd.merge(submission_skeleton, test, on="date_forecast", how="left")
        return test_shortened["predictions"]

    def reduce_test_to_submission_length(self):
        """Restrict ``test`` to the timestamps listed for this location in ``test.csv``."""
        submission_skeleton = self._load_submission_skeleton()

        test = self.test.copy()
        test["date_forecast"] = pd.to_datetime(test["date_forecast"])

        self.test = pd.merge(submission_skeleton, test, on="date_forecast", how="left")

    def predict_location_H2O(self):
        """Train an H2O AutoML run (up to 20 models) on ``frame`` and return it.

        The label is ``pv_measurement`` when ``frame`` is a named DataFrame.
        When ``frame`` is an unnamed array, H2O names the columns ``C1..Cn`` and
        the historical label position ``C47`` is used instead.

        Raises:
            ValueError: if neither label column exists in the H2O frame.
        """
        source = self.frame
        if isinstance(source, pd.DataFrame) and self.LABEL_COLUMN in source.columns:
            # Rows without a label carry no training signal.
            source = self._drop_unlabelled(source, self.LABEL_COLUMN)
        frame = h2o.H2OFrame(source)

        if self.LABEL_COLUMN in frame.columns:
            y = self.LABEL_COLUMN
        elif "C47" in frame.columns:
            y = "C47"
        else:
            raise ValueError(
                f"No label column ('{self.LABEL_COLUMN}' or 'C47') found in the H2O frame"
            )
        x = [column for column in frame.columns if column != y]

        aml = H2OAutoML(
            max_models=20,
            project_name="regression_" + str(self.location),
        )

        aml.train(x=x, y=y, training_frame=frame)
        return aml

    def predict_location_MLJAR(self):
        """Fit an MLJAR AutoML regressor on ``frame`` and predict ``test``.

        Works for both representations of ``frame``: a named DataFrame (label
        ``pv_measurement``) and a scaled array whose columns are renamed to
        ``C_0..C_n`` with the label in ``C_46``.

        Returns:
            The predictions returned by ``AutoML.predict`` for ``test``.
        """
        if isinstance(self.frame, pd.DataFrame):
            # Unscaled data keeps its column names, so no renaming is needed.
            frame = self.frame
            label = self.LABEL_COLUMN
            test = self.test
        else:
            n_columns = len(self.frame[0])
            frame = pd.DataFrame(self.frame, columns=[f"C_{i}" for i in range(n_columns)])
            frame = frame.fillna(value=np.nan)
            label = "C_46"
            test = self.test.rename(
                columns={self.test.columns[i]: f"C_{i}" for i in range(n_columns - 1)}
            )

        frame = self._drop_unlabelled(frame, label)
        y = frame[label]
        X = frame.drop(columns=[label])

        automl = AutoML(
            ml_task="regression",
            mode="Compete",
            algorithms=[
                "Random Forest",
                "Extra Trees",
                "LightGBM",
                "Xgboost",
                "CatBoost",
            ],
        )
        automl.fit(X=X, y=y)

        return automl.predict(test)

    def predict_location_GLUON(self, num_bag_folds=7, num_bag_sets=2, num_stack_levels=1):
        """Fit an AutoGluon ``TabularPredictor`` and return it.

        Trains on ``frame_without_tuning_data`` with ``tune_data`` as tuning
        data, the ``best_quality`` preset and a three-hour time limit. Models
        are stored under ``autogluon_models_{location}``. Columns whose name
        contains "index" are removed, also from ``self.test``.

        NOTE(review): ``num_bag_folds``, ``num_bag_sets`` and
        ``num_stack_levels`` are accepted for interface compatibility but are
        not forwarded to ``fit`` - confirm whether they should be.
        """
        def without_index_columns(df: pd.DataFrame) -> pd.DataFrame:
            return df.drop(columns=[c for c in df.columns if "index" in c])

        # Unlabelled rows are removed explicitly because set_neg_to_zero keeps NaN.
        frame = self._drop_unlabelled(
            without_index_columns(self.frame_without_tuning_data), self.LABEL_COLUMN
        )
        tune = self._drop_unlabelled(without_index_columns(self.tune_data), self.LABEL_COLUMN)
        self.test = without_index_columns(self.test)

        train = TabularDataset(frame)
        tune = TabularDataset(tune)

        time_limit = 3 * 60 * 60
        path = f"autogluon_models_{self.location}"

        predictor = TabularPredictor(
            problem_type="regression",
            eval_metric=mean_absolute_error,
            label=self.LABEL_COLUMN,
            path=path,
        ).fit(
            train,
            tuning_data=tune,
            presets="best_quality",
            time_limit=time_limit,
            hyperparameters={
                'NN_TORCH': {},
                'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
                'CAT': {},
                'XGB': {},
                'FASTAI': {},
                'RF': [{'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}}],
                'XT': [{'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}}],
            },
            use_bag_holdout=True,
        )

        return predictor

if __name__ == "__main__":
    # Grid over AutoGluon bagging/stacking settings; writes one submission CSV
    # per grid point, concatenating the predictions of locations A, B and C.
    import itertools
    import os

    num_folds = [5, 7]
    num_sets = [2, 4, 7, 30]
    num_stacks = [0, 1, 2]

    # Create the output folder up front so a missing directory cannot discard
    # the result of a multi-hour training run.
    output_dir = "submissions/auto"
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): predict_location_GLUON accepts the three grid arguments but
    # does not forward them to fit(), so every grid point currently trains the
    # same configuration - confirm before launching the full grid.
    grid = list(itertools.product(num_folds, num_sets, num_stacks))
    for round_number, (num_fold, num_set, num_stack) in enumerate(grid, start=1):
        print(f""".... predicting\n\n\n
                      num folds: {num_fold}
                      num set: {num_set}
                      num stack: {num_stack}
                      
                      Round: {round_number} of {len(grid)}
                      
                      """)
        prediction = []
        for location in ["A", "B", "C"]:
            data = Data(location=location)
            predictor = data.predict_location_GLUON(
                num_bag_folds=num_fold,
                num_bag_sets=num_set,
                num_stack_levels=num_stack,
            )
            test = TabularDataset(data.test)
            y_pred = predictor.predict(test, as_pandas=False)
            y_pred = data.fit_predictions_to_submission_length(y_pred).to_numpy()
            prediction += list(y_pred)
        df = pd.DataFrame({"prediction": prediction}).rename_axis(index="id")
        df.to_csv(f"{output_dir}/auto_f{num_fold}_s{num_set}_s{num_stack}_submission.csv")

In [2]:
# File-name stem of the H2O submission; the trailing number is presumably a
# manual run counter - TODO confirm.
submission = f"auto_ml_submission_H2O_{2}"

In [3]:
# Connect to a running H2O instance or start a local server (see the output
# below); the recorded run reports 12 Gb of cluster memory and 10 allowed cores.
h2o.init(nthreads=-1, max_mem_size=12)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.8" 2023-07-18; OpenJDK Runtime Environment Temurin-17.0.8+7 (build 17.0.8+7); OpenJDK 64-Bit Server VM Temurin-17.0.8+7 (build 17.0.8+7, mixed mode)
  Starting server from /Users/miksx/.pyenv/versions/3.11.4/envs/TDT4173/lib/python3.11/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/y8/9glnt_6110xftbsjpdqd12280000gp/T/tmpvv7cmlnq
  JVM stdout: /var/folders/y8/9glnt_6110xftbsjpdqd12280000gp/T/tmpvv7cmlnq/h2o_miksx_started_from_python.out
  JVM stderr: /var/folders/y8/9glnt_6110xftbsjpdqd12280000gp/T/tmpvv7cmlnq/h2o_miksx_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_miksx_qtzr3j
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,12 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


In [4]:
# Load the data of the three locations in order A, B, C.
data_a, data_b, data_c = (Data(name) for name in ("A", "B", "C"))

### Prediction A

In [5]:
# Run H2O AutoML for location A; returns the trained H2OAutoML object.
aml_a = data_a.predict_location_H2O()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:40:30.35: AutoML: XGBoost is not available; skipping it.
18:40:30.56: _train param, Dropping bad and constant columns: [C33, C14]

████████████
18:41:54.404: XRT_1_AutoML_1_20231022_184030 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

███
18:42:00.784: _train param, Dropping bad and constant columns: [C33, C14]

████████████████████████████████████████████████| (done) 100%


In [6]:
# Score location A's scaled test features with the AutoML leader model.
# NOTE(review): fit_transform_test() only calls X_scaler.transform, so the
# scaler must already be fitted; the Data.__init__ shown in this file leaves
# fit_scalers() commented out - confirm the imported Data module fits it.
y_pred_aml_a = aml_a.leader.predict(h2o.H2OFrame(data_a.fit_transform_test()))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [7]:
# Convert the H2O prediction frame to a single-column array and map it back
# through location A's Y_scaler (requires a fitted Y_scaler).
y_pred_frame_a = y_pred_aml_a.as_data_frame().to_numpy().reshape(-1, 1)
inversed_a = data_a.Y_scaler.inverse_transform(y_pred_frame_a).flatten()

In [8]:
# Sanity check: number of predictions for location A (720 in the recorded run).
len(inversed_a)

720

### Prediction B

In [9]:
# Run H2O AutoML for location B; returns the trained H2OAutoML object.
aml_b = data_b.predict_location_H2O()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
19:07:11.8: AutoML: XGBoost is not available; skipping it.
19:07:11.10: _train param, Dropping bad and constant columns: [C46, C14]

████████████
19:08:36.212: XRT_1_AutoML_2_20231022_190711 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

███
19:08:45.234: _train param, Dropping bad and constant columns: [C46, C14]

████████████████████████████████████████████████| (done) 100%


In [10]:
# Score location B's scaled test features with the AutoML leader model
# (requires data_b.X_scaler to be fitted).
y_pred_aml_b = aml_b.leader.predict(h2o.H2OFrame(data_b.fit_transform_test()))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [11]:
# Convert the H2O prediction frame to a single-column array and map it back
# through location B's Y_scaler (requires a fitted Y_scaler).
y_pred_frame_b = y_pred_aml_b.as_data_frame().to_numpy().reshape(-1, 1)
inversed_b = data_b.Y_scaler.inverse_transform(y_pred_frame_b).flatten()

In [12]:
# Sanity check: number of predictions for location B (720 in the recorded run).
len(inversed_b)

720

### Prediction C

In [13]:
# Run H2O AutoML for location C; returns the trained H2OAutoML object.
aml_c = data_c.predict_location_H2O()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
19:32:13.359: AutoML: XGBoost is not available; skipping it.
19:32:13.361: _train param, Dropping bad and constant columns: [C46, C33, C14]

█████████████
19:33:13.415: XRT_1_AutoML_3_20231022_193213 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

██
19:33:18.666: _train param, Dropping bad and constant columns: [C46, C33, C14]

████████████████████████████████████████████████| (done) 100%


In [14]:
# Score location C's scaled test features with the AutoML leader model
# (requires data_c.X_scaler to be fitted).
y_pred_aml_c = aml_c.leader.predict(h2o.H2OFrame(data_c.fit_transform_test()))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [15]:
# Convert the H2O prediction frame to a single-column array and map it back
# through location C's Y_scaler (requires a fitted Y_scaler).
y_pred_frame_c = y_pred_aml_c.as_data_frame().to_numpy().reshape(-1, 1)
inversed_c = data_c.Y_scaler.inverse_transform(y_pred_frame_c).flatten()

In [16]:
# Sanity check: number of predictions for location C (720 in the recorded run).
len(inversed_c)

720

## Creating prediction

In [17]:
# Concatenate the per-location predictions in submission order A, B, C.
prediction = [*inversed_a, *inversed_b, *inversed_c]

In [18]:
# Write the submission file: one "prediction" column, row index exported as "id".
df = pd.DataFrame({"prediction": prediction})
df.index.name = "id"
df.to_csv(f"submissions/{submission}_submission.csv")

In [19]:
# Shut down the H2O cluster this notebook started with h2o.init().
h2o.cluster().shutdown()

H2O session _sid_817c closed.


In [1]:
import pandas as pd
from Data import Data

In [2]:
# File-name stem of the MLJAR submission; the trailing number is presumably a
# manual run counter - TODO confirm.
submission = f"auto_ml_submission_MLJAR_{2}"

In [3]:
# Load the data of the three locations in order A, B, C.
data_a, data_b, data_c = (Data(name) for name in ("A", "B", "C"))

In [4]:
# Fit MLJAR AutoML on location A and predict its test set; the recorded run
# below took about an hour (fit time 3610.96 seconds).
y_pred_a = data_a.predict_location_MLJAR()

AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 0.100282 trained in 0.35 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree rmse 0.100808 trained in 2.67 seconds
2_DecisionTree rmse 0.096431 trained in 3.06 seconds
3_DecisionTree rmse 0.096431 trained in 3.08 seconds
* Step default_algorithms will try to check up to 5 models



5_Default_Xgboost rmse 0.072548 trained in 25.16 seconds
6_Default_CatBoost rmse 0.072161 trained in 31.09 seconds
7_Default_RandomForest rmse 0.093978 trained in 23.23 seconds
8_Default_ExtraTrees rmse 0.095084 trained in 11.32 seconds
* Step not_so_random will try to check up to 45 models
18_LightGBM rmse 0.072202 trained in 50.71 seconds




9_Xgboost rmse 0.071328 trained in 41.83 seconds
27_CatBoost rmse 0.068798 trained in 140.74 seconds
36_RandomForest rmse 0.093936 trained in 20.11 seconds
45_ExtraTrees rmse 0.096371 trained in 6.27 seconds
19_LightGBM rmse 0.071577 trained in 49.93 seconds




10_Xgboost rmse 0.071764 trained in 35.49 seconds
28_CatBoost rmse 0.075125 trained in 83.43 seconds
37_RandomForest rmse 0.096768 trained in 16.22 seconds
46_ExtraTrees rmse 0.098662 trained in 6.88 seconds
20_LightGBM rmse 0.068725 trained in 140.5 seconds




11_Xgboost rmse 0.072314 trained in 26.44 seconds
29_CatBoost rmse 0.07003 trained in 55.53 seconds
38_RandomForest rmse 0.089148 trained in 26.94 seconds
47_ExtraTrees rmse 0.088883 trained in 11.17 seconds
21_LightGBM rmse 0.068629 trained in 256.22 seconds




12_Xgboost rmse 0.075828 trained in 12.24 seconds
30_CatBoost rmse 0.077184 trained in 43.03 seconds
39_RandomForest rmse 0.089113 trained in 30.59 seconds
48_ExtraTrees rmse 0.090905 trained in 14.31 seconds
22_LightGBM rmse 0.071744 trained in 56.88 seconds




13_Xgboost rmse 0.073875 trained in 20.75 seconds
31_CatBoost rmse 0.078759 trained in 61.33 seconds
40_RandomForest rmse 0.086912 trained in 31.95 seconds
49_ExtraTrees rmse 0.087552 trained in 9.11 seconds
23_LightGBM rmse 0.067825 trained in 391.25 seconds
Skip golden_features because of the time limit.
* Step kmeans_features will try to check up to 3 models




4_Default_LightGBM_KMeansFeatures rmse 0.070467 trained in 203.57 seconds
Not enough time to perform features selection. Skip
Time needed for features selection ~ 642.0 seconds
Please increase total_time_limit to at least (6484 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 29 models
50_LightGBM rmse 0.067958 trained in 370.24 seconds
51_LightGBM rmse 0.068242 trained in 437.06 seconds
Skip hill_climbing_2 because of the time limit.
* Step boost_on_errors will try to check up to 1 model
4_Default_LightGBM_BoostOnErrors rmse 0.068512 trained in 158.93 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.067004 trained in 0.99 seconds
* Step stack will try to check up to 33 models
23_LightGBM_Stacked rmse 0.067159 trained in 30.55 seconds
27_CatBoost_Stacked rmse 0.06633 trained in 27.49 seconds




9_Xgboost_Stacked rmse 0.067021 trained in 9.18 seconds
40_RandomForest_Stacked rmse 0.066214 trained in 86.75 seconds
49_ExtraTrees_Stacked rmse 0.066326 trained in 19.03 seconds
50_LightGBM_Stacked rmse 0.067034 trained in 31.41 seconds
29_CatBoost_Stacked rmse 0.06641 trained in 10.27 seconds




10_Xgboost_Stacked rmse 0.067269 trained in 10.54 seconds
39_RandomForest_Stacked rmse 0.066271 trained in 74.16 seconds
47_ExtraTrees_Stacked rmse 0.066475 trained in 18.46 seconds
51_LightGBM_Stacked rmse 0.06702 trained in 27.78 seconds
6_Default_CatBoost_Stacked rmse 0.066483 trained in 7.86 seconds




11_Xgboost_Stacked rmse 0.066507 trained in 6.7 seconds
38_RandomForest_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 12.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
48_ExtraTrees_Stacked rmse 0.066554 trained in 16.24 seconds
4_Default_LightGBM_Stacked rmse 0.066802 trained in 21.32 seconds
28_CatBoost_Stacked rmse 0.068156 trained in 26.61 seconds




5_Default_Xgboost_Stacked rmse 0.066545 trained in 7.53 seconds
36_RandomForest_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 2.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
8_Default_ExtraTrees_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 3.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked rmse 0.065958 trained in 2.03 seconds
AutoML fit time: 3610.96 seconds
AutoML best model: Ensemble_Stacked




In [5]:
# Reshape to a single-column 2-D array for Y_scaler.inverse_transform in the next cell.
reshaped_pred_a = y_pred_a.reshape(-1, 1)

In [6]:
# Map location A's predictions back through its Y_scaler.
# NOTE(review): requires a fitted Y_scaler; the Data.__init__ shown in this
# file leaves fit_scalers() commented out - confirm the imported module fits it.
inversed_a = data_a.Y_scaler.inverse_transform(reshaped_pred_a).flatten()

## Prediction B

In [7]:
# Fit MLJAR AutoML on location B and predict its test set; in the recorded run
# most optional steps were skipped because of the time limit.
y_pred_b = data_b.predict_location_MLJAR()

AutoML directory: AutoML_2
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 0.137638 trained in 0.59 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree rmse 0.136849 trained in 2.0 seconds
2_DecisionTree rmse 0.129692 trained in 2.2 seconds
3_DecisionTree rmse 0.129692 trained in 2.22 seconds
* Step default_algorithms will try to check up to 5 models
4



5_Default_Xgboost rmse 0.086185 trained in 57.71 seconds
6_Default_CatBoost rmse 0.085411 trained in 85.14 seconds
7_Default_RandomForest rmse 0.122774 trained in 19.24 seconds
8_Default_ExtraTrees rmse 0.13041 trained in 8.83 seconds
* Step not_so_random will try to check up to 45 models
18_LightGBM rmse 0.086345 trained in 123.43 seconds




9_Xgboost rmse 0.084458 trained in 95.08 seconds
27_CatBoost rmse 0.080979 trained in 2752.93 seconds
Skip golden_features because of the time limit.
Skip kmeans_features because of the time limit.
Not enough time to perform features selection. Skip
Time needed for features selection ~ 813.0 seconds
Please increase total_time_limit to at least (8194 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
Skip hill_climbing_1 because of the time limit.
Skip hill_climbing_2 because of the time limit.
Skip boost_on_errors because of the time limit.
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.080149 trained in 0.15 seconds
* Step stack will try to check up to 8 models
27_CatBoost_Stacked not trained. Force to stop the training. Total time for AutoML training already exceeded.
Skip ensemble_stacked because no parameters were generated.
AutoML fit time: 4390.08 secon



In [8]:
# Reshape to a single-column 2-D array for Y_scaler.inverse_transform in the next cell.
reshaped_pred_b = y_pred_b.reshape(-1, 1)

In [9]:
# Map location B's predictions back through its Y_scaler (requires a fitted Y_scaler).
inversed_b = data_b.Y_scaler.inverse_transform(reshaped_pred_b).flatten()

## Prediction C

In [10]:
# Fit MLJAR AutoML on location C and predict its test set.
# NOTE(review): in the recorded run a single decision tree took 1064 seconds
# and the later steps were skipped for lack of time - check the result quality.
y_pred_c = data_c.predict_location_MLJAR()

AutoML directory: AutoML_3
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 0.109898 trained in 0.32 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree rmse 0.112639 trained in 1064.37 seconds
2_DecisionTree not trained. Force to stop the training. Total time for AutoML training already exceeded.
Skip default_algorithms because of the time limit.
Skip 



In [11]:
# Reshape to a single-column 2-D array for Y_scaler.inverse_transform in the next cell.
reshaped_pred_c = y_pred_c.reshape(-1, 1)

In [12]:
# Map location C's predictions back through its Y_scaler (requires a fitted Y_scaler).
inversed_c = data_c.Y_scaler.inverse_transform(reshaped_pred_c).flatten()

## Submission

In [13]:
# Concatenate the per-location predictions in submission order A, B, C.
prediction = [*inversed_a, *inversed_b, *inversed_c]

In [14]:
# Write the submission file: one "prediction" column, row index exported as "id".
df = pd.DataFrame({"prediction": prediction})
df.index.name = "id"
df.to_csv(f"submissions/{submission}_submission.csv")