In [1]:
%load_ext autoreload

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [None]:
import numpy as np
import pandas as pd
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.metrics import mean_absolute_error
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)


PATH = "../../../"
# Estimate
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Test estimates
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observations
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# make a options class with the options as attributes


class Options:
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(self, randomize=False, consecutive_threshold=6, normalize=False, group_by_hour=True, unzip_date_feature=True) -> None:
        self.randomize = randomize
        self.consecutive_threshold = consecutive_threshold
        self.normalize = normalize
        self.group_by_hour = group_by_hour
        self.unzip_date_feature = unzip_date_feature


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]
    
    def get_all_data(self, location: str):
        train, targets = self.get_training_data_by_location("A")
        test = self.get_test_data_by_location("A")
        train = self.drop_features(train)
        test = test[train.columns.to_list()]
        train = self.handle_data(train, targets)
        test = self.handle_data(test, test=True)
        return train, test

    def get_data(self, location: str) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str, columns: list) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        test_data = test_data[columns]
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame(), test=False):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        # df = self.add_time_since_calucation(df)

        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        df = self.grouped_by_hour(df)
        df = self.add_lag_features(df)

        df["time"] = df["date_forecast"]
        # df.drop(["date_forecast"], axis=1, inplace=True)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        # df.drop(["date_calc"], axis=1, inplace=True)
        df.drop(["time"], axis=1, inplace=True)
        # df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper funciton ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x
    
    def add_lag_features(self, df: pd.DataFrame):
        no_cat_features_1h = [c for c in df.columns if "_1h:" in c]
        lag_cols = df[no_cat_features_1h].select_dtypes(include=["number", "float", "int"]).columns.to_list()
        lag_f = LagFeatures(variables=lag_cols, periods=1)
        df_tr = lag_f.fit_transform(df[lag_cols].select_dtypes(include=["number", "float", "int"]))
        df[df_tr.columns] = df_tr
        return df

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        
        df["day_of_year"] = df[date_column].dt.day_of_year
        df['time_of_day'] = df[date_column].dt.hour + df['date_forecast'].dt.minute / 60
        
        df['time_sin'] = np.sin(2 * np.pi * df['time_of_day'] / 24)
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        
        # df["month"] = df["date_forecast"].dt.month
        df.drop(columns=["day_of_year", "time_of_day"], inplace=True)
        return df

    def add_time_since_calucation(self, df):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0)
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        df[df.columns] = df[df.columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12):
        df = df.copy()
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        mask = (df[column_to_check] != df[column_to_check].shift(2)).cumsum()

        df['consecutive_count'] = df.groupby(
            mask).transform('count')[column_to_check]

        mask = (df['consecutive_count'] > consecutive_threshold)
        mask_zero = (df['consecutive_count'] > consecutive_threshold_for_zero) & (
            df[column_to_check] == 0)
        df.drop(columns=["consecutive_count"], inplace=True)

        df = df.loc[~mask]
        df = df.loc[~mask_zero]
        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        best_submission: pd.DataFrame = pd.read_csv(
            PATH+"mikael/submissions/best_prediction.csv")
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise Exception("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])

    def split_train_tune(self, df: pd.DataFrame):
        df = df.copy()
        df_estimated = df.loc[df["estimated"] == 1]
        df_observed = df.loc[df["estimated"] == 0]

        num_rows = len(df_estimated)
        middle_index = num_rows // 2

        df_estimated.sample(frac=1, random_state=42)
        train_estimated = df.iloc[:middle_index]
        tune = df.iloc[middle_index:]

        train = pd.concat([df_observed, train_estimated])
        return train, tune
    
    def drop_features(self, df: pd.DataFrame):
        df.drop(
        columns=[
            "wind_speed_w_1000hPa:ms",
            "wind_speed_u_10m:ms",
            "wind_speed_v_10m:ms",
        ], inplace=True)
        # correlated = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.8)
        # df = correlated.fit_transform(df)
        return df

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df
    
    def find_min_max_date_in_test(self) -> list:
        locations = ["A", "B", "C"]
        dates = []
        for loc in locations:
            df = self.get_test_data_by_location(loc)
            df["date_forecast"] = pd.to_datetime(df["date_forecast"])
            dates.append((df["date_forecast"].min(),
                         df["date_forecast"].max()))
        return dates

    def split_train_summer_2021(self, df: pd.DataFrame):
        dates = self.find_min_max_date_in_test()
        # set the dates to the summer of 2021
        dates = [(date[0].replace(year=2021), date[1].replace(year=2021))
                 for date in dates]

        summer2021 = df[(df["date_forecast"] >= dates[0][0]) & (
            df["date_forecast"] <= dates[0][1])]

        train = df[~df.index.isin(summer2021.index)]
        return train, summer2021

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

important_features = [
    'time',
    'direct_rad:W',
    'diffuse_rad:W',
    'sun_azimuth:d',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'total_cloud_cover:p',
    'effective_cloud_cover:p',
    'rain_water:kgm2',
    'precip_5min:mm',
    'wind_speed_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'msl_pressure:hPa',
    'dew_point_2m:K',
    'is_day:idx',
    'is_in_shadow:idx',
    'elevation:m',

    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    'A',
    'B',
    'C',
    "estimated",
    "observed",
]


In [3]:
%autoreload
from pipeline_145_preset import Pipeline
pipin = Pipeline()

Current working directory: /Users/miksx/GitHub/Forest-Gump/mikael/autoML/gluon


In [4]:
DEFAULT_PATH="ag_145_lag_sine_drop_"

In [5]:
df1_0, test1  = pipin.get_all_data("A")
df2_0, test2 = pipin.get_all_data("B")
df3_0, test3 = pipin.get_all_data("C")

In [6]:
# remove_consecutive_measurments
# 6/24
df1_0 = pipin.remove_consecutive_measurments(df1_0, 4, 24)
df2_0 = pipin.remove_consecutive_measurments(df2_0, 4, 24)
df3_0 = pipin.remove_consecutive_measurments(df3_0, 4, 24)

In [7]:
# tuning
# split_train_summer_2021 test this
train1, tune1 = pipin.split_train_summer_2021(df1_0 )
train2, tune2 = pipin.split_train_summer_2021(df2_0 )
train3, tune3 = pipin.split_train_summer_2021(df3_0 )

In [8]:
train1.describe()

Unnamed: 0,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,...,wind_speed_10m:ms,estimated,observed,time_sin,day_sin,clear_sky_energy_1h:J_lag_1,diffuse_rad_1h:J_lag_1,direct_rad_1h:J_lag_1,fresh_snow_1h:cm_lag_1,pv_measurement
count,18843,18843.0,18843.0,15222.0,18843.0,18843.0,17463.0,18843.0,18843.0,18843.0,...,18843.0,18843.0,18843.0,18843.0,18843.0,18842.0,18842.0,18842.0,18842.0,18851.0
mean,2021-04-20 12:32:06.763254272,7.074725,1.242394,3059.118652,899414.1,249.822983,1751.986084,0.00804,277.785797,68.9972,...,3.065145,0.09956,0.90044,0.033309,-0.004713,898332.8,247566.3,301614.6,0.002777,1023.182602
min,2019-06-02 22:00:00,0.7,1.145,27.849998,0.0,0.0,27.950001,-1.0,251.074997,0.0,...,0.025,0.0,0.0,-0.992522,-0.999991,0.0,0.0,0.0,0.0,0.0
25%,2020-05-08 13:30:00,4.7,1.21825,1168.59375,78975.28,21.125,591.125,0.0,272.950012,10.799999,...,1.65,0.0,1.0,-0.632696,-0.699458,76642.17,42180.96,1700.963,0.0,45.98
50%,2021-04-05 09:00:00,7.0,1.24,2026.737427,562687.9,161.649994,1133.449951,0.0,278.850006,53.525002,...,2.775,0.0,1.0,0.16246,-0.060213,563008.6,191932.9,59650.61,0.0,420.2
75%,2022-05-23 21:30:00,9.05,1.26575,4279.862305,1599738.0,440.337524,2149.912598,0.0,282.799988,105.599998,...,4.2,0.0,1.0,0.770942,0.729558,1600173.0,380597.6,406527.1,0.0,1519.87
max,2023-04-30 23:00:00,17.35,1.42625,12042.525391,2988628.0,835.099976,11673.625,1.0,293.625,332.274994,...,11.2,1.0,1.0,0.992522,0.999991,2988628.0,1144410.0,2441750.0,3.125,5651.8
std,,2.909262,0.035103,2621.933838,908520.1,253.266098,1832.834717,0.09576,6.747072,66.715828,...,1.776226,0.29942,0.29942,0.704019,0.710937,909765.3,236800.9,472459.9,0.044683,1318.488784


In [9]:
train1 = TabularDataset(df1_0)
train2 = TabularDataset(df2_0)
train3 = TabularDataset(df3_0)

In [10]:
tuning1 = TabularDataset(tune1)
tuning2 = TabularDataset(tune2)
tuning3 = TabularDataset(tune3)

In [11]:
predictor1 = TabularPredictor(label="pv_measurement", eval_metric='mean_absolute_error',
                                                            path= DEFAULT_PATH+"A").fit(
    train1,
    time_limit=3600,
    # hyperparameters='extrme', 
    presets='best_quality',
    tuning_data = tuning1,
    use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "ag_145_lag_sine_drop_A"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   252.93 GB / 494.38 GB (51.2%)
Train Data Rows:    20301
Train Data Columns: 51
Tuning Data Rows:    1450
Tuning Data Columns: 51
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max,

In [12]:
predictor2 = TabularPredictor(label="pv_measurement", eval_metric='mean_absolute_error',
                                                            path= DEFAULT_PATH+"B").fit(
    train2,

    presets='best_quality', 
    # hyperparameters='very_large', 
    time_limit=3600,
    tuning_data = tuning2,
    use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

# tuning_data
# num bag holdout 6
# bag_holdout

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "ag_145_lag_sine_drop_B"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   247.08 GB / 494.38 GB (50.0%)
Train Data Rows:    20301
Train Data Columns: 51
Tuning Data Rows:    1450
Tuning Data Columns: 51
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max,

In [13]:
predictor3 = TabularPredictor(label="pv_measurement", eval_metric='mean_absolute_error',
                                                            path= DEFAULT_PATH+"C").fit(
    train3,

    presets='best_quality', 
    # hyperparameters='very_large', 
    time_limit=3600,
    tuning_data = tuning3,
    use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "ag_145_lag_sine_drop_C"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   239.24 GB / 494.38 GB (48.4%)
Train Data Rows:    20301
Train Data Columns: 51
Tuning Data Rows:    1450
Tuning Data Columns: 51
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max,

In [14]:
test_data1 = TabularDataset(test1)
test_data2 = TabularDataset(test2)
test_data3 = TabularDataset(test3)

pred1 = pd.DataFrame(predictor1.predict(test_data1))
pred2 = pd.DataFrame(predictor2.predict(test_data2))
pred3 = pd.DataFrame(predictor3.predict(test_data3))

negatives_pred1 = pred1[pred1["pv_measurement"] < 0]
negatives_pred2 = pred2[pred2["pv_measurement"] < 0]
negatives_pred3 = pred3[pred3["pv_measurement"] < 0]
neg = pd.concat([negatives_pred1, negatives_pred2, negatives_pred3])
neg.shape

(0, 1)

In [15]:
pred = pd.concat([pred1, pred2, pred3])
final_prediction = pipin.post_processing(pred, prediction_column="pv_measurement")
final_prediction.to_csv('gluon_3_same_as_145_tune_lag_sine_drop.csv')

In [16]:
diff = pipin.compare_mae(final_prediction)
diff

564.1427064806163

In [17]:
# predictor.evaluate(df1, silent=True)

<h3> prøver å gjenskape henning sin 143.</h3>
<p><p>

In [1]:
%load_ext autoreload

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
import numpy as np
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)

PATH = "../../../"
# Estimate
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Test estimates
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observations
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# make a options class with the options as attributes


class Options:
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(self, randomize=False, consecutive_threshold=6, normalize=False, group_by_hour=True, unzip_date_feature=True) -> None:
        self.randomize = randomize
        self.consecutive_threshold = consecutive_threshold
        self.normalize = normalize
        self.group_by_hour = group_by_hour
        self.unzip_date_feature = unzip_date_feature


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]

    def get_data(self, location: str, keeptime=False) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets, keeptime=keeptime)

    def get_test_data(self, location: str) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame(), keeptime=False):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        df = self.drop_columns(df)
        df = self.grouped_by_hour(df)

        df = self.unzip_date_feature(df)
        df = self.onehot_estimated(df)

        df["time"] = df["date_forecast"]
        df.drop(["date_forecast"], axis=1, inplace=True)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        df.drop(columns=["time"], axis=1, inplace=True)

        df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper funciton ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        df['day_of_year_sin'] = np.sin(2 * np.pi * df[date_column].dt.dayofyear / 365.25)
        df['hour'] = np.sin(2 * np.pi * df[date_column].dt.hour / 24)
        # df["month"] = df["date_forecast"].dt.month
        return df

    def add_time_since_calucation(self, df):  # denne er ikke så dum.
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0) / 60/30
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        df.drop(columns=["date_calc"], inplace=True)
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast") -> pd.DataFrame:
        # Group by hour and aggregate the values into lists for all columns
        df['date_hour'] = df[date_column].dt.to_period('H')
        df["min"] = df[date_column].dt.minute
        df.drop(columns=[date_column], inplace=True)

        # Use pivot_table to combine rows with the same date and hour
        pivot_df = df.pivot_table(index=['date_hour'], columns=[
                                  'min'], values=df.columns, aggfunc='first')
        # rename the date_hour to date_forecast
        # pivot_df.columns = [f'{col[0]}_{col[1]}' if col[1]
        #                     else col[0] for col in pivot_df.columns]

        pivot_df.index.names = [date_column]

        # Reset index to make 'date' a regular column
        pivot_df.columns = pivot_df.columns.to_flat_index()
        pivot_df.reset_index(inplace=True)

        pivot_df["date_forecast"] = pivot_df["date_forecast"].dt.to_timestamp()

        pivot_df["date_calc"] = pivot_df[('date_calc', 0)]
        pivot_df.drop(columns=[
            ('date_calc', c) for c in range(0, 60, 15)
        ], inplace=True)

        pivot_df.columns = [str(col) for col in pivot_df.columns]

        return pivot_df

    def grouped_by_hour_old(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        columns = list(df.columns)
        df[columns] = df[columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def lag_features_by_1_hour(df, columns_to_lag):
        lag_columns = [c for c in df.columns if "_1h:" in c]
        df[lag_columns] = df[lag_columns].shift(1)
        return df

    def remove_consecutive_measurments_new(self, df: pd.DataFrame, consecutive_threshold=3, consecutive_threshold_zero=12,  return_removed=False):
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'

        mask = (df[column_to_check] != df[column_to_check].shift(1)).cumsum()
        df['consecutive_group'] = df.groupby(
            mask).transform('count')[column_to_check]

        df["is_first_in_consecutive_group"] = False
        df['is_first_in_consecutive_group'] = df['consecutive_group'] != df['consecutive_group'].shift(
            1)

        # masks to remove rows
        mask_non_zero = (df['consecutive_group'] >= consecutive_threshold) & (
            df["pv_measurement"] > 0) & (df["is_first_in_consecutive_group"] == False)  # or df["direct_rad:W"] == 0)

        mask_zero = (df['consecutive_group'] >= consecutive_threshold_zero) & (
            df["pv_measurement"] == 0) & (df["is_first_in_consecutive_group"] == False)

        mask = mask_non_zero | mask_zero

        if return_removed:
            return df[mask]

        df = df.loc[~mask]

        df = df.drop(columns=["consecutive_group",
                     "is_first_in_consecutive_group"])

        return df.reset_index(drop=True)

    def remove_consecutive_measurments_new_new(self, df: pd.DataFrame, consecutive_threshold=3, consecutive_threshold_zero=12, consecutive_threshold_zero_no_rad=20, return_removed=False):
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'

        mask = (df[column_to_check] != df[column_to_check].shift(1)).cumsum()
        df['consecutive_group'] = df.groupby(
            mask).transform('count')[column_to_check]

        df["is_first_in_consecutive_group"] = False
        df['is_first_in_consecutive_group'] = df['consecutive_group'] != df['consecutive_group'].shift(
            1)

        # masks to remove rows
        mask_non_zero = (df['consecutive_group'] >= consecutive_threshold) & (
            df["pv_measurement"] > 0) & (df["is_first_in_consecutive_group"] == False)  # or df["direct_rad:W"] == 0)

        tol = 10
        mask_zero = (df['consecutive_group'] >= consecutive_threshold_zero) & (
            df["pv_measurement"] == 0) & (df["direct_rad:W"] > tol)

        mask_zero_no_rad = (df['consecutive_group'] >= consecutive_threshold_zero_no_rad) & (
            df["pv_measurement"] == 0) & (df["direct_rad:W"] < tol)
        mask = mask_non_zero | mask_zero | mask_zero_no_rad

        if return_removed:
            return df[mask]

        df = df.loc[~mask]

        df = df.drop(columns=["consecutive_group",
                     "is_first_in_consecutive_group"])

        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        best_submission: pd.DataFrame = pd.read_csv(
            PATH+"mikael/best_prediction.csv")
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise Exception("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])

    def drop_columns(self, df: pd.DataFrame):
        drop = [
            # wind speed vector u, available up to 20000 m, from 1000 hPa to 10 hPa and on flight levels FL10-FL900[m/s] does not make sens at surfece level
            "wind_speed_w_1000hPa:ms",
            "wind_speed_u_10m:ms",  # same as above
            "wind_speed_v_10m:ms",  # same as above
            "snow_density:kgm3",
            "snow_drift:idx",
            # "snow_melt_10min:mm",  # veldig få verdier
        ]
        shared_columns = list(set(df.columns) & set(drop))
        df = df.drop(columns=shared_columns)
        return df

    def find_min_max_date_in_test(self) -> list:
        locations = ["A", "B", "C"]
        dates = []
        for loc in locations:
            df = self.get_test_data_by_location(loc)
            df["date_forecast"] = pd.to_datetime(df["date_forecast"])
            dates.append((df["date_forecast"].min(),
                         df["date_forecast"].max()))
        return dates

    def split_train_summer_2021(self, df: pd.DataFrame):
        dates = self.find_min_max_date_in_test()
        # set the dates to the summer of 2021
        dates = [(date[0].replace(year=2021), date[1].replace(year=2021))
                 for date in dates]

        summer2021 = df[(df["date_forecast"] >= dates[0][0]) & (
            df["date_forecast"] <= dates[0][1])]

        train = df[~df.index.isin(summer2021.index)]
        return train, summer2021

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

important_features = [
    'time',
    'direct_rad:W',
    'diffuse_rad:W',
    'sun_azimuth:d',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'total_cloud_cover:p',
    'effective_cloud_cover:p',
    'rain_water:kgm2',
    'precip_5min:mm',
    'wind_speed_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'msl_pressure:hPa',
    'dew_point_2m:K',
    'is_day:idx',
    'is_in_shadow:idx',
    'elevation:m',

    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    'A',
    'B',
    'C',
    "estimated",
    "observed",
]


In [3]:
%autoreload
from pipeline import Pipeline
pipin = Pipeline()

Current working directory: /Users/miksx/GitHub/Forest-Gump/mikael/autoML/gluon


In [4]:
PATH = "ag_144_exstract_time_4_17"

In [5]:
df1_0 = pipin.get_data("A")
df2_0 = pipin.get_data("B")
df3_0 = pipin.get_data("C")

In [6]:
numeric_columns = df1_0.select_dtypes(include='number').columns

# Select columns with negative values, excluding datetime columns
negative_columns = numeric_columns[(df1_0[numeric_columns] < 0).any()]
negative_columns

Index([], dtype='object')

In [7]:
# remove_consecutive_measurments
# 6/24
# dette skal prøve å gjenskape henning sin
df1_0 = pipin.remove_consecutive_measurments_new(df1_0, 4, 17)
df2_0 = pipin.remove_consecutive_measurments_new(df2_0, 4, 17)
df3_0 = pipin.remove_consecutive_measurments_new(df3_0, 4, 17)

  df['consecutive_group'] = df.groupby(
  df["is_first_in_consecutive_group"] = False
  df['consecutive_group'] = df.groupby(
  df["is_first_in_consecutive_group"] = False
  df['consecutive_group'] = df.groupby(
  df["is_first_in_consecutive_group"] = False


In [8]:
print(df1_0.shape, df2_0.shape, df3_0.shape)

(28713, 165) (20132, 165) (17076, 165)


In [9]:
df1_0.describe()

Unnamed: 0,"('absolute_humidity_2m:gm3', 0)","('absolute_humidity_2m:gm3', 15)","('absolute_humidity_2m:gm3', 30)","('absolute_humidity_2m:gm3', 45)","('air_density_2m:kgm3', 0)","('air_density_2m:kgm3', 15)","('air_density_2m:kgm3', 30)","('air_density_2m:kgm3', 45)","('ceiling_height_agl:m', 0)","('ceiling_height_agl:m', 15)",...,"('visibility:m', 45)","('wind_speed_10m:ms', 0)","('wind_speed_10m:ms', 15)","('wind_speed_10m:ms', 30)","('wind_speed_10m:ms', 45)",day_of_year_sin,hour,estimated,observed,pv_measurement
count,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,21845.0,23279.0,...,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,28705.0,28713.0
mean,6.745494,6.74564,6.745375,6.745612,1.248199,1.248196,1.248203,1.248202,2961.565186,3057.664795,...,32259.802734,3.015646,3.015614,3.015711,3.015924,0.691487,0.618251,0.108692,0.891308,748.574534
std,2.793042,2.790889,2.789928,2.790788,0.035889,0.035853,0.035836,0.035848,2615.146729,2647.720703,...,17619.517578,1.770563,1.755222,1.749676,1.755036,0.293663,0.316334,0.311258,0.311258,1235.045878
min,0.7,0.7,0.7,0.7,1.145,1.145,1.145,1.145,27.799999,27.799999,...,265.299988,0.0,0.0,0.0,0.0,0.004301,0.0,0.0,0.0,0.0
25%,4.5,4.5,4.5,4.5,1.224,1.224,1.224,1.224,1096.300049,1145.550049,...,16159.599609,1.6,1.6,1.6,1.6,0.484089,0.258819,0.0,1.0,0.0
50%,6.5,6.5,6.5,6.5,1.246,1.246,1.246,1.246,1916.400024,1997.199951,...,35751.398438,2.7,2.7,2.7,2.7,0.790563,0.707107,0.0,1.0,83.82
75%,8.7,8.7,8.7,8.7,1.272,1.272,1.272,1.272,4082.899902,4293.75,...,47427.699219,4.1,4.1,4.1,4.1,0.948772,0.866025,0.0,1.0,949.3
max,17.5,17.4,17.4,17.4,1.427,1.426,1.426,1.426,12307.700195,12294.900391,...,70518.203125,11.4,11.3,11.1,11.1,0.999999,1.0,1.0,1.0,5733.42


In [10]:
train1 = TabularDataset(df1_0)
train2 = TabularDataset(df2_0)
train3 = TabularDataset(df3_0)

In [11]:
predictor1 = TabularPredictor(label="pv_measurement", eval_metric='mean_absolute_error',
                              path=PATH+"A").fit(
    train1,
    # time_limit=60,
    presets='best_quality',
    # tuning_data = tuning1,
    # use_bag_holdout=True,

    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_144_exstract_time_4_17A"
AutoGluon Version:  0.8.3b20231108
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   243.23 GB / 494.38 GB (49.2%)
Train Data Rows:    28713
Train Data Columns: 164
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 748.57453, 1235.0458

In [12]:
test1 = pipin.get_test_data("A")
test_data1 = TabularDataset(test1)


pred1 = pd.DataFrame(predictor1.predict(test_data1))

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)


In [13]:
predictor2 = TabularPredictor(label="pv_measurement", eval_metric='mean_absolute_error',
                              path=PATH+"B").fit(
    train2,

    presets='best_quality', 
    # hyperparameters='very_large', 
    # time_limit=60,
    # tuning_data = tuning2,
    # use_bag_holdout=True,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_144_exstract_time_4_17B"
AutoGluon Version:  0.8.3b20231108
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   238.02 GB / 494.38 GB (48.1%)
Train Data Rows:    20132
Train Data Columns: 164
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 137.91481, 232.20902)

In [14]:
predictor3 = TabularPredictor(label="pv_measurement", eval_metric='mean_absolute_error',
                              path=PATH+"C").fit(
    train3,

    presets='best_quality', 
    # hyperparameters='very_large', 
    # time_limit=60,
    # tuning_data = tuning3,
    # use_bag_holdout=True,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_144_exstract_time_4_17C"
AutoGluon Version:  0.8.3b20231108
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   228.41 GB / 494.38 GB (46.2%)
Train Data Rows:    17076
Train Data Columns: 164
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 118.60806, 192.76

In [15]:
test1 = pipin.get_test_data("A")
test2 = pipin.get_test_data("B")
test3 = pipin.get_test_data("C")

In [16]:
test_data1 = TabularDataset(test1)
test_data2 = TabularDataset(test2)
test_data3 = TabularDataset(test3)

# pred1 = pd.DataFrame(predictor1.predict(test_data1))
pred2 = pd.DataFrame(predictor2.predict(test_data2))
pred3 = pd.DataFrame(predictor3.predict(test_data3))

negatives_pred1 = pred1[pred1["pv_measurement"] < 0]
negatives_pred2 = pred2[pred2["pv_measurement"] < 0]
negatives_pred3 = pred3[pred3["pv_measurement"] < 0]
neg = pd.concat([negatives_pred1, negatives_pred2, negatives_pred3])
neg.shape

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = d

(38, 1)

In [17]:
pred = pd.concat([pred1, pred2, pred3])
final_prediction = pipin.post_processing(pred, prediction_column="pv_measurement")
final_prediction.to_csv('submissions/gluon_3_exstract_time_4_17_backwards.csv')

In [21]:
diff = pipin.compare_mae(final_prediction)
diff

21.880052342675608

In [19]:
# predictor.evaluate(df1, silent=True)

<h3> Henning skal teste å fjerne consecutives.</h3>
<p> yee<p>

In [1]:
%load_ext autoreload

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [None]:
import numpy as np
import pandas as pd
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.metrics import mean_absolute_error
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)


PATH = "../../../"
# Estimate
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Test estimates
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observations
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# make a options class with the options as attributes


class Options:
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(self, randomize=False, consecutive_threshold=6, normalize=False, group_by_hour=True, unzip_date_feature=True) -> None:
        self.randomize = randomize
        self.consecutive_threshold = consecutive_threshold
        self.normalize = normalize
        self.group_by_hour = group_by_hour
        self.unzip_date_feature = unzip_date_feature


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]
    
    def get_all_data(self, location: str):
        train, targets = self.get_training_data_by_location("A")
        train = self.drop_features(train)
        
        test = self.get_test_data_by_location("A")
        test = test[train.columns.to_list()]
        train = self.handle_data(train, targets)
        test = self.handle_data(test, train=False)
        return train, test

    def get_data(self, location: str) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str, columns: list) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        test_data = test_data[columns]
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame(), train=True):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        df = self.grouped_by_hour(df)
        # df = self.add_lag_features(df)

        df["time"] = df["date_forecast"]
        df.drop(["date_forecast"], axis=1, inplace=True)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        # df.drop(["date_calc"], axis=1, inplace=True)
        df.drop(["time"], axis=1, inplace=True)
        if train:
            df = self.remove_consecutive_measurments(df, 4)
        df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper funciton ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x
    
    def add_lag_features(self, df: pd.DataFrame):
        no_cat_features_1h = [c for c in df.columns if "_1h:" in c]
        lag_cols = df[no_cat_features_1h].select_dtypes(include=["number", "float", "int"]).columns.to_list()
        lag_f = LagFeatures(variables=lag_cols, periods=1)
        df_tr = lag_f.fit_transform(df[lag_cols].select_dtypes(include=["number", "float", "int"]))
        df[df_tr.columns] = df_tr
        return df

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        
        df["day_of_year"] = df[date_column].dt.day_of_year
        df['time_of_day'] = df[date_column].dt.hour + df['date_forecast'].dt.minute / 60
        
        df['time_sin'] = np.sin(2 * np.pi * df['time_of_day'] / 24)
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        
        # df["month"] = df["date_forecast"].dt.month
        df.drop(columns=["day_of_year", "time_of_day"], inplace=True)
        return df

    def add_time_since_calucation(self, df):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0)
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        df[df.columns] = df[df.columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12):
        df = df.copy()
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        mask = (df[column_to_check] != df[column_to_check].shift(2)).cumsum()

        df['consecutive_count'] = df.groupby(
            mask).transform('count')[column_to_check]

        mask = (df['consecutive_count'] > consecutive_threshold)
        mask_zero = (df['consecutive_count'] > consecutive_threshold_for_zero) & (
            df[column_to_check] == 0)
        df.drop(columns=["consecutive_count"], inplace=True)

        df = df.loc[~mask]
        df = df.loc[~mask_zero]
        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        best_submission: pd.DataFrame = pd.read_csv(
            PATH+"mikael/submissions/best_prediction.csv")
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise Exception("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])

    def split_train_tune(self, df: pd.DataFrame):
        df = df.copy()
        df_estimated = df.loc[df["estimated"] == 1]
        df_observed = df.loc[df["estimated"] == 0]

        num_rows = len(df_estimated)
        middle_index = num_rows // 2

        df_estimated.sample(frac=1, random_state=42)
        train_estimated = df.iloc[:middle_index]
        tune = df.iloc[middle_index:]

        train = pd.concat([df_observed, train_estimated])
        return train, tune
    
    def drop_features(self, df: pd.DataFrame):
        df.drop(
        columns=[
            "wind_speed_w_1000hPa:ms",
            "wind_speed_u_10m:ms",
            "wind_speed_v_10m:ms",
            "snow_density:kgm3",
        ], inplace=True)
        # correlated = DropCorrelatedFeatures(variables=None, method='spearman', threshold=0.98)
        # df = correlated.fit_transform(df)
        return df

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

important_features = [
    'time',
    'direct_rad:W',
    'diffuse_rad:W',
    'sun_azimuth:d',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'total_cloud_cover:p',
    'effective_cloud_cover:p',
    'rain_water:kgm2',
    'precip_5min:mm',
    'wind_speed_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'msl_pressure:hPa',
    'dew_point_2m:K',
    'is_day:idx',
    'is_in_shadow:idx',
    'elevation:m',

    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    'A',
    'B',
    'C',
    "estimated",
    "observed",
]


In [3]:
%autoreload
from pipeline_main import Pipeline
pipin = Pipeline()

Current working directory: /Users/miksx/GitHub/Forest-Gump/mikael/autoML/gluon


In [4]:
DEFAULT_PATH = f"ag_145_4_drop_correlated"

In [5]:
df1_0, test1 = pipin.get_all_data("A")
df2_0, test2 = pipin.get_all_data("B")
df3_0, test3 = pipin.get_all_data("C")

In [6]:
df1_0.describe()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,...,visibility:m,wind_speed_10m:ms,estimated,observed,time_sin,day_sin,clear_sky_energy_1h:J_lag_1,direct_rad_1h:J_lag_1,fresh_snow_1h:cm_lag_1,pv_measurement
count,20293.0,20293.0,16271.0,20293.0,20293.0,18794.0,20293.0,20293.0,20293.0,20293.0,...,20293.0,20293.0,20293.0,20293.0,20293.0,20293.0,20292.0,20292.0,20292.0,20301.0
mean,7.133427,1.24134,3130.42334,926402.2,257.321381,1791.566284,0.01094,88.096649,317180.3,65.917831,...,33474.390625,3.011817,0.092446,0.907554,0.637215,0.6250961,925362.6,317112.0,0.002579,1058.756741
std,2.856283,0.034508,2662.231934,927269.8,258.477142,1877.248169,0.096687,138.067474,489327.6,34.17968,...,18058.730469,1.759086,0.289661,0.289661,0.303578,0.3131824,928467.4,489578.7,0.043062,1352.401862
min,0.7,1.145,27.849998,0.0,0.0,27.950001,0.0,0.0,0.0,0.0,...,301.0,0.025,0.0,0.0,0.097755,6.432491e-16,0.0,0.0,0.0,0.0
25%,4.8,1.21775,1185.8125,79756.08,21.699999,598.849976,0.0,0.15,2632.45,39.724998,...,16755.474609,1.625,0.0,1.0,0.411603,0.3617137,77881.01,1819.581,0.0,47.3
50%,7.15,1.2385,2072.899902,593308.6,169.199997,1160.600098,0.0,15.675,64800.42,77.324997,...,37874.523438,2.7,0.0,1.0,0.770942,0.6932812,593420.1,64768.04,0.0,441.32
75%,9.1,1.264,4432.649902,1665297.0,451.700012,2204.0,0.0,121.074997,436111.3,98.224998,...,49022.375,4.1,0.0,1.0,0.933402,0.9242907,1665483.0,436244.1,0.0,1600.94
max,17.35,1.42625,12042.525391,2988628.0,835.099976,11673.625,1.0,683.400024,2441750.0,100.0,...,69395.523438,11.2,1.0,1.0,0.992522,0.9999907,2988628.0,2441750.0,3.125,5733.42


In [7]:
df2_0.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,...,visibility:m,wind_speed_10m:ms,estimated,observed,time_sin,day_sin,clear_sky_energy_1h:J_lag_1,direct_rad_1h:J_lag_1,fresh_snow_1h:cm_lag_1,pv_measurement
0,7.7,1.22825,1728.949951,0.0,0.0,1728.949951,0.0,0.0,0.0,99.074997,...,40386.476562,3.6,0.0,1.0,0.411603,0.486273,,,,0.0
1,7.7,1.2235,1689.824951,0.0,0.0,1689.824951,0.0,0.0,0.0,99.75,...,33770.648438,3.35,0.0,1.0,0.16246,0.486273,0.0,0.0,0.0,0.0
2,7.875,1.21975,1563.224976,0.0,0.0,1563.224976,0.0,0.0,0.0,100.0,...,13595.5,3.05,0.0,1.0,0.097755,0.47116,0.0,0.0,0.0,0.0
3,8.425,1.218,1283.425049,208.649994,0.75,1283.425049,0.0,0.0,0.0,100.0,...,2321.850098,2.725,0.0,1.0,0.351308,0.47116,0.0,0.0,0.0,0.0
4,8.95,1.218,1003.5,32468.150391,23.1,1003.5,0.0,0.15,282.975006,84.875,...,11634.799805,2.55,0.0,1.0,0.580919,0.47116,208.649994,0.0,0.0,19.36


In [8]:
df3_0.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,...,visibility:m,wind_speed_10m:ms,estimated,observed,time_sin,day_sin,clear_sky_energy_1h:J_lag_1,direct_rad_1h:J_lag_1,fresh_snow_1h:cm_lag_1,pv_measurement
0,7.7,1.22825,1728.949951,0.0,0.0,1728.949951,0.0,0.0,0.0,99.074997,...,40386.476562,3.6,0.0,1.0,0.411603,0.486273,,,,0.0
1,7.7,1.2235,1689.824951,0.0,0.0,1689.824951,0.0,0.0,0.0,99.75,...,33770.648438,3.35,0.0,1.0,0.16246,0.486273,0.0,0.0,0.0,0.0
2,7.875,1.21975,1563.224976,0.0,0.0,1563.224976,0.0,0.0,0.0,100.0,...,13595.5,3.05,0.0,1.0,0.097755,0.47116,0.0,0.0,0.0,0.0
3,8.425,1.218,1283.425049,208.649994,0.75,1283.425049,0.0,0.0,0.0,100.0,...,2321.850098,2.725,0.0,1.0,0.351308,0.47116,0.0,0.0,0.0,0.0
4,8.95,1.218,1003.5,32468.150391,23.1,1003.5,0.0,0.15,282.975006,84.875,...,11634.799805,2.55,0.0,1.0,0.580919,0.47116,208.649994,0.0,0.0,19.36


In [9]:
train1 = TabularDataset(df1_0)
train2 = TabularDataset(df2_0)
train3 = TabularDataset(df3_0)

In [10]:
predictor1 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"A"
                              ).fit(
    train1,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='best_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_145_4_drop_correlatedA"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   258.87 GB / 494.38 GB (52.4%)
Train Data Rows:    20301
Train Data Columns: 41
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 1058.75674, 1352.40186)
	If 're

In [11]:
predictor2 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"B"
                              ).fit(
    train2,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='best_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

# tuning_data
# num bag holdout 6
# bag_holdout

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_145_4_drop_correlatedB"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   258.88 GB / 494.38 GB (52.4%)
Train Data Rows:    20301
Train Data Columns: 41
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 1058.75674, 1352.40186)
	If 're

In [12]:
predictor3 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"C"
                              ).fit(
    train3,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='best_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_145_4_drop_correlatedC"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   258.89 GB / 494.38 GB (52.4%)
Train Data Rows:    20301
Train Data Columns: 41
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 1058.75674, 1352.40186)
	If 're

In [13]:
test_data1 = TabularDataset(test1)
test_data2 = TabularDataset(test2)
test_data3 = TabularDataset(test3)

pred1 = pd.DataFrame(predictor1.predict(test_data1))
pred2 = pd.DataFrame(predictor2.predict(test_data2))
pred3 = pd.DataFrame(predictor3.predict(test_data3))

negatives_pred1 = pred1[pred1["pv_measurement"] < 0]
negatives_pred2 = pred2[pred2["pv_measurement"] < 0]
negatives_pred3 = pred3[pred3["pv_measurement"] < 0]
neg = pd.concat([negatives_pred1, negatives_pred2, negatives_pred3])
neg.shape

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = d

(99, 1)

In [14]:
pred = pd.concat([pred1, pred2, pred3])
final_prediction = pipin.post_processing(pred, prediction_column="pv_measurement")
final_prediction.to_csv('submissions/gluon_145_4_drop_correlated_lag.csv')

In [15]:
diff = pipin.compare_mae(final_prediction)
diff

691.3612539340936

In [16]:
# predictor.evaluate(df1, silent=True)

<h3> Henning skal teste å fjerne consecutives.</h3>
<p> yee<p>

In [1]:
%load_ext autoreload

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [None]:
import numpy as np
import pandas as pd
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.metrics import mean_absolute_error
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)


PATH = "../../../"
# Estimate
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Test estimates
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observations
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# make a options class with the options as attributes


class Options:
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(self, randomize=False, consecutive_threshold=6, normalize=False, group_by_hour=True, unzip_date_feature=True) -> None:
        self.randomize = randomize
        self.consecutive_threshold = consecutive_threshold
        self.normalize = normalize
        self.group_by_hour = group_by_hour
        self.unzip_date_feature = unzip_date_feature


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]
    
    def get_all_data(self, location: str):
        train, targets = self.get_training_data_by_location("A")
        test = self.get_test_data_by_location("A")
        train = self.drop_features(train)
        test = test[train.columns.to_list()]
        train = self.handle_data(train, targets)
        test = self.handle_data(test, test=True)
        return train, test

    def get_data(self, location: str) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str, columns: list) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        test_data = test_data[columns]
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame(), test=False):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        # df = self.add_time_since_calucation(df)

        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        df = self.grouped_by_hour(df)
        df = self.add_lag_features(df)

        df["time"] = df["date_forecast"]
        # df.drop(["date_forecast"], axis=1, inplace=True)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        # df.drop(["date_calc"], axis=1, inplace=True)
        df.drop(["time"], axis=1, inplace=True)
        # df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper funciton ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x
    
    def add_lag_features(self, df: pd.DataFrame):
        no_cat_features_1h = [c for c in df.columns if "_1h:" in c]
        lag_cols = df[no_cat_features_1h].select_dtypes(include=["number", "float", "int"]).columns.to_list()
        lag_f = LagFeatures(variables=lag_cols, periods=1)
        df_tr = lag_f.fit_transform(df[lag_cols].select_dtypes(include=["number", "float", "int"]))
        df[df_tr.columns] = df_tr
        return df

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        
        df["day_of_year"] = df[date_column].dt.day_of_year
        df['time_of_day'] = df[date_column].dt.hour + df['date_forecast'].dt.minute / 60
        
        df['time_sin'] = np.sin(2 * np.pi * df['time_of_day'] / 24)
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        
        # df["month"] = df["date_forecast"].dt.month
        df.drop(columns=["day_of_year", "time_of_day"], inplace=True)
        return df

    def add_time_since_calucation(self, df):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0)
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        df[df.columns] = df[df.columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12):
        df = df.copy()
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        mask = (df[column_to_check] != df[column_to_check].shift(2)).cumsum()

        df['consecutive_count'] = df.groupby(
            mask).transform('count')[column_to_check]

        mask = (df['consecutive_count'] > consecutive_threshold)
        mask_zero = (df['consecutive_count'] > consecutive_threshold_for_zero) & (
            df[column_to_check] == 0)
        df.drop(columns=["consecutive_count"], inplace=True)

        df = df.loc[~mask]
        df = df.loc[~mask_zero]
        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        best_submission: pd.DataFrame = pd.read_csv(
            PATH+"mikael/submissions/best_prediction.csv")
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise Exception("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])

    def split_train_tune(self, df: pd.DataFrame):
        df = df.copy()
        df_estimated = df.loc[df["estimated"] == 1]
        df_observed = df.loc[df["estimated"] == 0]

        num_rows = len(df_estimated)
        middle_index = num_rows // 2

        df_estimated.sample(frac=1, random_state=42)
        train_estimated = df.iloc[:middle_index]
        tune = df.iloc[middle_index:]

        train = pd.concat([df_observed, train_estimated])
        return train, tune
    
    def drop_features(self, df: pd.DataFrame):
        df.drop(
        columns=[
            "wind_speed_w_1000hPa:ms",
            "wind_speed_u_10m:ms",
            "wind_speed_v_10m:ms",
        ], inplace=True)
        # correlated = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.8)
        # df = correlated.fit_transform(df)
        return df

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df
    
    def find_min_max_date_in_test(self) -> list:
        locations = ["A", "B", "C"]
        dates = []
        for loc in locations:
            df = self.get_test_data_by_location(loc)
            df["date_forecast"] = pd.to_datetime(df["date_forecast"])
            dates.append((df["date_forecast"].min(),
                         df["date_forecast"].max()))
        return dates

    def split_train_summer_2021(self, df: pd.DataFrame):
        dates = self.find_min_max_date_in_test()
        # set the dates to the summer of 2021
        dates = [(date[0].replace(year=2021), date[1].replace(year=2021))
                 for date in dates]

        summer2021 = df[(df["date_forecast"] >= dates[0][0]) & (
            df["date_forecast"] <= dates[0][1])]

        train = df[~df.index.isin(summer2021.index)]
        return train, summer2021

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

important_features = [
    'time',
    'direct_rad:W',
    'diffuse_rad:W',
    'sun_azimuth:d',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'total_cloud_cover:p',
    'effective_cloud_cover:p',
    'rain_water:kgm2',
    'precip_5min:mm',
    'wind_speed_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'msl_pressure:hPa',
    'dew_point_2m:K',
    'is_day:idx',
    'is_in_shadow:idx',
    'elevation:m',

    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    'A',
    'B',
    'C',
    "estimated",
    "observed",
]


In [3]:
%autoreload
from pipeline_145_preset import Pipeline
pipin = Pipeline()

Current working directory: /Users/miksx/GitHub/Forest-Gump/mikael/autoML/gluon


In [4]:
DEFAULT_PATH = f"ag_145_experimental_exstreme_preset"

In [5]:
df1_0 = pipin.get_data("A")
df2_0 = pipin.get_data("B")
df3_0 = pipin.get_data("C")

In [6]:
# remove_consecutive_measurments
df1_0 = pipin.remove_consecutive_measurments(df1_0, 400, 25)
df2_0 = pipin.remove_consecutive_measurments(df2_0, 400, 25)
df3_0 = pipin.remove_consecutive_measurments(df3_0, 400, 25)

In [7]:
df1_0.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,estimated,observed,day_of_year,hour,pv_measurement
0,7.7,1.22825,1728.949951,0.0,0.0,1728.949951,0.0,280.299988,0.0,0.0,...,40386.476562,3.6,3.575,0.5,0.0,0.0,1.0,153.0,22.0,0.0
1,7.7,1.2235,1689.824951,0.0,0.0,1689.824951,0.0,280.299988,0.0,0.0,...,33770.648438,3.35,3.35,0.275,0.0,0.0,1.0,153.0,23.0,0.0
2,7.875,1.21975,1563.224976,0.0,0.0,1563.224976,0.0,280.649994,0.0,0.0,...,13595.5,3.05,2.95,0.75,0.0,0.0,1.0,154.0,0.0,0.0
3,8.425,1.218,1283.425049,208.649994,0.75,1283.425049,0.0,281.674988,0.3,526.775024,...,2321.850098,2.725,2.6,0.875,0.0,0.0,1.0,154.0,1.0,0.0
4,8.95,1.218,1003.5,32468.150391,23.1,1003.5,0.0,282.5,11.975,22068.949219,...,11634.799805,2.55,2.35,0.925,0.0,0.0,1.0,154.0,2.0,19.36


In [8]:
train1 = TabularDataset(df1_0)
train2 = TabularDataset(df2_0)
train3 = TabularDataset(df3_0)

In [9]:
predictor1 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"A"
                              ).fit(
    train1,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='experimental_extreme_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )



Presets specified: ['experimental_extreme_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_145_experimental_exstreme_presetA"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   213.28 GB / 494.38 GB (43.1%)
Train Data Rows:    34043
Train Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 631.

In [10]:
predictor2 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"B"
                              ).fit(
    train2,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='experimental_extreme_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

# tuning_data
# num bag holdout 6
# bag_holdout

Presets specified: ['experimental_extreme_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_145_experimental_exstreme_presetB"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   212.21 GB / 494.38 GB (42.9%)
Train Data Rows:    27618
Train Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 107.6

In [11]:
predictor3 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"C"
                              ).fit(
    train3,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='experimental_extreme_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['experimental_extreme_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_145_experimental_exstreme_presetC"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   209.17 GB / 494.38 GB (42.3%)
Train Data Rows:    21169
Train Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 9

In [12]:
test1 = pipin.get_test_data("A")
test2 = pipin.get_test_data("B")
test3 = pipin.get_test_data("C")

In [13]:
test_data1 = TabularDataset(test1)
test_data2 = TabularDataset(test2)
test_data3 = TabularDataset(test3)

pred1 = pd.DataFrame(predictor1.predict(test_data1))
pred2 = pd.DataFrame(predictor2.predict(test_data2))
pred3 = pd.DataFrame(predictor3.predict(test_data3))

negatives_pred1 = pred1[pred1["pv_measurement"] < 0]
negatives_pred2 = pred2[pred2["pv_measurement"] < 0]
negatives_pred3 = pred3[pred3["pv_measurement"] < 0]
neg = pd.concat([negatives_pred1, negatives_pred2, negatives_pred3])
neg.shape

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = d

(151, 1)

In [14]:
pred = pd.concat([pred1, pred2, pred3])
final_prediction = pipin.post_processing(pred, prediction_column="pv_measurement")
final_prediction.to_csv('gluon_145_experimental_exstreme_preset.csv')

In [15]:
diff = pipin.compare_mae(final_prediction)
diff

55.24093004142889

In [16]:
# predictor.evaluate(df1, silent=True)

In [1]:
%load_ext autoreload

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
from feature_engine.timeseries.forecasting import LagFeatures
from sklearn.metrics import mean_absolute_error
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)


PATH = "../../../"
# Estimate
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Test estimates
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observations
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# make a options class with the options as attributes


class Options:
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(self, randomize=False, consecutive_threshold=6, normalize=False, group_by_hour=True, unzip_date_feature=True) -> None:
        self.randomize = randomize
        self.consecutive_threshold = consecutive_threshold
        self.normalize = normalize
        self.group_by_hour = group_by_hour
        self.unzip_date_feature = unzip_date_feature


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]

    def get_data(self, location: str) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame()):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        # df = self.add_time_since_calucation(df)
        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        df = self.grouped_by_hour(df)
        df = self.add_lag_features(df)

        df["time"] = df["date_forecast"]
        df.drop(["date_forecast"], axis=1, inplace=True)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        # df.drop(["date_calc"], axis=1, inplace=True)
        df.drop(["time"], axis=1, inplace=True)
        df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper funciton ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x
    
    def add_lag_features(self, df: pd.DataFrame):
        no_cat_features_1h = [c for c in df.columns if "_1h:" in c]
        lag_cols = df[no_cat_features_1h].select_dtypes(include=["number", "float", "int"]).columns.to_list()
        lag_f = LagFeatures(variables=lag_cols, periods=1)
        df_tr = lag_f.fit_transform(df[lag_cols].select_dtypes(include=["number", "float", "int"]))
        df[df_tr.columns] = df_tr
        return df

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        df["day_of_year"] = df["date_forecast"].dt.day_of_year
        df["hour"] = df["date_forecast"].dt.hour
        # df["month"] = df["date_forecast"].dt.month
        return df

    def add_time_since_calucation(self, df):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0)
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        df[df.columns] = df[df.columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12):
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        mask = (df[column_to_check] != df[column_to_check].shift(2)).cumsum()

        df['consecutive_count'] = df.groupby(
            mask).transform('count')[column_to_check]

        mask = (df['consecutive_count'] > consecutive_threshold)
        mask_zero = (df['consecutive_count'] > consecutive_threshold_for_zero) & (
            df[column_to_check] == 0)
        df.drop(columns=["consecutive_count"], inplace=True)

        df = df.loc[~mask]
        df = df.loc[~mask_zero]
        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        best_submission: pd.DataFrame = pd.read_csv(
            PATH+"mats/submissions/big_gluon_best.csv")
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise Exception("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])

    def split_train_tune(self, df: pd.DataFrame):
        df = df.copy()
        df_estimated = df.loc[df["estimated"] == 1]
        df_observed = df.loc[df["estimated"] == 0]

        num_rows = len(df_estimated)
        middle_index = num_rows // 2

        df_estimated.sample(frac=1, random_state=42)
        train_estimated = df.iloc[:middle_index]
        tune = df.iloc[middle_index:]

        train = pd.concat([df_observed, train_estimated])
        return train, tune

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

important_features = [
    'time',
    'direct_rad:W',
    'diffuse_rad:W',
    'sun_azimuth:d',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'total_cloud_cover:p',
    'effective_cloud_cover:p',
    'rain_water:kgm2',
    'precip_5min:mm',
    'wind_speed_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'msl_pressure:hPa',
    'dew_point_2m:K',
    'is_day:idx',
    'is_in_shadow:idx',
    'elevation:m',

    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    'A',
    'B',
    'C',
    "estimated",
    "observed",
]


In [3]:
%autoreload
from pipeline_lag_features_sine import Pipeline
pipin = Pipeline()

Current working directory: /Users/miksx/GitHub/Forest-Gump/mikael/autoML/gluon


In [4]:
DEFAULT_PATH = f"ag_145_lag_features_4_24_sine"

In [5]:
df1_0, test1  = pipin.get_all_data("A")
df2_0, test2 = pipin.get_all_data("B")
df3_0, test3 = pipin.get_all_data("C")

In [6]:
# remove_consecutive_measurments
df1_0 = pipin.remove_consecutive_measurments(df1_0, 4, 24)
df2_0 = pipin.remove_consecutive_measurments(df2_0, 4, 24)
df3_0 = pipin.remove_consecutive_measurments(df3_0, 4, 24)

In [7]:
df1_0.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,estimated,observed,time_sin,day_sin,clear_sky_energy_1h:J_lag_1,diffuse_rad_1h:J_lag_1,direct_rad_1h:J_lag_1,fresh_snow_1h:cm_lag_1,pv_measurement
0,7.7,1.22825,1728.949951,0.0,0.0,1728.949951,0.0,280.299988,0.0,0.0,...,3.6,0.0,1.0,0.411603,0.486273,,,,,0.0
1,7.7,1.2235,1689.824951,0.0,0.0,1689.824951,0.0,280.299988,0.0,0.0,...,3.35,0.0,1.0,0.16246,0.486273,0.0,0.0,0.0,0.0,0.0
2,7.875,1.21975,1563.224976,0.0,0.0,1563.224976,0.0,280.649994,0.0,0.0,...,3.05,0.0,1.0,0.097755,0.47116,0.0,0.0,0.0,0.0,0.0
3,8.425,1.218,1283.425049,208.649994,0.75,1283.425049,0.0,281.674988,0.3,526.775024,...,2.725,0.0,1.0,0.351308,0.47116,0.0,0.0,0.0,0.0,0.0
4,8.95,1.218,1003.5,32468.150391,23.1,1003.5,0.0,282.5,11.975,22068.949219,...,2.55,0.0,1.0,0.580919,0.47116,208.649994,526.775024,0.0,0.0,19.36


In [8]:
train1 = TabularDataset(df1_0)
train2 = TabularDataset(df2_0)
train3 = TabularDataset(df3_0)

In [9]:
predictor1 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"A"
                              ).fit(
    train1,
    time_limit=3600,
    # hyperparameters='extrme', 
    presets='best_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "ag_145_lag_features_4_24_sineA"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   252.38 GB / 494.38 GB (51.0%)
Train Data Rows:    20301
Train Data Columns: 50
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 1058.7

In [10]:
predictor2 = TabularPredictor(label="pv_measurement", 
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"B"
                              ).fit(
    train2,
    time_limit=3600,
    presets='best_quality', 
    # hyperparameters='very_large', 
    # tuning_data = tuning2,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

# tuning_data
# num bag holdout 6
# bag_holdout

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "ag_145_lag_features_4_24_sineB"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   247.07 GB / 494.38 GB (50.0%)
Train Data Rows:    20301
Train Data Columns: 50
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 1058.7

In [11]:
predictor3 = TabularPredictor(label="pv_measurement", 
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"C"
                              ).fit(
    train3,
    time_limit=3600,
    presets='best_quality', 
    # hyperparameters='very_large', 
    # tuning_data = tuning3,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "ag_145_lag_features_4_24_sineC"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   239.22 GB / 494.38 GB (48.4%)
Train Data Rows:    20301
Train Data Columns: 50
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 1058.7

In [12]:
test_data1 = TabularDataset(test1)
test_data2 = TabularDataset(test2)
test_data3 = TabularDataset(test3)

pred1 = pd.DataFrame(predictor1.predict(test_data1))
pred2 = pd.DataFrame(predictor2.predict(test_data2))
pred3 = pd.DataFrame(predictor3.predict(test_data3))

negatives_pred1 = pred1[pred1["pv_measurement"] < 0]
negatives_pred2 = pred2[pred2["pv_measurement"] < 0]
negatives_pred3 = pred3[pred3["pv_measurement"] < 0]
neg = pd.concat([negatives_pred1, negatives_pred2, negatives_pred3])
neg.shape

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = d

(240, 1)

In [13]:
pred = pd.concat([pred1, pred2, pred3])
final_prediction = pipin.post_processing(pred, prediction_column="pv_measurement")
final_prediction.to_csv('submissions/gluon_145_lag_features_4_24_sine.csv')

In [14]:
diff = pipin.compare_mae(final_prediction)
diff

689.0320463115394

In [15]:
# predictor.evaluate(df1, silent=True)

In [1]:
%load_ext autoreload

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)


PATH = "../../../"
# Estimate
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Test estimates
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observations
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# make a options class with the options as attributes


class Options:
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(self, randomize=False, consecutive_threshold=6, normalize=False, group_by_hour=True, unzip_date_feature=True) -> None:
        self.randomize = randomize
        self.consecutive_threshold = consecutive_threshold
        self.normalize = normalize
        self.group_by_hour = group_by_hour
        self.unzip_date_feature = unzip_date_feature


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]

    def get_data(self, location: str) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame()):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        # df = self.add_time_since_calucation(df)
        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        df = self.grouped_by_hour(df)

        df["time"] = df["date_forecast"]
        df.drop(["date_forecast"], axis=1, inplace=True)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        # df.drop(["date_calc"], axis=1, inplace=True)
        df.drop(["time"], axis=1, inplace=True)
        df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper funciton ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        
        df["day_of_year"] = df[date_column].dt.day_of_year
        df['time_of_day'] = df[date_column].dt.hour + df['date_forecast'].dt.minute / 60
        
        df['time_sin'] = np.sin(2 * np.pi * df['time_of_day'] / 24)
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        
        # df["month"] = df["date_forecast"].dt.month
        df.drop(columns=["day_of_year", "time_of_day"], inplace=True)
        return df

    def add_time_since_calucation(self, df):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0)
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        df[df.columns] = df[df.columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12):
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        mask = (df[column_to_check] != df[column_to_check].shift(2)).cumsum()

        df['consecutive_count'] = df.groupby(
            mask).transform('count')[column_to_check]

        mask = (df['consecutive_count'] > consecutive_threshold)
        mask_zero = (df['consecutive_count'] > consecutive_threshold_for_zero) & (
            df[column_to_check] == 0)
        df.drop(columns=["consecutive_count"], inplace=True)

        df = df.loc[~mask]
        df = df.loc[~mask_zero]
        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        best_submission: pd.DataFrame = pd.read_csv(
            PATH+"mikael/best_submission.csv")
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise Exception("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])

    def split_train_tune(self, df: pd.DataFrame):
        df = df.copy()
        df_estimated = df.loc[df["estimated"] == 1]
        df_observed = df.loc[df["estimated"] == 0]

        num_rows = len(df_estimated)
        middle_index = num_rows // 2

        df_estimated.sample(frac=1, random_state=42)
        train_estimated = df.iloc[:middle_index]
        tune = df.iloc[middle_index:]

        train = pd.concat([df_observed, train_estimated])
        return train, tune

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

important_features = [
    'time',
    'direct_rad:W',
    'diffuse_rad:W',
    'sun_azimuth:d',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'total_cloud_cover:p',
    'effective_cloud_cover:p',
    'rain_water:kgm2',
    'precip_5min:mm',
    'wind_speed_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'msl_pressure:hPa',
    'dew_point_2m:K',
    'is_day:idx',
    'is_in_shadow:idx',
    'elevation:m',

    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    'A',
    'B',
    'C',
    "estimated",
    "observed",
]


In [3]:
%autoreload
from pipeline_sin_transformation import Pipeline
pipin = Pipeline()

Current working directory: /Users/miksx/GitHub/Forest-Gump/mikael/autoML/gluon


In [4]:
DEFAULT_PATH = f"ag_148_sine_transform"

In [5]:
df1_0 = pipin.get_data("A")
df2_0 = pipin.get_data("B")
df3_0 = pipin.get_data("C")


In [6]:
df1_0.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,estimated,observed,time_sin,day_sin,pv_measurement
0,7.7,1.22825,1728.949951,0.0,0.0,1728.949951,0.0,280.299988,0.0,0.0,...,40386.476562,3.6,3.575,0.5,0.0,0.0,1.0,0.411603,0.486273,0.0
1,7.7,1.2235,1689.824951,0.0,0.0,1689.824951,0.0,280.299988,0.0,0.0,...,33770.648438,3.35,3.35,0.275,0.0,0.0,1.0,0.16246,0.486273,0.0
2,7.875,1.21975,1563.224976,0.0,0.0,1563.224976,0.0,280.649994,0.0,0.0,...,13595.5,3.05,2.95,0.75,0.0,0.0,1.0,0.097755,0.47116,0.0
3,8.425,1.218,1283.425049,208.649994,0.75,1283.425049,0.0,281.674988,0.3,526.775024,...,2321.850098,2.725,2.6,0.875,0.0,0.0,1.0,0.351308,0.47116,0.0
4,8.95,1.218,1003.5,32468.150391,23.1,1003.5,0.0,282.5,11.975,22068.949219,...,11634.799805,2.55,2.35,0.925,0.0,0.0,1.0,0.580919,0.47116,19.36


In [7]:
train1 = TabularDataset(df1_0)
train2 = TabularDataset(df2_0)
train3 = TabularDataset(df3_0)

In [8]:
predictor1 = TabularPredictor(label="pv_measurement",
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"A"
                              ).fit(
    train1,
    # time_limit=6000,
    # hyperparameters='extrme', 
    presets='best_quality', 
    # tuning_data = tuning1,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_148_sine_transformA"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   50.00 GB / 494.38 GB (10.1%)
Train Data Rows:    34085
Train Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 630.59471, 1165.90242)
	If 'regress

In [10]:
predictor2 = TabularPredictor(label="pv_measurement", 
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"B"
                              ).fit(
    train2,

    presets='best_quality', 
    # hyperparameters='very_large', 
    # time_limit=6000,
    # tuning_data = tuning2,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

# tuning_data
# num bag holdout 6
# bag_holdout

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_148_sine_transformB"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   39.84 GB / 494.38 GB (8.1%)
Train Data Rows:    32844
Train Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 96.82478, 193.94649)
	If 'regression'

In [11]:
predictor3 = TabularPredictor(label="pv_measurement", 
                              eval_metric='mean_absolute_error',
                              path= DEFAULT_PATH+"C"
                              ).fit(
    train3,

    presets='best_quality', 
    # hyperparameters='very_large', 
    # time_limit=6000,
    # tuning_data = tuning3,
    # use_bag_holdout=True,
    # num_bag_folds= 6,
    # refit_full = True,
    # auto_stack = True,
    # num_bag_sets= 10,
    # set_best_to_refit_full= True,
    # num_stack_levels = 2,
    # verbosity = 3
    )

Presets specified: ['best_quality']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "ag_148_sine_transformC"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   29.39 GB / 494.38 GB (5.9%)
Train Data Rows:    26095
Train Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 77.63106, 165.81688)
	If 'regress

In [12]:
test1 = pipin.get_test_data("A")
test2 = pipin.get_test_data("B")
test3 = pipin.get_test_data("C")

In [13]:
test_data1 = TabularDataset(test1)
test_data2 = TabularDataset(test2)
test_data3 = TabularDataset(test3)

pred1 = pd.DataFrame(predictor1.predict(test_data1))
pred2 = pd.DataFrame(predictor2.predict(test_data2))
pred3 = pd.DataFrame(predictor3.predict(test_data3))

negatives_pred1 = pred1[pred1["pv_measurement"] < 0]
negatives_pred2 = pred2[pred2["pv_measurement"] < 0]
negatives_pred3 = pred3[pred3["pv_measurement"] < 0]
neg = pd.concat([negatives_pred1, negatives_pred2, negatives_pred3])
neg.shape

  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = df.fillna(column_fills, inplace=False, downcast=False)
  df = d

(69, 1)

In [18]:
pred = pd.concat([pred1, pred2, pred3])
final_prediction = pipin.post_processing(pred, prediction_column="pv_measurement")
final_prediction.to_csv(f'submissions/gluon_3_sine_transform.csv')

In [20]:
diff = pipin.compare_mae(final_prediction)
diff

57.990381872286804

In [None]:
# predictor.evaluate(df1, silent=True)