In [54]:
import pandas as pd
from Master import Master
from autogluon.tabular import TabularDataset, TabularPredictor


# Show up to 200 rows and never truncate columns when a frame is displayed.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

In [55]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
import os

current_dir = os.getcwd()
print("Current working directory:", current_dir)


# Root folder of the data set. Set the FOREST_GUMP_DATA environment variable to
# point somewhere else instead of editing the notebook; the default is the
# folder this notebook was originally written against.
# os.path.join(..., "") guarantees the trailing separator that the
# `PATH + "..."` concatenations below (and in Pipeline.compare_mae) rely on.
PATH = os.path.join(
    os.environ.get("FOREST_GUMP_DATA",
                   "/Users/matsalexander/Desktop/Forest Gump/"),
    "")

# Estimated training features, one frame per location (A, B, C).
X_train_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + 'A/X_train_estimated.parquet')
X_train_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_estimated.parquet")

# Estimated test features, one frame per location.
X_test_estimated_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_test_estimated.parquet")

# Observed training features, one frame per location.
X_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/X_train_observed.parquet")

# Training targets, one frame per location.
Y_train_observed_a: pd.DataFrame = pd.read_parquet(
    PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(
    PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(
    PATH + "C/train_targets.parquet")

test_df_example = pd.read_csv(PATH + "test.csv")

# NOTE(review): Pipeline.compare_mae reads a different reference file
# ("mats/submissions/big_gluon_best.csv"); this frame is not used by it.
best_submission: pd.DataFrame = pd.read_csv(
    PATH + "mikael/submissions/fourth_submission.csv")

# Same settings and values as the defaults of the Options class.
# NOTE(review): the name looks like a typo for "options"; it is kept because
# code outside this view may refer to it.
optins = {
    "randomize": False,
    "consecutive_threshold": 6,
    "normalize": False,
    "group_by_hour": True,
    "unzip_date_feature": True,
}

# Options class exposing the same settings as attributes

class Options:
    """Plain container for the pipeline settings.

    Every setting exists both as a class-level default and as an instance
    attribute assigned by the constructor.
    """

    # Class-level defaults; they mirror the constructor defaults below.
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(
        self,
        randomize=False,
        consecutive_threshold=6,
        normalize=False,
        group_by_hour=True,
        unzip_date_feature=True,
    ) -> None:
        """Store each argument on the instance under the same name."""
        self.unzip_date_feature = unzip_date_feature
        self.group_by_hour = group_by_hour
        self.normalize = normalize
        self.consecutive_threshold = consecutive_threshold
        self.randomize = randomize


class Pipeline:
    """Turns the notebook's raw per-location tables into model-ready frames.

    The raw tables are read from notebook-level globals
    (``X_train_observed_a``, ``X_train_estimated_b``, ``X_test_estimated_c``,
    ``Y_train_observed_a``, ...). Most helper methods add columns to the frame
    they receive and return that same frame.
    """

    _LOCATIONS = ("A", "B", "C")
    _TARGET_COLUMN = "pv_measurement"

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        """Build one frame covering locations A, B and C.

        Args:
            test_data: when true, use the test features (no target column);
                otherwise use the training features merged with their targets.

        Returns:
            The per-location frames stacked with a fresh 0..n-1 index and the
            one-hot location columns ``A``/``B``/``C`` added. For training data
            the ``pv_measurement`` target is moved to the last column.
        """
        frames = []
        for location in self._LOCATIONS:
            if test_data:
                frame = self.get_test_data(location)
            else:
                frame = self.get_data(location)
            frames.append(self.onehot_location(frame, location))
        df = pd.concat(frames).reset_index(drop=True)

        if test_data:
            return df
        # The target is kept as the last column.
        feature_columns = [c for c in df if c != self._TARGET_COLUMN]
        return df[feature_columns + [self._TARGET_COLUMN]]

    def get_data(self, location: str) -> pd.DataFrame:
        """Return the processed training frame (features + target) for one location."""
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str) -> pd.DataFrame:
        """Return the processed test feature frame for one location."""
        test_data = self.get_test_data_by_location(location)
        return self.handle_data(test_data)

    def handle_data(self, df, targets=None):
        """Run the feature steps on ``df`` and optionally attach targets.

        Steps, in order: parse the two date columns, add ``calculated_ago``,
        add the ``estimated``/``observed`` flags, add ``day_of_year``/``hour``,
        average per hour, and (when targets are given) inner-join them on
        ``time``. The timestamp column is dropped from the result.

        Note that the date parsing and the added columns are written onto the
        frame that is passed in.

        Args:
            df: feature frame with ``date_calc`` and ``date_forecast`` columns.
            targets: frame with ``time`` and ``pv_measurement`` columns.
                ``None`` or an empty frame skips the merge.

        Returns:
            The processed frame without any timestamp column.
        """
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        df = self.add_time_since_calucation(df)
        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        # The hourly mean is numeric-only, so the datetime `date_calc` column
        # does not survive this step and needs no explicit drop.
        df = self.grouped_by_hour(df)

        # The targets are keyed on "time".
        df = df.rename(columns={"date_forecast": "time"})
        if targets is not None and not targets.empty:
            df = self.merge_train_target(df, targets)

        return df.drop(columns=["time"])

    # ------------------------------ helper functions ------------------------------
    def get_training_data_by_location(self, location):
        """Return ``(features, targets)`` for one location.

        The features are the observed rows followed by the estimated rows,
        re-indexed from 0.

        Raises:
            ValueError: if ``location`` is not "A", "B" or "C".
        """
        if location == "A":
            observed, estimated, targets = (
                X_train_observed_a, X_train_estimated_a, Y_train_observed_a)
        elif location == "B":
            observed, estimated, targets = (
                X_train_observed_b, X_train_estimated_b, Y_train_observed_b)
        elif location == "C":
            observed, estimated, targets = (
                X_train_observed_c, X_train_estimated_c, Y_train_observed_c)
        else:
            raise ValueError("location must be A, B or C")
        train = pd.concat([observed, estimated]).reset_index(drop=True)
        return train, targets

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        """Return a copy of the test feature frame for one location.

        ``normalize`` is accepted but not used.

        Raises:
            ValueError: if ``location`` is not "A", "B" or "C".
        """
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise ValueError("location must be A, B or C")
        # Copy so the later in-place feature steps leave the global untouched.
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        """Add ``day_of_year`` and ``hour`` columns derived from ``date_column``.

        ``date_column`` is parsed to datetime in place; ``df`` is modified and
        returned.
        """
        df[date_column] = pd.to_datetime(df[date_column])
        # Read from the requested column, not a hard-coded "date_forecast".
        df["day_of_year"] = df[date_column].dt.day_of_year
        df["hour"] = df[date_column].dt.hour
        return df

    def add_time_since_calucation(self, df):
        """Add ``calculated_ago``: seconds from ``date_calc`` to ``date_forecast``.

        Rows where the difference is missing get 0. ``df`` is modified and
        returned.
        """
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        elapsed = (df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = elapsed.fillna(0)
        return df

    def onehot_estimated(self, df):
        """Add 0/1 ``estimated`` and ``observed`` flags based on ``date_calc``.

        A row counts as estimated when ``date_calc`` is present, otherwise as
        observed. ``df`` is modified and returned.
        """
        estimated_mask = df["date_calc"].notna()
        df["estimated"] = estimated_mask.astype("int64")
        df["observed"] = (~estimated_mask).astype("int64")
        return df

    def onehot_location(self, df, location):
        """Add 0/1 columns ``A``, ``B``, ``C`` marking ``location``.

        ``df`` is modified and returned.

        Raises:
            ValueError: if ``location`` is not "A", "B" or "C" (same rule as
                the other location-based methods of this class).
        """
        if location not in self._LOCATIONS:
            raise ValueError("location must be A, B or C")
        for name in self._LOCATIONS:
            df[name] = 1 if name == location else 0
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        """Average the numeric columns per hour of ``date_column``.

        Hours for which every averaged value is missing are dropped.
        ``date_column`` comes back as a regular column.
        """
        # "h" is the hourly alias; the upper-case "H" is deprecated in recent pandas.
        hourly = df.groupby(pd.Grouper(key=date_column, freq="h")
                            ).mean(numeric_only=True)
        return hourly.dropna(how="all").reset_index()

    def merge_train_target(self, x, y):
        """Inner-join features and targets on ``time``, dropping rows without a target."""
        merged = pd.merge(x, y, on="time", how="inner")
        return merged.dropna(subset=["pv_measurement"]).reset_index(drop=True)

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12):
        """Drop rows that belong to long runs of an identical ``pv_measurement``.

        A run is a maximal stretch of adjacent rows sharing one value.
        Non-zero runs longer than ``consecutive_threshold`` are removed; runs
        of zeros are only removed when longer than
        ``consecutive_threshold_for_zero``.

        Args:
            df: frame with a ``pv_measurement`` column. It is not modified.
            consecutive_threshold: longest non-zero run that is kept. Values
                below 2 disable the filter and return ``df`` unchanged.
            consecutive_threshold_for_zero: longest run of zeros that is kept.

        Returns:
            The filtered frame with a fresh 0..n-1 index.
        """
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        values = df[column_to_check]

        # A new run starts wherever a value differs from the one directly
        # before it; the running sum of those starts labels each run.
        run_id = (values != values.shift()).cumsum()
        run_length = values.groupby(run_id).transform('count')

        is_zero = values == 0
        stuck_nonzero = ~is_zero & (run_length > consecutive_threshold)
        stuck_zero = is_zero & (run_length > consecutive_threshold_for_zero)

        return df.loc[~(stuck_nonzero | stuck_zero)].reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame, reference_path=None):
        """Mean absolute error between ``df`` and a reference submission.

        Args:
            df: frame whose only column is ``prediction``; its shape must equal
                that of the reference's ``prediction`` column.
            reference_path: CSV with a ``prediction`` column. Defaults to
                ``PATH + "mats/submissions/big_gluon_best.csv"``.

        Raises:
            ValueError: if the two shapes differ (both shapes are printed first).
        """
        if reference_path is None:
            reference_path = PATH + "mats/submissions/big_gluon_best.csv"
        best_submission: pd.DataFrame = pd.read_csv(reference_path)
        best_submission = best_submission[["prediction"]]

        if best_submission.shape != df.shape:
            print("best_submission", best_submission.shape)
            print("df", df.shape)
            raise ValueError("Dataframe shape must be the same")

        return mean_absolute_error(
            best_submission["prediction"], df["prediction"])


# Build the combined training frame (get_combined_data defaults to
# test_data=False) and display it.
pipe = Pipeline()
df00 = pipe.get_combined_data()
df00

Current working directory: /Users/matsalexander/Desktop/Forest Gump/henning mats


Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,calculated_ago,estimated,observed,day_of_year,hour,A,B,C,pv_measurement
0,7.700,1.22825,1728.949951,0.000000,0.000,1728.949951,0.0,280.299988,0.000,0.000000,0.00,0.000000,99.074997,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1006.299988,0.0,0.0,993.750000,999.775024,0.0,0.000,71.674995,1005.799988,,0.0,0.0,0.0,0.175,348.036743,-3.77425,0.000,286.225006,100.000000,40386.476562,3.600,-3.575,-0.500,0.0,0.0,0.0,1.0,153.0,22.0,1,0,0,0.00
1,7.700,1.22350,1689.824951,0.000000,0.000,1689.824951,0.0,280.299988,0.000,0.000000,0.00,0.000000,99.750000,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1005.200012,0.0,0.0,992.674988,998.650024,0.0,0.025,68.000000,1004.650024,,0.0,0.0,0.0,0.200,91.980751,-4.35725,0.000,286.899994,100.000000,33770.648438,3.350,-3.350,0.275,0.0,0.0,0.0,1.0,153.0,23.0,1,0,0,0.00
2,7.875,1.21975,1563.224976,0.000000,0.000,1563.224976,0.0,280.649994,0.000,0.000000,0.00,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1004.525024,0.0,0.0,992.000000,997.974976,0.0,0.100,67.949997,1003.950012,,0.0,0.0,0.0,0.400,14.934750,-3.30950,0.000,286.950012,100.000000,13595.500000,3.050,-2.950,0.750,0.0,0.0,0.0,1.0,154.0,0.0,1,0,0,0.00
3,8.425,1.21800,1283.425049,208.649994,0.750,1283.425049,0.0,281.674988,0.300,526.775024,0.00,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,1004.025024,0.0,0.0,991.500000,997.449951,0.0,0.125,73.875000,1003.449951,,0.0,0.0,0.0,0.550,28.630251,-0.82250,0.000,286.750000,100.000000,2321.850098,2.725,-2.600,0.875,0.0,0.0,0.0,1.0,154.0,1.0,1,0,0,0.00
4,8.950,1.21800,1003.500000,32468.150391,23.100,1003.500000,0.0,282.500000,11.975,22068.949219,0.15,282.975006,84.875000,6.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0,1003.099976,0.0,0.0,990.550049,996.500000,0.0,0.100,79.925003,1002.500000,,0.0,0.0,0.0,0.250,41.997501,3.05125,0.000,286.450012,99.224998,11634.799805,2.550,-2.350,0.925,0.0,0.0,0.0,1.0,154.0,2.0,1,0,0,19.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92946,4.400,1.27550,1456.574951,84010.148438,4.175,551.224976,0.0,272.024994,2.775,54774.000000,0.00,9934.575195,97.724998,24.0,0.0,0.0,0.3,0.0,0.0,0.50,1.0,1014.900024,0.0,0.0,999.049988,1005.275024,0.0,0.000,74.800003,1011.525024,,0.0,0.0,0.0,0.000,304.936493,-0.20150,0.000,274.924988,97.724998,25028.000000,4.075,3.600,1.875,0.0,130945.0,1.0,0.0,120.0,19.0,0,0,1,50.96
92947,4.400,1.27850,1476.349976,2206.800049,0.000,564.099976,0.0,271.950012,0.000,4984.049805,0.00,0.000000,95.449997,24.0,0.0,0.0,0.3,0.0,0.0,0.00,1.0,1014.849976,0.0,0.0,999.025024,1005.250000,0.0,0.000,76.974998,1011.549988,,0.0,0.0,0.0,0.000,318.620483,-5.20400,0.000,274.575012,95.850006,23995.599609,3.600,2.950,2.125,0.0,134545.0,1.0,0.0,120.0,20.0,0,0,1,2.94
92948,4.400,1.27900,1516.300049,0.000000,0.000,578.700012,0.0,271.899994,0.000,0.000000,0.00,0.000000,93.925003,24.0,0.0,0.0,0.3,0.0,0.0,0.00,1.0,1014.650024,0.0,0.0,998.900024,1005.125000,0.0,0.000,77.724998,1011.400024,,0.0,0.0,0.0,0.000,332.780243,-8.98450,0.025,274.399994,95.925003,23068.599609,3.600,2.625,2.400,0.0,138145.0,1.0,0.0,120.0,21.0,0,0,1,0.00
92949,4.400,1.27975,1240.599976,0.000000,0.000,551.500000,0.0,271.950012,0.000,0.000000,0.00,0.000000,98.375000,24.0,0.0,0.0,0.3,0.0,0.0,0.00,1.0,1014.500000,0.0,0.0,998.724976,1004.974976,0.0,0.000,79.400002,1011.224976,,0.0,0.0,0.0,0.075,347.373230,-11.27050,0.125,274.225006,99.425003,11856.700195,3.275,2.325,2.325,0.0,141745.0,1.0,0.0,120.0,22.0,0,0,1,-0.00


In [56]:
# Summary statistics for the columns of the combined training frame.
df00.describe()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,calculated_ago,estimated,observed,day_of_year,hour,A,B,C,pv_measurement
count,92951.0,92951.0,76534.0,92951.0,92951.0,86213.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,4213.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0,92951.0
mean,6.017394,1.255435,2888.300781,515155.2,143.098022,1735.995239,0.006799,275.237946,39.395199,141818.5,50.245178,180885.9,67.086441,11.401738,0.117096,0.009727,0.231608,0.029244,0.058533,0.483303,0.564284,1009.502441,0.005657,0.084348,995.818848,1001.949585,0.747828,0.009566,73.670586,1008.107666,250.0,0.193164,2.2e-05,0.000273,0.090299,179.648544,-1.206875,0.056897,279.430634,73.692551,33025.015625,3.038167,0.664565,0.685095,-8e-06,11896.119375,0.117535,0.882465,176.23748,11.501339,0.36644,0.353079,0.280481,287.232321
std,2.711861,0.036567,2536.682617,818571.6,227.959961,1809.297363,0.234531,6.829573,60.518574,215289.8,112.91716,401263.1,34.269562,7.877236,0.779764,0.104943,1.223007,0.275208,0.47704,0.485974,0.483166,13.085625,0.029169,0.330071,13.004988,13.063836,5.298545,0.041121,14.229107,13.124816,0.0,1.253925,0.004181,0.004249,0.237841,97.282532,23.970707,0.105794,6.515625,34.021942,17913.982422,1.760291,2.802007,1.878808,0.005994,33700.371188,0.322058,0.322058,109.498583,6.920153,0.481834,0.477929,0.449237,766.670114
min,0.5,1.13925,27.799999,0.0,0.0,27.5,-1.0,247.425003,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,944.375,0.0,0.0,929.974976,935.75,0.0,0.0,19.575001,941.549988,250.0,0.0,0.0,0.0,0.0,6.983,-49.931999,0.0,258.024994,0.0,132.375,0.025,-7.225,-8.4,-0.1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.0
25%,4.025,1.23025,1087.606201,0.0,0.0,591.924988,0.0,270.75,0.0,0.0,0.0,0.0,42.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1001.400024,0.0,0.0,987.775024,993.849976,0.0,0.0,64.199997,999.974976,250.0,0.0,0.0,0.0,0.0,94.678749,-18.599751,0.0,274.899994,53.224998,16862.799805,1.675,-1.35,-0.575,0.0,0.0,0.0,1.0,78.0,6.0,0.0,0.0,0.0,0.0
50%,5.45,1.255,1887.887451,10476.17,1.65,1164.525024,0.0,274.975006,0.925,10513.03,0.0,0.0,79.949997,7.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,1010.349976,0.0,0.0,996.75,1002.849976,0.0,0.0,76.0,1009.0,250.0,0.0,0.0,0.0,0.0,179.979752,-0.8645,0.0,278.650024,93.050003,36846.175781,2.7,0.3,0.725,0.0,0.0,0.0,1.0,171.0,12.0,0.0,0.0,0.0,0.0
75%,7.825,1.2785,3988.412598,797531.5,216.800003,2079.25,0.0,280.5,65.275002,234580.6,29.299999,114297.0,98.637497,24.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1018.549988,0.0,0.0,1004.924988,1011.049988,0.0,0.0,85.050003,1017.200012,250.0,0.0,0.0,0.0,0.1,264.419983,15.25075,0.1,283.950012,99.900002,48308.988281,4.05,2.5,1.875,0.0,0.0,0.0,1.0,276.0,17.0,1.0,1.0,1.0,173.3625
max,17.35,1.441,12294.901367,2990596.0,835.650024,11673.724609,1.0,293.625,334.75,1198315.0,683.400024,2441810.0,100.0,24.0,37.474998,7.25,37.474998,20.325001,33.375,1.0,1.0,1044.099976,0.6225,5.0,1030.875,1037.25,96.775002,1.1,100.0,1043.724976,250.0,18.200001,1.0,0.18,5.65,348.487518,49.943748,1.375,303.25,100.0,75489.328125,13.275,11.2,8.825,0.1,145347.0,1.0,1.0,366.0,23.0,1.0,1.0,1.0,5733.42


In [57]:
# Combined training frame that is passed to the model fit below.
# NOTE(review): this is the same call that produced df00 in an earlier cell,
# so the whole pipeline runs a second time here.
A_B_C_all = pipe.get_combined_data()


In [58]:
# Preview the first rows of the training frame.
A_B_C_all.head(30)

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,calculated_ago,estimated,observed,day_of_year,hour,A,B,C,pv_measurement
0,7.7,1.22825,1728.949951,0.0,0.0,1728.949951,0.0,280.299988,0.0,0.0,0.0,0.0,99.074997,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1006.299988,0.0,0.0,993.75,999.775024,0.0,0.0,71.674995,1005.799988,,0.0,0.0,0.0,0.175,348.036743,-3.77425,0.0,286.225006,100.0,40386.476562,3.6,-3.575,-0.5,0.0,0.0,0.0,1.0,153.0,22.0,1,0,0,0.0
1,7.7,1.2235,1689.824951,0.0,0.0,1689.824951,0.0,280.299988,0.0,0.0,0.0,0.0,99.75,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1005.200012,0.0,0.0,992.674988,998.650024,0.0,0.025,68.0,1004.650024,,0.0,0.0,0.0,0.2,91.980751,-4.35725,0.0,286.899994,100.0,33770.648438,3.35,-3.35,0.275,0.0,0.0,0.0,1.0,153.0,23.0,1,0,0,0.0
2,7.875,1.21975,1563.224976,0.0,0.0,1563.224976,0.0,280.649994,0.0,0.0,0.0,0.0,100.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1004.525024,0.0,0.0,992.0,997.974976,0.0,0.1,67.949997,1003.950012,,0.0,0.0,0.0,0.4,14.93475,-3.3095,0.0,286.950012,100.0,13595.5,3.05,-2.95,0.75,0.0,0.0,0.0,1.0,154.0,0.0,1,0,0,0.0
3,8.425,1.218,1283.425049,208.65,0.75,1283.425049,0.0,281.674988,0.3,526.775024,0.0,0.0,100.0,6.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,1004.025024,0.0,0.0,991.5,997.449951,0.0,0.125,73.875,1003.449951,,0.0,0.0,0.0,0.55,28.630251,-0.8225,0.0,286.75,100.0,2321.850098,2.725,-2.6,0.875,0.0,0.0,0.0,1.0,154.0,1.0,1,0,0,0.0
4,8.95,1.218,1003.5,32468.15,23.1,1003.5,0.0,282.5,11.975,22068.949219,0.15,282.975,84.875,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1003.099976,0.0,0.0,990.550049,996.5,0.0,0.1,79.925003,1002.5,,0.0,0.0,0.0,0.25,41.997501,3.05125,0.0,286.450012,99.224998,11634.799805,2.55,-2.35,0.925,0.0,0.0,0.0,1.0,154.0,2.0,1,0,0,19.36
5,9.25,1.2165,809.375,179499.1,84.375,809.375,0.0,283.049988,45.125,102788.046875,6.3,11627.55,65.025002,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1002.349976,0.0,0.0,989.75,995.700012,0.0,0.0,82.849998,1001.674988,,0.0,0.0,0.0,0.1,55.041748,8.071,0.0,286.475006,94.800003,29848.199219,2.3,-2.2,0.8,0.0,0.0,0.0,1.0,154.0,3.0,1,0,0,251.02
6,9.525,1.213,757.775024,478117.8,186.649994,757.775024,0.0,283.524994,89.525002,242438.0,43.724998,90073.38,77.550003,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1001.825012,0.0,0.0,989.25,995.200012,0.0,0.0,82.275002,1001.174988,,0.0,0.0,0.0,0.0,67.898003,13.9565,0.0,286.825012,95.099998,35980.148438,2.325,-2.2,0.75,0.0,0.0,0.0,1.0,154.0,4.0,1,0,0,263.78
7,9.7,1.2075,705.650024,892667.9,311.525024,705.650024,0.0,283.799988,139.0,411381.90625,53.099998,174254.5,85.050003,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1001.200012,0.0,0.0,988.700012,994.599976,0.0,0.0,78.699997,1000.5,,0.0,0.0,0.0,0.0,80.821251,20.40625,0.0,287.325012,97.150002,37874.523438,2.225,-2.0,0.95,0.0,0.0,0.0,1.0,154.0,5.0,1,0,0,522.72
8,9.55,1.205,669.650024,1357902.0,442.75,669.650024,0.0,283.600006,167.100006,550960.5625,33.825001,156473.2,96.75,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1000.75,0.0,0.0,988.275024,994.200012,0.0,0.05,75.099998,1000.150024,,0.0,0.0,0.0,0.125,94.178497,27.096001,0.0,287.450012,99.199997,25470.25,2.35,-1.75,1.525,0.0,0.0,0.0,1.0,154.0,6.0,1,0,0,904.42
9,9.45,1.205,662.224976,1821739.0,567.299988,662.224976,0.0,283.524994,217.699997,692633.125,17.6,92651.22,100.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1000.849976,0.0,0.0,988.349976,994.25,0.0,0.15,74.199997,1000.174988,,0.0,0.0,0.0,0.425,108.438499,33.660751,0.0,287.325012,100.0,2655.699951,2.7,-1.175,2.425,0.0,0.0,0.0,1.0,154.0,7.0,1,0,0,1238.82


In [59]:
# Combined test features for all three locations (no target column).
test_A_B_C = pipe.get_combined_data(test_data=True)

In [60]:


label = "pv_measurement"

# Wrap the training frame once and fit on that same object, so `train_data`
# is what the model actually sees rather than an unused variable.
train_data = TabularDataset(A_B_C_all)

# Regression on the target, scored by mean absolute error, using AutoGluon's
# "best_quality" preset.
predictor = TabularPredictor(label=label, eval_metric='mean_absolute_error').fit(
    train_data, presets="best_quality")


No path specified. Models will be saved in: "AutogluonModels/ag-20231106_173253/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231106_173253/"
AutoGluon Version:  0.8.2
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.5.0: Thu Jun  8 22:22:20 PDT 2023; root:xnu-8796.121.3~7/RELEASE_ARM64_T6000
Disk Space Avail:   711.66 GB / 994.66 GB (71.5%)
Train Data Rows:    92951
Train Data Columns: 53
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, -0.0, 287.23232, 766.67011)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

In [61]:
# Predict the target for the combined test features.
predictions=predictor.predict(test_A_B_C)

In [71]:
# Submission frame: a running id followed by the model output.
predictions_df = pd.DataFrame({
    'prediction': predictions
})
predictions_df.insert(0, 'id', range(len(predictions_df)))

# compare_mae checks the frame's shape against the reference's single
# `prediction` column, so hand it only that column. Keep and print the result
# instead of discarding it.
mae_vs_reference = pipe.compare_mae(predictions_df[['prediction']])
print("MAE vs reference submission:", mae_vs_reference)

predictions_df



Unnamed: 0_level_0,id,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,-0.145134
1,1,-0.152983
2,2,0.016135
3,3,60.679718
4,4,298.935638
...,...,...
2155,2155,82.910408
2156,2156,55.217949
2157,2157,13.562857
2158,2158,2.465167


In [72]:
# Coerce the predictions to numeric (unparseable values become NaN) and
# raise anything below zero up to zero.
numeric_prediction = pd.to_numeric(
    predictions_df['prediction'], errors='coerce')
predictions_df['prediction'] = numeric_prediction.clip(lower=0)

predictions_df


Unnamed: 0_level_0,id,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0.000000
1,1,0.000000
2,2,0.016135
3,3,60.679718
4,4,298.935638
...,...,...
2155,2155,82.910408
2156,2156,55.217949
2157,2157,13.562857
2158,2158,2.465167


In [73]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
predictions_df.to_csv('predictions_new_pipeline1.csv', index=False)
