<h1>Dora the Explorer: consecutive measurements</h1>

In [14]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Show up to 200 rows and never truncate columns when displaying frames.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

In [16]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
import os

# Parent of the working directory. os.path.join supplies the separator that
# plain string concatenation dropped (the old value looked like "<cwd>../"),
# and normpath collapses the trailing "..".
current_dir = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
print("Current working directory:", current_dir)


# Root of the data folders (A/, B/, C/), given relative to this notebook so the
# cell does not depend on any one person's absolute path.
PATH = "../../"

# Estimated training features, one frame per location
X_train_estimated_a: pd.DataFrame = pd.read_parquet(PATH + "A/X_train_estimated.parquet")
X_train_estimated_b: pd.DataFrame = pd.read_parquet(PATH + "B/X_train_estimated.parquet")
X_train_estimated_c: pd.DataFrame = pd.read_parquet(PATH + "C/X_train_estimated.parquet")

# Estimated test features
X_test_estimated_a: pd.DataFrame = pd.read_parquet(PATH + "A/X_test_estimated.parquet")
X_test_estimated_b: pd.DataFrame = pd.read_parquet(PATH + "B/X_test_estimated.parquet")
X_test_estimated_c: pd.DataFrame = pd.read_parquet(PATH + "C/X_test_estimated.parquet")

# Observed training features
X_train_observed_a: pd.DataFrame = pd.read_parquet(PATH + "A/X_train_observed.parquet")
X_train_observed_b: pd.DataFrame = pd.read_parquet(PATH + "B/X_train_observed.parquet")
X_train_observed_c: pd.DataFrame = pd.read_parquet(PATH + "C/X_train_observed.parquet")

# Training targets
Y_train_observed_a: pd.DataFrame = pd.read_parquet(PATH + "A/train_targets.parquet")
Y_train_observed_b: pd.DataFrame = pd.read_parquet(PATH + "B/train_targets.parquet")
Y_train_observed_c: pd.DataFrame = pd.read_parquet(PATH + "C/train_targets.parquet")

# Example test file and a previously saved submission
test_df_example = pd.read_csv(PATH + "test.csv")

best_submission: pd.DataFrame = pd.read_csv(PATH + "mikael/submissions/fourth_submission.csv")

# Plain-dict form of the preprocessing options (same keys and values as the
# defaults of the Options class defined below).
optins = dict(
    randomize=False,
    consecutive_threshold=6,
    normalize=False,
    group_by_hour=True,
    unzip_date_feature=True,
)

# Make an Options class with the options as attributes.


class Options:
    """Bundle of preprocessing switches.

    Every setting exists twice: as a class-level default (readable without an
    instance) and as an instance attribute of the same name set in __init__.
    """

    # Class-level defaults.
    randomize = False
    consecutive_threshold = 6
    normalize = False
    group_by_hour = True
    unzip_date_feature = True

    def __init__(
        self,
        randomize=False,
        consecutive_threshold=6,
        normalize=False,
        group_by_hour=True,
        unzip_date_feature=True,
    ) -> None:
        """Store each argument on the instance under its own name."""
        (
            self.randomize,
            self.consecutive_threshold,
            self.normalize,
            self.group_by_hour,
            self.unzip_date_feature,
        ) = (
            randomize,
            consecutive_threshold,
            normalize,
            group_by_hour,
            unzip_date_feature,
        )


class Pipeline:

    def __init__(self):
        pass

    def get_combined_data(self, test_data=False):
        locations = ["A", "B", "C"]
        dfs = []
        for index, location in enumerate(locations):
            if test_data:
                dfs.append(self.get_test_data(location))
            else:
                dfs.append(self.get_data(location))

            dfs[index] = self.onehot_location(dfs[index], location)
        df = pd.concat(dfs).reset_index(drop=True)

        if test_data:
            return df
        return df[[c for c in df if c not in ['pv_measurement']] +  # pv measurement is the target and is at the end columns
                  ['pv_measurement']]

    def get_data(self, location: str) -> pd.DataFrame:
        train, targets = self.get_training_data_by_location(location)
        return self.handle_data(train, targets)

    def get_test_data(self, location: str) -> pd.DataFrame:
        test_data = self.get_test_data_by_location(location)
        return self.handle_data(test_data)

    def handle_data(self, df, targets=pd.DataFrame()):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["date_forecast"] = pd.to_datetime(df["date_forecast"])

        # df = self.add_time_since_calucation(df)
        df = self.onehot_estimated(df)
        df = self.unzip_date_feature(df)
        df = self.grouped_by_hour(df)

        df["time"] = df["date_forecast"]
        df.drop(["date_forecast"], axis=1, inplace=True)

        # denne kjører bare når vi prossessere train data (med targets som parameter)
        if not targets.empty:
            df = self.merge_train_target(df, targets)

        df.drop(["time"], axis=1, inplace=True)
        df = self.absolute_values(df)
        return df

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– helper functions ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

    def get_training_data_by_location(self, location):
        if location == "A":
            X_train_observed_x = X_train_observed_a
            X_train_estimated_x = X_train_estimated_a
            Y_train_x = Y_train_observed_a
        elif location == "B":
            X_train_observed_x = X_train_observed_b
            X_train_estimated_x = X_train_estimated_b
            Y_train_x = Y_train_observed_b
        elif location == "C":
            X_train_observed_x = X_train_observed_c
            X_train_estimated_x = X_train_estimated_c
            Y_train_x = Y_train_observed_c
        else:
            raise Exception("location must be A, B or C")
        train = pd.concat(
            [X_train_observed_x, X_train_estimated_x]).reset_index(drop=True)
        return train, Y_train_x

    def get_test_data_by_location(self, location: str,  normalize=False) -> pd.DataFrame:
        if location == "A":
            df = X_test_estimated_a
        elif location == "B":
            df = X_test_estimated_b
        elif location == "C":
            df = X_test_estimated_c
        else:
            raise Exception("location must be A, B or C")
        return df.copy()

    def unzip_date_feature(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df[date_column] = pd.to_datetime(df[date_column])
        df["day_of_year"] = df["date_forecast"].dt.day_of_year
        df["hour"] = df["date_forecast"].dt.hour
        # df["month"] = df["date_forecast"].dt.month
        return df

    def add_time_since_calucation(self, df):
        df["date_calc"] = pd.to_datetime(df["date_calc"])
        df["calculated_ago"] = (
            df["date_forecast"] - df["date_calc"]).dt.total_seconds()
        df["calculated_ago"] = df["calculated_ago"].fillna(
            0)
        return df

    def onehot_estimated(self, df):
        df["estimated"] = 0  # Initialize both columns to 0
        df["observed"] = 0
        estimated_mask = df["date_calc"].notna()
        df.loc[estimated_mask, "estimated"] = 1
        df.loc[~estimated_mask, "observed"] = 1
        return df

    def onehot_location(self, df, location):
        if location == "A":
            df["A"], df["B"], df["C"] = 1, 0, 0
        elif location == "B":
            df["A"], df["B"], df["C"] = 0, 1, 0
        elif location == "C":
            df["A"], df["B"], df["C"] = 0, 0, 1
        return df

    def grouped_by_hour(self, df: pd.DataFrame, date_column: str = "date_forecast"):
        df = df.groupby(pd.Grouper(key=date_column, freq="1H")
                        ).mean(numeric_only=True)
        all_nan_mask = df.isnull().all(axis=1)
        df = df[~all_nan_mask]
        return df.reset_index()

    def merge_train_target(self, x, y):
        # henning får med alle pv measurments selv om han merger på inner time. Fordi resample fyller nan rows for alle timer som ikke er i datasettet.
        merged = pd.merge(x, y, on="time", how="right")
        mask = merged["pv_measurement"].notna()
        merged = merged.loc[mask].reset_index(drop=True)
        return merged

    def absolute_values(self, df: pd.DataFrame):
        df[df.columns] = df[df.columns].abs()
        df = df.replace(-0.0, 0.0)
        return df

    def remove_consecutive_measurments(self, df: pd.DataFrame, consecutive_threshold=6, consecutive_threshold_for_zero=12, return_removed=False):
        if consecutive_threshold < 2:
            return df

        column_to_check = 'pv_measurement'
        mask = (df[column_to_check] != df[column_to_check].shift(2)).cumsum()

        df['consecutive_group'] = df.groupby(
            mask).transform('count')[column_to_check]
        mask = (df['consecutive_group'] > consecutive_threshold)
        mask_zero = (df['consecutive_group'] > consecutive_threshold_for_zero) & (
            df[column_to_check] == 0)

        if return_removed:
            return df[mask | mask_zero]

        df.drop(columns=["consecutive_group"], inplace=True)

        df = df.loc[~mask]
        df = df.loc[~mask_zero]
        return df.reset_index(drop=True)

    def compare_mae(self, df: pd.DataFrame):
        """Mean absolute error between ``df`` and a stored reference submission.

        The reference is read from disk on every call and reduced to its
        ``prediction`` column, so ``df`` must have the same shape as that
        single-column frame.

        Raises:
            Exception: when the shapes differ (both shapes are printed first).
        """
        reference: pd.DataFrame = pd.read_csv(
            PATH+"mats/submissions/big_gluon_best.csv")[["prediction"]]

        if reference.shape == df.shape:
            return mean_absolute_error(
                reference["prediction"], df["prediction"])

        print("best_submission", reference.shape)
        print("df", df.shape)
        raise Exception("Dataframe shape must be the same")

    def split_train_tune(self, df: pd.DataFrame):
        df = df.copy()
        df_estimated = df.loc[df["estimated"] == 1]
        df_observed = df.loc[df["estimated"] == 0]

        num_rows = len(df_estimated)
        middle_index = num_rows // 2

        df_estimated.sample(frac=1, random_state=42)
        train_estimated = df.iloc[:middle_index]
        tune = df.iloc[middle_index:]

        train = pd.concat([df_observed, train_estimated])
        return train, tune

    def post_processing(self, df: pd.DataFrame, prediction_column: str = "prediction_label"):
        df = df[[prediction_column]].rename(
            columns={prediction_column: "prediction"}).reset_index(drop=True).rename_axis(index="id")

        df["prediction"] = df["prediction"].clip(lower=0)
        return df


# pipe = Pipeline()
# df00 = pipe.get_combined_data()
# df00


# pipin = Pipin()
# x = pipin.get_combined_datasets(data_sets={"A"})

# get all date_calc.rows that are nan


# pipin.compare_mae(pd.DataFrame({"prediction": [1,2,3,4,5]}))

# print("df", big_data.head())
# pipin = Pipin()
# test = pipin.get_combined_test_data()
# pipin.get_data("B")

# Column names selected as the important features, grouped by theme.
important_features = [
    "time",

    # radiation and sun position
    "direct_rad:W",
    "diffuse_rad:W",
    "sun_azimuth:d",
    "sun_elevation:d",
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",

    # clouds, precipitation and wind
    "total_cloud_cover:p",
    "effective_cloud_cover:p",
    "rain_water:kgm2",
    "precip_5min:mm",
    "wind_speed_10m:ms",
    "wind_speed_w_1000hPa:ms",
    "super_cooled_liquid_water:kgm2",

    # air density, pressure and dew point
    "air_density_2m:kgm3",
    "pressure_100m:hPa",
    "pressure_50m:hPa",
    "sfc_pressure:hPa",
    "msl_pressure:hPa",
    "dew_point_2m:K",

    # day/shadow indicators and elevation
    "is_day:idx",
    "is_in_shadow:idx",
    "elevation:m",

    # snow
    "snow_melt_10min:mm",
    "snow_density:kgm3",
    "fresh_snow_6h:cm",
    "fresh_snow_1h:cm",
    "snow_water:kgm2",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_24h:cm",
    "snow_depth:cm",

    # one-hot location and estimated/observed flags
    "A",
    "B",
    "C",
    "estimated",
    "observed",
]

pipe = Pipeline()
%load_ext autoreload

Current working directory: /Users/matsalexander/Desktop/Forest Gump/final_submission/mats_explore_to_merge../
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Build the combined training frame for locations A, B and C and show its shape.
# Earlier variant of this call, kept for reference:
# all_data = pipe.get_combined_datasets(offset_years=True)
all_data = pipe.get_combined_data()
all_data.shape

(93024, 53)

In [18]:
# Processed training frames for locations A, B and C, in that order.
df1_0, df2_0, df3_0 = (pipe.get_data(site) for site in ("A", "B", "C"))

In [19]:
def diff_between_dataframes(df1, df2):
    """Return the rows that occur in only one of the two frames.

    Rows are compared on the columns of ``df1`` (excluding any ``source``
    column). The result carries a ``source`` column saying which frame each
    row came from ('df1' or 'df2') and has the index of the concatenated
    frame. Neither input is modified.

    Changes from the previous version: the inputs no longer gain a ``source``
    column, ``source`` is no longer part of the comparison (it made rows from
    different frames never match), and the non-matching rows are returned
    instead of the duplicated ones.

    Note: a row repeated inside a single frame also counts as matched and is
    therefore left out of the result.
    """
    compare_columns = [column for column in df1.columns if column != 'source']

    # Tag copies with their origin, then stack them.
    merged_df = pd.concat(
        [df1.assign(source='df1'), df2.assign(source='df2')], ignore_index=True)

    # keep=False marks every member of a duplicate group, so the negation
    # leaves exactly the rows without a counterpart.
    has_match = merged_df.duplicated(subset=compare_columns, keep=False)
    diff_df = merged_df[~has_match]

    return diff_df

In [20]:
# Optional cleaning step, currently disabled:
# df1_0 = pipe.remove_consecutive_measurments(df1_0, 4, 24)
# df2_0 = pipe.remove_consecutive_measurments(df2_0, 4, 24)
# df3_0 = pipe.remove_consecutive_measurments(df3_0, 4, 24)
print(*(frame.shape for frame in (df1_0, df2_0, df3_0)))

(34085, 50) (32844, 50) (26095, 50)


In [21]:
# Column subsets used further down when inspecting flagged rows:
# relevant_columns includes the run length, relevant_columns2 the day of year.
relevant_columns = [ "pv_measurement", "consecutive_group", "direct_rad:W", "hour"]
relevant_columns2 = [ "pv_measurement", "direct_rad:W", "hour", "day_of_year"]
# (An earlier experiment that split all_data into kept/removed frames and
# printed both shapes used to live here.)


In [22]:
# Synthetic series: every even number in 0..99 replaced by 0, odd numbers kept,
# padded with five zeros in front and four behind.

numbers = [c if c % 2 else 0 for c in range(100)]
numbers = [0] * 5 + numbers + [0] * 4

od = pd.DataFrame(numbers, columns=["pv_measurement"])
# Rows of the combined data flagged as consecutive with both thresholds at 4.
conscs = pipe.remove_consecutive_measurments(all_data.copy(), 4, 4, return_removed=True)
print(conscs.shape)
# conscs[relevant_columns]

(47368, 54)


In [23]:
%autoreload
import sys
sys.path.append("../")
from pipeline import Pipeline as Pipin
pipe2 = Pipin()

In [24]:
mydf = df3_0
# Same synthetic zero/odd series as in the earlier cell.
numbers = [c if c % 2 else 0 for c in range(100)]
numbers = [0] * 5 + numbers + [0] * 4

od = pd.DataFrame(numbers, columns=["pv_measurement"])
# This should give the same result as Henning's implementation.
removed = pipe2.remove_consecutive_measurments_new_new(mydf.copy(), 3, 100, 100, return_removed=True) #5, 1000
print("removed",removed.shape)

# Flagged rows whose run length lies strictly between 0 and 30.
row_where_consecutive_group_is_between = removed[
    removed['consecutive_group'].gt(0) & removed['consecutive_group'].lt(30)]
see = row_where_consecutive_group_is_between[relevant_columns]
print(see.shape)

# Of those, the rows with direct radiation strictly between 0.1 and 10000.
outliers = see[see["direct_rad:W"].gt(0.1) & see["direct_rad:W"].lt(10000)]
print("som ikke har 0 rad",outliers.shape)
outliers

removed (3977, 52)
(129, 4)
som ikke har 0 rad (110, 4)


Unnamed: 0,pv_measurement,consecutive_group,direct_rad:W,hour
1703,9.8,4,65.5,10.0
1704,9.8,4,71.275002,11.0
1705,9.8,4,53.424999,12.0
2208,9.8,3,0.425,11.0
2400,9.8,3,18.35,11.0
2401,9.8,3,2.775,12.0
2424,9.8,3,3.375,11.0
2425,9.8,3,0.525,12.0
2592,9.8,3,0.925,11.0
2593,9.8,3,0.15,12.0


In [25]:
# Tally the outlier rows by their consecutive_group value.
value_counts = outliers['consecutive_group'].value_counts()

value_counts

3    85
4    20
6     5
Name: consecutive_group, dtype: int64

In [26]:
# Inspect the rows labelled 810 through 850 of mydf (.loc slicing includes
# both end labels), restricted to the columns in relevant_columns2.
sub = mydf.loc[810:850]
sub[relevant_columns2]

Unnamed: 0,pv_measurement,direct_rad:W,hour,day_of_year
810,0.0,0.0,2.0,281.0
811,0.0,0.0,3.0,281.0
812,0.0,0.0,4.0,281.0
813,0.0,0.0,5.0,281.0
814,29.4,7.75,6.0,281.0
815,137.2,70.5,7.0,281.0
816,225.4,153.024994,8.0,281.0
817,254.8,230.399994,9.0,281.0
818,372.4,271.125,10.0,281.0
819,323.4,279.25,11.0,281.0
