In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
# Plot function

def plot_feature(dataset:pd.DataFrame, featureName:str):
    """
    Draw a single feature against 'date_forecast' as a red line.

    A new figure with one axes (figsize=(20, 10)) is created on every call and the
    feature name is used as the plot title. Nothing is returned.

    :param dataset: DataFrame containing a 'date_forecast' column and `featureName`.
    :param featureName: Name of the column to plot.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))

    # Index by forecast time so pandas puts it on the x-axis.
    feature_by_time = dataset[['date_forecast', featureName]].set_index("date_forecast")
    feature_by_time.plot(ax=axis, title=featureName, color='red')

In [4]:
# Read data

# Training frames, one per location. 'Unnamed: 0' is used as the index column
# (presumably the index written out by an earlier to_csv call — TODO confirm).
data_A, data_B, data_C = [
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in (
        "current_csv_files/data_A_fix.csv",
        "current_csv_files/data_B_fix.csv",
        "current_csv_files/data_C_fix.csv",
    )
]

# Test frames, same layout.
test_A, test_B, test_C = [
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in (
        "current_csv_files/X_test_A_fix.csv",
        "current_csv_files/X_test_B_fix.csv",
        "current_csv_files/X_test_C_fix.csv",
    )
]

In [5]:
# Mark estimated data

# Timestamps present in each location's X_train_estimated parquet file.
est_dates_A, est_dates_B, est_dates_C = [
    pd.to_datetime(pd.read_parquet(parquet_path)['date_forecast'])
    for parquet_path in (
        '../A/X_train_estimated.parquet',
        '../B/X_train_estimated.parquet',
        '../C/X_train_estimated.parquet',
    )
]

# est = 1 for training rows whose forecast timestamp appears in the estimated set, else 0.
data_A, data_B, data_C = [
    frame.assign(est=pd.to_datetime(frame['date_forecast']).isin(est_dates).astype(int))
    for frame, est_dates in (
        (data_A, est_dates_A),
        (data_B, est_dates_B),
        (data_C, est_dates_C),
    )
]

# Every test row is flagged as estimated.
test_A, test_B, test_C = [frame.assign(est=1) for frame in (test_A, test_B, test_C)]

In [6]:
# Create full dataset

# train = 1 marks rows from the training frames ...
data_A, data_B, data_C = [frame.assign(train=1) for frame in (data_A, data_B, data_C)]

# ... train = 0 marks rows from the test frames, which also get an all-NaN
# 'pv_measurement' column so both frames share that column before stacking.
test_A, test_B, test_C = [
    frame.assign(train=0, pv_measurement=np.nan)
    for frame in (test_A, test_B, test_C)
]

# Stack training rows on top of test rows with a fresh 0..n-1 index.
data_test_A, data_test_B, data_test_C = [
    pd.concat([train_frame, test_frame], ignore_index=True)
    for train_frame, test_frame in (
        (data_A, test_A),
        (data_B, test_B),
        (data_C, test_C),
    )
]

In [7]:
# Add location

# One-hot location flags: each frame gets a 1 in its own column and 0 in the other two.
data_test_A = data_test_A.assign(A=1, B=0, C=0)
data_test_B = data_test_B.assign(A=0, B=1, C=0)
data_test_C = data_test_C.assign(A=0, B=0, C=1)


In [8]:
# Precip type to category
# Names for the precipitation types. NOTE(review): presumably list position matches
# the integer code in 'precip_type_5min:idx' — confirm against the data provider's docs.
categories = [
    'precip_none',
    'precip_rain',
    'precip_rain_Snow',
    'precip_snow',
    'precip_sleet',
    'precip_freezing_rain',
    'precip_hail',
]

def add_precip_category(df, categories):
    """
    One-hot encode the 'precip_type_5min:idx' column.

    The column is cast to int and one indicator column named 'precip_type_<code>'
    is appended for every code that actually occurs in `df`. Codes that do not
    occur produce no column, so different frames may end up with different
    'precip_type_*' columns.

    The input frame is left untouched; a new DataFrame is returned.

    :param df: Pandas DataFrame containing 'precip_type_5min:idx'.
    :param categories: Currently unused; kept so existing callers keep working.
    :return: New DataFrame with the int-cast source column and the indicator columns appended.
    :raises KeyError: If 'precip_type_5min:idx' is not a column of `df`.
    """
    # Cast on a separate Series instead of writing back into the caller's frame.
    precip_idx = df['precip_type_5min:idx'].astype(int)

    # get_dummies may return booleans; the cast keeps the indicators as 0/1 integers.
    dummies = pd.get_dummies(precip_idx, prefix='precip_type').astype(int)

    # assign() copies the frame, so the int-cast column only appears in the result.
    encoded_source = df.assign(**{'precip_type_5min:idx': precip_idx})
    return pd.concat([encoded_source, dummies], axis=1)

# Apply the function
# One-hot encode the precipitation type, then discard the raw code column.
data_test_A, data_test_B, data_test_C = [
    add_precip_category(frame, categories).drop(columns='precip_type_5min:idx')
    for frame in (data_test_A, data_test_B, data_test_C)
]

In [9]:
# Dates columns

def extract_dates(df):
    """
    Parse 'date_forecast' and add calendar-part columns.

    The frame is modified in place: 'date_forecast' is replaced by its datetime
    version and 'year', 'month' and 'day' columns are added (in that order).

    :param df: Pandas DataFrame with a 'date_forecast' column.
    :return: The same DataFrame object that was passed in.
    """
    df['date_forecast'] = pd.to_datetime(df['date_forecast'])

    forecast_dt = df['date_forecast'].dt
    for part in ('year', 'month', 'day'):
        df[part] = getattr(forecast_dt, part)

    return df

data_test_A, data_test_B, data_test_C = [
    extract_dates(frame) for frame in (data_test_A, data_test_B, data_test_C)
]



In [10]:
# Add daily and annual sinus curve

def hour_func(x):
    """
    Map an hour value to a cosine with a 24-hour period.

    Despite the 'sinus' naming used by the callers, this is a cosine: it equals 1
    at x = 0 and -1 at x = 12.

    :param x: Hour value(s); scalars, NumPy arrays and pandas Series all work.
    :return: cos(2*pi/24 * x), same shape as `x`.
    """
    radians_per_hour = 2*np.pi/24
    return np.cos(radians_per_hour * x)

# 24-hour periodic feature computed from the hour of each forecast timestamp.
data_test_A, data_test_B, data_test_C = [
    frame.assign(daily_sinus=hour_func(frame["date_forecast"].dt.hour))
    for frame in (data_test_A, data_test_B, data_test_C)
]

#plot_feature(data_test_A.iloc[:40], 'daily_sinus')

def day_func(x):
    """
    Map a day-of-year value to a cosine with a 365-day period that peaks at day 173.

    NOTE(review): day 173 is presumably meant to line up with the summer solstice;
    the fixed 365-day period ignores leap years — confirm both are acceptable.

    :param x: Day-of-year value(s); scalars, NumPy arrays and pandas Series all work.
    :return: cos(2*pi/365 * x - 2*pi/365 * 173), same shape as `x`.
    """
    radians_per_day = 2*np.pi/365
    return np.cos(radians_per_day * x - radians_per_day * 173)

# Yearly periodic feature computed from the day of year of each forecast timestamp.
data_test_A, data_test_B, data_test_C = [
    frame.assign(annual_sinus=day_func(frame["date_forecast"].dt.dayofyear))
    for frame in (data_test_A, data_test_B, data_test_C)
]

#plot_feature(data_test_A.iloc[8000:10000], 'annual_sinus')

In [11]:
# Mark weird cloud data as category

# bad_cloud_data = 1 for every row forecast strictly before 2020-03-26 00:00:00, else 0.
data_test_A, data_test_B, data_test_C = [
    frame.assign(
        bad_cloud_data=(frame['date_forecast'] < pd.to_datetime('2020-03-26 00:00:00')).astype(int)
    )
    for frame in (data_test_A, data_test_B, data_test_C)
]

In [12]:
# Clip sun elevation. When sun elevation is below ca. -5, no pv_measurement is recorded

# Negative sun elevations are replaced by 0.
# NOTE(review): the cell heading mentions roughly -5, but the threshold applied here
# is 0 — confirm which one is intended.
data_test_A, data_test_B, data_test_C = [
    frame.assign(**{'sun_elevation:d': frame['sun_elevation:d'].clip(lower=0)})
    for frame in (data_test_A, data_test_B, data_test_C)
]

In [13]:
# Create mask for open sky, clip negative values for clouds

# open_sky = 1 where the cloud base is negative (taken from the values before clipping);
# the cloud base itself is then floored at 0.
# NOTE(review): negative values presumably act as a "no cloud" marker — confirm upstream.
data_test_A, data_test_B, data_test_C = [
    frame.assign(
        open_sky=(frame['cloud_base_agl:m_y'] < 0).astype(int),
        **{'cloud_base_agl:m_y': frame['cloud_base_agl:m_y'].clip(lower=0)},
    )
    for frame in (data_test_A, data_test_B, data_test_C)
]


In [14]:
# Create lag and lead for direct sun rad

def create_lag_average_column(df, column_name, lag_window):
    """
    Add a trailing rolling mean of one column, stored as '<column_name>_lag_avg'.

    Each value is the mean of the current row and up to `lag_window - 1` rows before
    it. Near the top of the frame, where fewer rows exist, whatever is available is
    averaged (min_periods=1). The window counts rows, not elapsed time. The frame is
    modified in place.

    :param df: Pandas DataFrame.
    :param column_name: The name of the column for which the lagged average is calculated.
    :param lag_window: Number of rows in the rolling window (current row included).
    :return: The same DataFrame, with the new lag average column added.
    :raises ValueError: If `column_name` is not a column of `df`.
    """
    if column_name in df.columns:
        trailing_window = df[column_name].rolling(window=lag_window, min_periods=1)
        df[f'{column_name}_lag_avg'] = trailing_window.mean()
        return df

    raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

# Number of rows (current row included) averaged for the trailing mean.
lag_window = 10

data_test_A, data_test_B, data_test_C = [
    create_lag_average_column(frame, 'direct_rad:W', lag_window)
    for frame in (data_test_A, data_test_B, data_test_C)
]

#data_test_A = create_lag_average_column(data_test_A, 'diffuse_rad:W', lag_window)
#data_test_B = create_lag_average_column(data_test_B, 'diffuse_rad:W', lag_window)
#data_test_C = create_lag_average_column(data_test_C, 'diffuse_rad:W', lag_window)


In [15]:
# Continuing with lead

def create_lead_average_column(df, column_name, lead_window):
    """
    Add a forward-looking rolling mean of one column, stored as '<column_name>_lead_avg'.

    Each value is the mean of the current row and up to `lead_window - 1` rows after
    it. Near the bottom of the frame, where fewer rows exist, whatever is available
    is averaged (min_periods=1). The window counts rows, not elapsed time. With
    `lead_window=1` the new column simply equals the source column. The frame is
    modified in place.

    :param df: Pandas DataFrame.
    :param column_name: The name of the column for which the lead average is calculated.
    :param lead_window: Number of rows in the rolling window (current row included).
    :return: The same DataFrame, with the new lead average column added.
    :raises ValueError: If `column_name` is not a column of `df`.
    """
    # Ensure that the DataFrame has the specified column
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    # A trailing window over the reversed column is a leading window over the original.
    # Only the one column is reversed; there is no need to copy the whole frame.
    reversed_values = df[column_name].iloc[::-1]
    reversed_avg = reversed_values.rolling(window=lead_window, min_periods=1).mean()

    # Flip back and assign by position so the index of `df` plays no role.
    df[f'{column_name}_lead_avg'] = reversed_avg.iloc[::-1].to_numpy()

    return df

# NOTE(review): a window of 1 (with min_periods=1) averages each row with nothing else,
# so 'direct_rad:W_lead_avg' ends up equal to 'direct_rad:W' — confirm this is intended.
lead_window = 1

data_test_A, data_test_B, data_test_C = [
    create_lead_average_column(frame, 'direct_rad:W', lead_window)
    for frame in (data_test_A, data_test_B, data_test_C)
]

#data_test_A = create_lead_average_column(data_test_A, 'diffuse_rad:W', lead_window)
#data_test_B = create_lead_average_column(data_test_B, 'diffuse_rad:W', lead_window)
#data_test_C = create_lead_average_column(data_test_C, 'diffuse_rad:W', lead_window)

In [16]:
# Create central derivative column

def create_central_difference_column(df, column_name, new_column_name):
    """
    Creates a new column in the DataFrame which is the central difference of the specified column with respect
    to the previous and next row: (next - previous) / 2. For the first and last rows, forward and backward
    differences are used.

    Rows are addressed by position, so the function works with any index (not only a 0-based RangeIndex).
    With fewer than two rows no difference can be formed and the new column is left as NaN.
    Differences are taken between adjacent rows only; no check is made that neighbouring rows are
    contiguous in time. The frame is modified in place.

    :param df: Pandas DataFrame.
    :param column_name: The name of the column for which the central difference is calculated.
    :param new_column_name: The name of the new column to store the central difference.
    :return: DataFrame with the new central difference column added.
    :raises ValueError: If `column_name` is not a column of `df`.
    """

    # Ensure that the DataFrame has the specified column
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    values = df[column_name]

    # Interior rows: central difference. shift() leaves NaN at both ends.
    central = (values.shift(-1) - values.shift(1)) / 2

    # The one-sided edge differences need at least two rows.
    if len(values) >= 2:
        # First row: forward difference
        central.iloc[0] = values.iloc[1] - values.iloc[0]
        # Last row: backward difference
        central.iloc[-1] = values.iloc[-1] - values.iloc[-2]

    df[new_column_name] = central

    return df

data_test_A, data_test_B, data_test_C = [
    create_central_difference_column(frame, 'direct_rad:W', 'direct_rad_CD')
    for frame in (data_test_A, data_test_B, data_test_C)
]

In [17]:
# Create difference between columns

def create_difference_column(df, column1, column2, new_column_name):
    """
    Store `column1 - column2` in a new column of the DataFrame (modified in place).

    :param df: Pandas DataFrame.
    :param column1: Name of the column to subtract from (minuend).
    :param column2: Name of the column being subtracted (subtrahend).
    :param new_column_name: Name of the column that receives the difference.
    :return: The same DataFrame, with the new difference column added.
    :raises ValueError: If `column1` or `column2` is missing; `column1` is checked first.
    """
    for required in (column1, column2):
        if required not in df.columns:
            raise ValueError(f"Column '{required}' not found in the DataFrame.")

    minuend = df[column1]
    subtrahend = df[column2]
    df[new_column_name] = minuend - subtrahend

    return df


#data_test_A = create_difference_column(data_test_A, 'is_day:idx', 'is_in_shadow:idx', 'diff_day_shadow')


In [18]:
# Drop useless columns
# NOTE: the 'precip_type_<n>' columns come from pd.get_dummies, so each one only
# exists if that code occurred in the frame's data; drop() raises KeyError otherwise.
useless_col = [
    'elevation:m',
    'fresh_snow_12h:cm', 'fresh_snow_24h:cm',
    'precip_type_2', 'precip_type_3', 'precip_type_5', 'precip_type_6',
    'pressure_50m:hPa', 'msl_pressure:hPa', 'sfc_pressure:hPa',
    'cloud_base_agl:m_x',
]

data_test_A, data_test_B, data_test_C = [
    frame.drop(columns=useless_col)
    for frame in (data_test_A, data_test_B, data_test_C)
]

# C needs to drop this as well
data_test_C = data_test_C.drop(columns='precip_type_4')


In [19]:
# Split dataset by labeled and non-labeled

# Rows that came from the training frames (train == 1).
data_A_done, data_B_done, data_C_done = [
    frame.loc[frame['train'] == 1]
    for frame in (data_test_A, data_test_B, data_test_C)
]

# Rows that came from the test frames (train == 0).
test_A_done, test_B_done, test_C_done = [
    frame.loc[frame['train'] == 0]
    for frame in (data_test_A, data_test_B, data_test_C)
]

In [20]:
# Drop NaN pv_measurement

# The test rows only ever carried a NaN placeholder target, so remove that column.
test_A_done, test_B_done, test_C_done = [
    frame.drop(columns='pv_measurement')
    for frame in (test_A_done, test_B_done, test_C_done)
]

In [21]:
# Save files

# Write every frame to its CSV file (the index is written too, as to_csv does by default).
# The order matches the original statement order: combined, training, then test frames.
for _frame, _csv_path in (
    (data_test_A, "current_csv_files/data_test_A.csv"),
    (data_test_B, "current_csv_files/data_test_B.csv"),
    (data_test_C, "current_csv_files/data_test_C.csv"),
    (data_A_done, "current_csv_files/data_A.csv"),
    (data_B_done, "current_csv_files/data_B.csv"),
    (data_C_done, "current_csv_files/data_C.csv"),
    (test_A_done, "current_csv_files/test_A.csv"),
    (test_C_done, "current_csv_files/test_C.csv"),
    (test_B_done, "current_csv_files/test_B.csv"),
):
    _frame.to_csv(_csv_path)