# Imports

In [1]:

import pandas as pd
import numpy as np
import tensorflow as tf

# Load datasets

In [2]:
X_train_estimated_a = pd.read_parquet('../data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('../data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('../data/C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('../data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('../data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('../data/C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('../data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('../data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('../data/C/X_test_estimated.parquet')

train_targets_a = pd.read_parquet('../data/A/train_targets.parquet')
train_targets_b = pd.read_parquet('../data/B/train_targets.parquet')
train_targets_c = pd.read_parquet('../data/C/train_targets.parquet')

# Data clean up

researching which columns have the most null values. 

In [3]:
for column in X_train_estimated_a.columns:
    null_c =  X_train_estimated_a[column].isna().sum()
    if null_c > 0: 
        print(f'Column: {column} has {null_c} NULL values')

for column in X_train_observed_a.columns:
    null_c = X_train_observed_a[column].isna().sum()
    if null_c > 0: 
        print(f'Column: {column} has {null_c} NULL values')

Column: ceiling_height_agl:m has 3919 NULL values
Column: cloud_base_agl:m has 2094 NULL values
Column: snow_density:kgm3 has 15769 NULL values
Column: ceiling_height_agl:m has 22247 NULL values
Column: cloud_base_agl:m has 8066 NULL values
Column: snow_density:kgm3 has 115945 NULL values


Based on output we choose to drop `snow_density:kgm3`, `ceiling_height_agl:m`, `cloud_base_agl:m`

```
X_ESTIMATED
Column: ceiling_height_agl:m has 3919 NULL values
Column: cloud_base_agl:m has 2094 NULL values
Column: snow_density:kgm3 has 15769 NULL values

X_OBSERVED:
Column: ceiling_height_agl:m has 22247 NULL values
Column: cloud_base_agl:m has 8066 NULL values
Column: snow_density:kgm3 has 115945 NULL values

```

In [4]:
""" for column in X_train_estimated_a.columns:
    null_c =  X_train_estimated_a[column].value_counts()[0]
    if null_c > 0: 
        print(f'Column: {column} has {null_c} 0 values')

for column in X_train_observed_a.columns:
    null_c = X_train_observed_a[column].value_counts()[0]
    if null_c > 0: 
        print(f'Column: {column} has {null_c} 0 values') """

" for column in X_train_estimated_a.columns:\n    null_c =  X_train_estimated_a[column].value_counts()[0]\n    if null_c > 0: \n        print(f'Column: {column} has {null_c} 0 values')\n\nfor column in X_train_observed_a.columns:\n    null_c = X_train_observed_a[column].value_counts()[0]\n    if null_c > 0: \n        print(f'Column: {column} has {null_c} 0 values') "

In [5]:

# Data set A, B and C clean up

def data_clean_up(x_train_est, x_train_observe, y_train):

  if 'date_calc' in x_train_est.columns:
    x_train_est.drop(columns="date_calc", inplace=True)

  x_train = pd.concat([x_train_observe, x_train_est])

  # Group the rows into blocks of 4 and apply the aggregation function
  agg_func = {col: 'mean' for col in x_train.columns[1:]}
  X_train_downscaled = x_train.groupby(x_train.index // 4).agg({**{'date_forecast': 'first'}, **agg_func})

  y_train.dropna(inplace=True)
  combined_data = pd.merge(X_train_downscaled, y_train, left_on='date_forecast', right_on='time')
  combined_data.drop(columns=['snow_density:kgm3', 'ceiling_height_agl:m', 'cloud_base_agl:m'], inplace=True)
  combined_data.dropna(inplace=True)
  y_train = combined_data[['pv_measurement', 'date_forecast']]

  if 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns="time", inplace=True)
    combined_data.drop(columns="pv_measurement", inplace=True)

  return combined_data, y_train

def count_null_in_column(df: pd.DataFrame, column_name: str):
  return df[column_name].value_counts(None)

x_train_a, y_train_a = data_clean_up(X_train_estimated_a, X_train_observed_a, train_targets_a)
x_train_b, y_train_b = data_clean_up(X_train_estimated_b, X_train_observed_b, train_targets_b)
x_train_c, y_train_c = data_clean_up(X_train_estimated_c, X_train_observed_c, train_targets_c)


def data_clean_up_test(x_test_est):

  # Group the rows into blocks of 4 and apply the aggregation function
  agg_func = {col: 'mean' for col in x_test_est.columns[1:]}
  X_test_downscaled = x_test_est.groupby(x_test_est.index // 4).agg({**{'date_forecast': 'first'}, **agg_func})

  """
  if 'date_forecast' in X_test_downscaled.columns:
    X_test_downscaled.drop(columns="date_forecast", inplace=True)
  """
  return X_test_downscaled

X_test_estimated_a = data_clean_up_test(X_test_estimated_a)
X_test_estimated_b = data_clean_up_test(X_test_estimated_b)
X_test_estimated_c = data_clean_up_test(X_test_estimated_c)
print(len(X_test_estimated_a))



720


Keeping this average for now since its the best aggreagation we have at this point. However i do believe that using avg or mean together with delta (total change within the hour) might be a better solution. Also some columns might need other aggregations than avg. 

# Feature engineering

Need to tell the model something about how time seasonality works. E.g. night and day, as well as yearly seasonality. 

In [6]:
def get_unixtime(datetime: pd.Series) -> pd.Series:
    unixtime = (datetime - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
    return unixtime

xat_datetime = x_train_a['date_forecast']
xat_unixtime = get_unixtime(xat_datetime)

## We now need functions for assigning daily and yearly cycles (described in datanalysis docu on Peter branch)
# plus 2 avoids 0 and negative values
day = 24*60*60
year = (365.2425)*day

def sinus_day(unix_time):
    return 2 + np.sin(unix_time * (2 * np.pi / day)) # since it is seconds since 1.1.1970 we divide by seconds in a day to get seasonal changes throughout the dat

def sinus_year(unix_time):
    return 2+ np.sin(unix_time * (2 * np.pi / year))

def cosinus_day(unix_time):
    return 2+np.cos(unix_time * (2 * np.pi / day))

def cosinus_year(unix_time):
    return 2+np.cos(unix_time * (2 * np.pi / year))

# function for returning two series with the daily cycles (sine and cosine)
def get_daycycle(unixtime: pd.Series) -> (pd.Series, pd.Series):
    sinus_daytime = unixtime.apply(sinus_day)
    sinus_daytime = sinus_daytime.rename('sinus_day') 
    cosinus_daytime = unixtime.apply(cosinus_day)
    cosinus_daytime = cosinus_daytime.rename('cosine_day')
    return sinus_daytime, cosinus_daytime

# Function for returning two series with the yearly cycles
def get_yearcycle(unixtime: pd.Series) -> (pd.Series, pd.Series):
    sinus_yeartime = unixtime.apply(sinus_year)
    sinus_yeartime = sinus_yeartime.rename('sinus_year')
    cosinus_yeartime = unixtime.apply(cosinus_year)
    cosinus_yeartime = cosinus_yeartime.rename('cosinus_year')
    return sinus_yeartime, cosinus_yeartime

xat_day_sin, xat_day_cos = get_daycycle(xat_unixtime)
xat_year_sin, xat_year_cos = get_yearcycle(xat_unixtime)

xta_feat = x_train_a.join([xat_day_sin, xat_day_cos, xat_year_sin, xat_year_cos])
xta_feat.drop(columns=['date_forecast'], inplace=True)
print(xta_feat.head())

   absolute_humidity_2m:gm3  air_density_2m:kgm3  clear_sky_energy_1h:J  \
0                    8.0250             1.230625               0.000000   
1                    7.9000             1.228750               0.000000   
2                    8.0125             1.224750               0.000000   
3                    8.3125             1.223250             104.324997   
4                    8.6625             1.222500           16234.075195   

   clear_sky_rad:W  dew_or_rime:idx  dew_point_2m:K  diffuse_rad:W  \
0            0.000              0.5      280.787506         0.0000   
1            0.000              0.5      280.574982         0.0000   
2            0.000              0.5      280.787506         0.0000   
3            0.375              0.5      281.362488         0.1500   
4           11.550              0.5      281.924988         5.9875   

   diffuse_rad_1h:J  direct_rad:W  direct_rad_1h:J  ...  total_cloud_cover:p  \
0          0.000000         0.000         0.0000

Here we need to do some extra feature engineering and make shure that all the elements in the dataset is of the same type. A lot of this is explained in the TensorFlow documentation https://www.tensorflow.org/tutorials/load_data/pandas_dataframe. 

# Training the model

In [7]:
SHUFFLE_BUFFER = 500
BATCH_SIZE = 2

xta_tensor = tf.convert_to_tensor(xta_feat)

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(xta_tensor)
target = y_train_a
target.drop(columns='date_forecast', inplace=True)
print(len(target))


29667


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target.drop(columns='date_forecast', inplace=True)


In [8]:
def get_basic_model():
  model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    #tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1)
  ])

  model.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model



model = get_basic_model()
model.fit(xta_tensor, target, epochs=15, batch_size=BATCH_SIZE)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x289d3bc40>

# Make predictions

In [9]:
# Do some more stuff

# Evaluate prediction

In [10]:
# Evaluate the model

y_pred = []

# Create submission

In [11]:
y_test_pred = y_pred

test = pd.read_csv('../data/test.csv')
test['prediction'] = y_test_pred
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
submission.to_csv('submission.csv', index=False)

ValueError: Length of values (0) does not match length of index (2160)