In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Notebook that handles data loading! 

This notebook is used to initially load and prepare data. This means loading into a datamanager, merging data (if wanted), and doing any preliminary alterations to the dataset. 

In [20]:
from data_prep import data_manager as DM

# Instantiate a new data manager.
dm = DM.Data_Manager()

# Load all data into the data manager. The cells below read the resulting
# dm.X_* and dm.train_* attributes (presumably populated here — confirm in
# data_prep.data_manager).
dm.data_loader()


## Before doing anything, we fill in missing values by imputing

In [21]:

# For each of the `_a` weather frames, print a label followed by the per-column
# count of missing values, restricted to columns that have at least one.
# The counts are computed once per frame and reused for the filter.
for label, frame in (
    ('x_train_observed', dm.X_train_observed_a),
    ('x_train_estimated', dm.X_train_estimated_a),
    ('x_test_estimated', dm.X_test_estimated_a),
):
    null_counts = frame.isnull().sum()
    print(label)
    print(null_counts[null_counts > 0])

x_train_observed
ceiling_height_agl:m     22247
cloud_base_agl:m          8066
snow_density:kgm3       115945
dtype: int64
x_train_estimated
ceiling_height_agl:m     3919
cloud_base_agl:m         2094
snow_density:kgm3       15769
dtype: int64
x_test_estimated
ceiling_height_agl:m     793
cloud_base_agl:m         298
snow_density:kgm3       2880
dtype: int64


In [22]:
# Attribute names of the weather frames on the data manager, in the exact order
# they are handed to impute_data (and read back from its result).
TRAIN_FRAME_NAMES = [
    "X_train_observed_a", "X_train_observed_b", "X_train_observed_c",
    "X_train_estimated_a", "X_train_estimated_b", "X_train_estimated_c",
]
TEST_FRAME_NAMES = ["X_test_estimated_a", "X_test_estimated_b", "X_test_estimated_c"]
ALL_FRAME_NAMES = TRAIN_FRAME_NAMES + TEST_FRAME_NAMES

# Impute all nine frames in one call, then write each result back onto the
# attribute it came from.
imputes = dm.impute_data([getattr(dm, name) for name in ALL_FRAME_NAMES])

# Guard against a silent mismatch: zip() below would otherwise just truncate.
if len(imputes) != len(ALL_FRAME_NAMES):
    raise ValueError(
        f"impute_data returned {len(imputes)} frames, expected {len(ALL_FRAME_NAMES)}"
    )

for name, imputed_frame in zip(ALL_FRAME_NAMES, imputes):
    setattr(dm, name, imputed_frame)

# Show column/dtype/null info for one imputed frame as a sanity check.
dm.set_info(dm.X_test_estimated_a)

# Remove the snow density column from every training frame.
# errors="ignore" keeps this cell re-runnable: dropping a column that is already
# gone is a no-op instead of a KeyError.
# NOTE(review): the test frames keep "snow_density:kgm3" while the training
# frames lose it — confirm that downstream code aligns the feature columns.
for name in TRAIN_FRAME_NAMES:
    setattr(
        dm,
        name,
        getattr(dm, name).drop(columns="snow_density:kgm3", errors="ignore"),
    )



100%|██████████| 9/9 [00:00<00:00, 28.62it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 45 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   absolute_humidity_2m:gm3        2880 non-null   float32       
 1   air_density_2m:kgm3             2880 non-null   float32       
 2   ceiling_height_agl:m            2880 non-null   float32       
 3   clear_sky_energy_1h:J           2880 non-null   float32       
 4   clear_sky_rad:W                 2880 non-null   float32       
 5   cloud_base_agl:m                2880 non-null   float32       
 6   dew_or_rime:idx                 2880 non-null   float32       
 7   dew_point_2m:K                  2880 non-null   float32       
 8   diffuse_rad:W                   2880 non-null   float32       
 9   diffuse_rad_1h:J                2880 non-null   float32       
 10  direct_rad:W                    2880 non-null   float32       
 11  dire




## Now we look at sample rates

The training weather data is sampled every 15 minutes, whereas the pv_measurements are recorded every hour. 
We can either: 
1. Sample down: aggregate the weather data to one row per hour, using the mean value of every hour. 
2. Sample up: expand the pv_measurement series to one row every 15 minutes instead of every hour. 

In [23]:
# --- Option 1 (disabled): sample down -------------------------------------
# Resample every weather frame (train and test) to an hourly rate instead of
# upsampling the targets. Kept for reference only.
#
# weather_names = [
#     "X_train_observed_a", "X_train_observed_b", "X_train_observed_c",
#     "X_train_estimated_a", "X_train_estimated_b", "X_train_estimated_c",
#     "X_test_estimated_a", "X_test_estimated_b", "X_test_estimated_c",
# ]
# resamples = dm.resample_data([getattr(dm, n) for n in weather_names], "H")
# for slot, n in enumerate(weather_names):
#     setattr(dm, n, resamples[slot])


# --- Option 2 (active): sample up ------------------------------------------
# The test weather frames are still resampled to an hourly ("H") rate ...
test_names = ("X_test_estimated_a", "X_test_estimated_b", "X_test_estimated_c")
resamples = dm.resample_data([getattr(dm, n) for n in test_names], "H")
for slot, n in enumerate(test_names):
    setattr(dm, n, resamples[slot])

# ... while the pv measurement frames are resampled onto a 15-minute ("15T") grid.
resamples = dm.resample_data([dm.train_a, dm.train_b, dm.train_c], "15T")

# Each resampled frame is bound both to a notebook-level name and to the
# corresponding attribute on the data manager (same object, two references).
train_a_15min = dm.train_a = resamples[0]
train_b_15min = dm.train_b = resamples[1]
train_c_15min = dm.train_c = resamples[2]

# Display the upsampled frame for A; the rows between hourly readings hold NaN
# at this point.
train_a_15min


Unnamed: 0,pv_measurement,date_forecast
0,0.0,2019-06-02 22:00:00
1,,2019-06-02 22:15:00
2,,2019-06-02 22:30:00
3,,2019-06-02 22:45:00
4,0.0,2019-06-02 23:00:00
...,...,...
136512,0.0,2023-04-30 22:00:00
136513,,2023-04-30 22:15:00
136514,,2023-04-30 22:30:00
136515,,2023-04-30 22:45:00


Upsampling introduces a lot of NaNs, which we fill in by interpolation.

In [24]:
# Fill the NaN rows left by the 15-minute upsampling by calling
# DataFrame.interpolate() (default settings) on each pv measurement frame and
# storing the result back on the data manager.
for attr in ("train_a", "train_b", "train_c"):
    setattr(dm, attr, getattr(dm, attr).interpolate())

## Combining data into one training set 

In [25]:
# Ask the data manager for the combined training frames. Called without
# arguments, combine_data() returns three frames, unpacked here as A, B and C.
data_A, data_B, data_C = dm.combine_data()


# Display the combined frame for A as the cell output.
data_A

Unnamed: 0,pv_measurement,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,...,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms
0,0.0,2019-06-02 22:00:00,7.7,1.230,1744.900024,0.0,0.0,1744.900024,0.0,280.299988,...,342.834015,-3.202,0.0,285.899994,100.000000,39640.101562,3.7,-3.6,-0.8,-0.0
1,0.0,2019-06-02 22:15:00,7.7,1.229,1734.000000,0.0,0.0,1734.000000,0.0,280.299988,...,346.294006,-3.650,0.0,286.100006,100.000000,40123.898438,3.6,-3.6,-0.6,-0.0
2,0.0,2019-06-02 22:30:00,7.7,1.228,1723.500000,0.0,0.0,1723.500000,0.0,280.299988,...,349.768005,-3.998,0.0,286.299988,100.000000,40628.300781,3.6,-3.6,-0.4,-0.0
3,0.0,2019-06-02 22:45:00,7.7,1.226,1713.400024,0.0,0.0,1713.400024,0.0,280.299988,...,353.251007,-4.247,0.0,286.600006,100.000000,41153.601562,3.5,-3.5,-0.2,-0.0
4,0.0,2019-06-02 23:00:00,7.7,1.225,1703.599976,0.0,0.0,1703.599976,0.0,280.299988,...,356.742004,-4.393,0.0,286.799988,100.000000,41699.898438,3.5,-3.5,0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136237,0.0,2023-04-30 22:00:00,4.5,1.281,1532.900024,0.0,0.0,519.000000,0.0,272.299988,...,341.851013,-10.630,0.1,274.399994,97.400002,16674.900391,4.3,3.4,2.6,-0.0
136238,0.0,2023-04-30 22:15:00,4.5,1.281,1413.199951,0.0,0.0,524.900024,0.0,272.299988,...,345.524994,-11.102,0.1,274.399994,98.699997,15005.000000,4.2,3.3,2.6,-0.0
136239,0.0,2023-04-30 22:30:00,4.5,1.281,1293.599976,0.0,0.0,530.700012,0.0,272.299988,...,349.216003,-11.470,0.1,274.299988,99.500000,13323.200195,4.0,3.1,2.5,-0.0
136240,0.0,2023-04-30 22:45:00,4.5,1.281,1173.900024,0.0,0.0,536.500000,0.0,272.299988,...,352.920013,-11.731,0.1,274.200012,99.900002,11629.299805,3.9,2.9,2.5,-0.0


## Combining datasets into one observed set and one estimated set

In [26]:
# combine_data(False) returns six frames, unpacked as the observed sets for
# A/B/C followed by the estimated sets for A/B/C.
(
    data_A_obs, data_B_obs, data_C_obs,
    data_A_es, data_B_es, data_C_es,
) = dm.combine_data(False)

# Print all six shapes on a single line, separated by spaces.
split_frames = (data_A_obs, data_B_obs, data_C_obs, data_A_es, data_B_es, data_C_es)
print(*(frame.shape for frame in split_frames))

# Show column/dtype/null info for the estimated set of C.
dm.set_info(data_C_es)

(118669, 46) (116928, 46) (93177, 46) (17573, 46) (14981, 46) (12293, 46)
<class 'pandas.core.frame.DataFrame'>
Index: 12293 entries, 116924 to 129312
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   pv_measurement                  12293 non-null  float64       
 1   date_forecast                   12293 non-null  datetime64[us]
 2   absolute_humidity_2m:gm3        12293 non-null  float32       
 3   air_density_2m:kgm3             12293 non-null  float32       
 4   ceiling_height_agl:m            12293 non-null  float32       
 5   clear_sky_energy_1h:J           12293 non-null  float32       
 6   clear_sky_rad:W                 12293 non-null  float32       
 7   cloud_base_agl:m                12293 non-null  float32       
 8   dew_or_rime:idx                 12293 non-null  float32       
 9   dew_point_2m:K                  12293 non-null  float32       


In [27]:
# Persist the prepared data manager with IPython's %store magic so that it can
# be restored from another notebook or kernel with `%store -r dm`.
%store dm

Stored 'dm' (Data_Manager)
