## Imports

In [1]:
import pathlib

import pandas as pd
import plotly.express as px

## Environment configuration

In [2]:
# Route every pandas `.plot()` call in this notebook through the plotly backend.
pd.set_option("plotting.backend", "plotly")

## Constants definition

In [3]:
# Relative path to the historical weather CSV that this notebook explores.
WEATHER_PATH: pathlib.Path = pathlib.Path(
    "../data/predict-energy-behavior-of-prosumers/historical_weather.csv"
)

## Data load

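> **Note:** `origin_date`, `forecast_date` and `euros_per_mwh` describe the day-ahead electricity price data and do not appear among the columns of `historical_weather.csv` loaded below; of the fields listed here, only `data_block_id` is present in the weather file.

- **origin_date** - The date when the day-ahead prices became available.
- **forecast_date** - Represents the start of the 1-hour period when the price is valid.
- **euros_per_mwh** - The price of electricity on the day ahead markets in euros per megawatt hour.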
- **data_block_id** - All rows sharing the same `data_block_id` will be available at the same forecast time. This is a function of what information is available when forecasts are actually made, at 11 AM each morning. For example, if the forecast weather `data_block_id` for predictions made on October 31st is 100 then the historic weather `data_block_id` for October 31st will be 101 as the historic weather data is only actually available the next day.

In [11]:
# Read the whole historical weather CSV into memory, then preview its first ten rows.
weather: pd.DataFrame = pd.read_csv(filepath_or_buffer=WEATHER_PATH)
weather.head(n=10)

Unnamed: 0,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
0,2021-09-01 00:00:00,14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.083333,8,0.0,0.0,0.0,57.6,21.7,1.0
1,2021-09-01 00:00:00,13.9,11.5,0.0,0.0,1010.7,33,37,0,0,5.111111,359,0.0,0.0,0.0,57.6,22.2,1.0
2,2021-09-01 00:00:00,14.0,12.5,0.0,0.0,1015.0,31,34,0,0,6.333333,355,0.0,0.0,0.0,57.6,22.7,1.0
3,2021-09-01 00:00:00,14.6,11.5,0.0,0.0,1017.3,0,0,0,0,8.083333,297,358.0,277.0,81.0,57.6,23.2,1.0
4,2021-09-01 00:00:00,15.7,12.9,0.0,0.0,1014.0,22,25,0,0,8.416667,5,0.0,0.0,0.0,57.6,23.7,1.0
5,2021-09-01 00:00:00,16.0,11.4,0.0,0.0,1016.7,15,12,7,0,6.388889,297,349.0,274.0,75.0,57.6,24.2,1.0
6,2021-09-01 00:00:00,12.2,10.5,0.0,0.0,1005.3,19,7,0,41,4.111111,359,0.0,0.0,0.0,57.6,24.7,1.0
7,2021-09-01 00:00:00,12.2,10.5,0.0,0.0,1006.1,28,8,0,68,4.111111,354,0.0,0.0,0.0,57.6,25.2,1.0
8,2021-09-01 00:00:00,12.4,10.6,0.0,0.0,1003.8,49,26,0,85,4.25,352,0.0,0.0,0.0,57.6,25.7,1.0
9,2021-09-01 00:00:00,12.9,11.0,0.0,0.0,1003.7,81,56,3,96,4.25,351,0.0,0.0,0.0,57.6,26.2,1.0


In [5]:
# Show the column labels of the loaded frame to confirm which fields the CSV provides.
weather.columns

Index(['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall',
       'surface_pressure', 'cloudcover_total', 'cloudcover_low',
       'cloudcover_mid', 'cloudcover_high', 'windspeed_10m',
       'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation',
       'diffuse_radiation', 'latitude', 'longitude', 'data_block_id'],
      dtype='object')

In [6]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
weather.dtypes

datetime                   object
temperature               float64
dewpoint                  float64
rain                      float64
snowfall                  float64
surface_pressure          float64
cloudcover_total            int64
cloudcover_low              int64
cloudcover_mid              int64
cloudcover_high             int64
windspeed_10m             float64
winddirection_10m           int64
shortwave_radiation       float64
direct_solar_radiation    float64
diffuse_radiation         float64
latitude                  float64
longitude                 float64
data_block_id             float64
dtype: object

## EDA

### Check database info

In [10]:
# Per-column summary with non-null counts; "deep" memory accounting also
# measures the contents of object-dtype columns rather than estimating them.
weather.info(
    verbose=True,
    memory_usage="deep",
    show_counts=True,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1710802 entries, 0 to 1710801
Data columns (total 18 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   datetime                1710802 non-null  object 
 1   temperature             1710802 non-null  float64
 2   dewpoint                1710802 non-null  float64
 3   rain                    1710802 non-null  float64
 4   snowfall                1710802 non-null  float64
 5   surface_pressure        1710802 non-null  float64
 6   cloudcover_total        1710802 non-null  int64  
 7   cloudcover_low          1710802 non-null  int64  
 8   cloudcover_mid          1710802 non-null  int64  
 9   cloudcover_high         1710802 non-null  int64  
 10  windspeed_10m           1710802 non-null  float64
 11  winddirection_10m       1710802 non-null  int64  
 12  shortwave_radiation     1710802 non-null  float64
 13  direct_solar_radiation  1710802 non-null  float64
 14  di

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
weather.describe()

Unnamed: 0,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
count,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0
mean,5.740968,2.240312,0.04962012,0.01604896,1009.282,60.9127,46.68593,34.40698,36.05141,4.849871,197.8694,106.4905,64.45292,42.03759,58.65,24.95,319.2708
std,8.025647,7.224357,0.2079113,0.07462936,13.08891,37.76905,40.7476,38.32769,41.35852,2.47545,89.93798,179.9449,133.41,61.95225,0.6873871,2.015564,183.7298
min,-23.7,-25.9,0.0,0.0,942.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.6,21.7,1.0
25%,0.0,-2.6,0.0,0.0,1001.5,25.0,3.0,0.0,0.0,3.0,139.0,0.0,0.0,0.0,57.9,23.2,160.0
50%,5.1,1.7,0.0,0.0,1010.4,72.0,39.0,16.0,10.0,4.5,208.0,1.0,0.0,1.0,58.5,24.7,319.0
75%,11.2,7.2,0.0,0.0,1018.0,100.0,94.0,72.0,85.0,6.277778,263.0,140.0,47.0,74.0,59.1,26.7,478.0
max,32.6,23.8,16.8,2.66,1049.3,100.0,100.0,100.0,100.0,21.75,360.0,849.0,754.0,388.0,59.7,28.2,637.0


## Conclusion

1. Very simple: the columns only need to be converted to appropriate dtypes (for example, `datetime` is currently loaded as `object`).