Pip install the following packages
1. wetterdienst
2. shapely
3. utm

In [1]:
import os
import pandas as pd
import polars as pl
import pathlib
import datetime as dt
import utm
from datetime import date, timedelta
from scipy.interpolate import LinearNDInterpolator
from shapely.geometry import Point, Polygon
from wetterdienst import Settings, Resolution, Period, Parameter
from wetterdienst.provider.dwd.observation import DwdObservationRequest, DwdObservationDataset, DwdObservationPeriod, DwdObservationResolution



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
settings = Settings(
    ts_shape="long",
    ts_humanize=True,
    ts_si_units=True
)

All the observations we would need are as follows
1. wind_speed
2. wind_direction
3. radiation_global
4. air_temperature
5. humidity
6. wind_gust_max
7. extreme_wind
8. wind_direction_gust_max

There are 298 weather stations in germany

**INTERPOLATION**

The function leverages the four closest stations to your specified latitude and longitude and employs the bilinear interpolation method provided by the scipy package (interp2d) to interpolate the given parameter values. Currently, this interpolation feature is exclusive to DWDObservationRequest and parameters temperature_air_mean_200, wind_speed, precipitation_height. As it is in its early stages, we welcome feedback to enhance and refine its functionality. Interpolation by nearby stations is limited to a distance of 40 km by default (20.0 km for precipitation). You can change this by setting the ts_interpolation_station_distance setting.


In [4]:
langenhorn = (54.679, 8.908)

Station ID - 02907
Latitude - 54.7903
Longitude - 8.9514
Name - "Leck"
State - Schleswig Holstein
Distance from langenhorn - 12.685km

In [5]:
def get_weather_datas(param):
    request = DwdObservationRequest(
        parameter=[
             param
        ],
        resolution="10_minutes",
        start_date="2019-12-31 22:50:00",
        end_date="2023-12-31 00:00:00",
        settings=settings
    )
    stations = request.filter_by_rank(latlon=langenhorn, rank=1)
    df = stations.values.all().df.drop_nulls()
    return df

**Weather attributes**

In [6]:
df_wind_speed = get_weather_datas("wind_speed") # mean of wind speed during the last 10 minutes in m/s
df_wind_speed = df_wind_speed.drop(["dataset", "parameter", "quality"])
df_wind_speed = df_wind_speed.rename({"value": "wind_speed_m/s"})

In [7]:
df_wind_direction = get_weather_datas("wind_direction") # mean of wind direction during the last 10 minutes in degrees
df_wind_direction = df_wind_direction.drop(["dataset", "parameter", "quality"])
df_wind_direction = df_wind_direction.rename({"value": "wind_direction_degrees"})

In [8]:
df_radiation_global = get_weather_datas("radiation_global") # 10min-sum of solar incoming radiation in J/m2
df_radiation_global = df_radiation_global.drop(["dataset", "parameter", "quality"])
df_radiation_global = df_radiation_global.rename({"value": "radiation_global_J/m2"})

In [9]:
df_air_temperature = get_weather_datas("temperature_air_mean_200") # air temperature at 2m height
df_air_temperature = df_air_temperature.drop(["dataset", "parameter", "quality"])
df_air_temperature = df_air_temperature.rename({"value": "air_temperature_K"})

In [10]:
df_humidity = get_weather_datas("humidity") # relative humidity at 2m height in %
df_humidity = df_humidity.drop(["dataset", "parameter", "quality"])
df_humidity = df_humidity.rename({"value": "humidity_percent"})

In [11]:
df_wind_gust_max = get_weather_datas("wind_gust_max")# # maximum wind gust of the last 10 minutes, the instrument samples the instantaneous wind velocity every 0.25 seconds, and writes out the max value of a 3 second period, the highest occuring within the 10min interval is given here as the maximum wind gust.
df_wind_gust_max = df_wind_gust_max.drop(["dataset", "parameter", "quality"])
df_wind_gust_max = df_wind_gust_max.rename({"value": "wind_gust_max_m/s"})

In [12]:
df_wind_direction_gust_max = get_weather_datas("wind_direction_gust_max") # wind direction of highest wind gust
df_wind_direction_gust_max = df_wind_direction_gust_max.drop(["dataset", "parameter", "quality"])
df_wind_direction_gust_max = df_wind_direction_gust_max.rename({"value": "wind_direction_gust_max_degrees"})

**Joining all dataframes into one master df**

In [13]:
new_df = df_wind_speed.join(df_wind_direction, on=["station_id","date"], how = "inner")

In [14]:
new_df = new_df.join(df_radiation_global, on=["station_id","date"], how = "inner")

In [15]:
new_df = new_df.join(df_air_temperature, on=["station_id","date"], how = "inner")

In [16]:
new_df = new_df.join(df_humidity, on=["station_id","date"], how = "inner")

In [17]:
new_df = new_df.join(df_wind_gust_max, on=["station_id","date"], how = "inner")

In [18]:
new_df = new_df.join(df_wind_direction_gust_max, on=["station_id","date"], how = "inner")

station_id,date,wind_speed_m/s,wind_direction_degrees,radiation_global_J/m2,air_temperature_K,humidity_percent,wind_gust_max_m/s,wind_direction_gust_max_degrees
str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64
"""02907""",2019-12-31 22:50:00 UTC,2.0,260.0,0.0,278.35,81.9,2.9,250.0
"""02907""",2019-12-31 23:00:00 UTC,2.1,250.0,0.0,278.45,80.7,3.3,250.0
"""02907""",2019-12-31 23:10:00 UTC,2.9,270.0,0.0,278.65,79.2,4.9,270.0
"""02907""",2019-12-31 23:20:00 UTC,2.5,260.0,0.0,278.45,80.7,3.6,260.0
"""02907""",2019-12-31 23:30:00 UTC,2.4,240.0,0.0,278.45,80.4,3.5,250.0


**Removing Station_id column since all the stations are the same - 02907**

In [19]:
new_df = new_df.drop(["station_id"])
new_df.head()

date,wind_speed_m/s,wind_direction_degrees,radiation_global_J/m2,air_temperature_K,humidity_percent,wind_gust_max_m/s,wind_direction_gust_max_degrees
"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64
2019-12-31 22:50:00 UTC,2.0,260.0,0.0,278.35,81.9,2.9,250.0
2019-12-31 23:00:00 UTC,2.1,250.0,0.0,278.45,80.7,3.3,250.0
2019-12-31 23:10:00 UTC,2.9,270.0,0.0,278.65,79.2,4.9,270.0
2019-12-31 23:20:00 UTC,2.5,260.0,0.0,278.45,80.7,3.6,260.0
2019-12-31 23:30:00 UTC,2.4,240.0,0.0,278.45,80.4,3.5,250.0


In [20]:
# Group the DataFrame by the rounded down timestamp at 5-minute intervals and aggregate the values with the mean function
df_resampled = new_df['date'].dt.offset_by("5m").alias("date_5m")

In [21]:
pandas_df = new_df.to_pandas()
pandas_df.info()

# Set the datetime column as the index
pandas_df.set_index('date', inplace=True)

# Resample the DataFrame to 5-minute intervals
df_resampled = pandas_df.resample('5T').mean()

# Interpolate the missing values to fill in the gaps
df_interpolated = df_resampled.interpolate(method='linear')

# Reset the index to make the datetime column a regular column again
df_interpolated.reset_index(inplace=True)

# Print the new DataFrame
df_interpolated

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209055 entries, 0 to 209054
Data columns (total 8 columns):
 #   Column                           Non-Null Count   Dtype              
---  ------                           --------------   -----              
 0   date                             209055 non-null  datetime64[ns, UTC]
 1   wind_speed_m/s                   209055 non-null  float64            
 2   wind_direction_degrees           209055 non-null  float64            
 3   radiation_global_J/m2            209055 non-null  float64            
 4   air_temperature_K                209055 non-null  float64            
 5   humidity_percent                 209055 non-null  float64            
 6   wind_gust_max_m/s                209055 non-null  float64            
 7   wind_direction_gust_max_degrees  209055 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(7)
memory usage: 12.8 MB


Unnamed: 0,date,wind_speed_m/s,wind_direction_degrees,radiation_global_J/m2,air_temperature_K,humidity_percent,wind_gust_max_m/s,wind_direction_gust_max_degrees
0,2019-12-31 22:50:00+00:00,2.00,260.0,0.0,278.35,81.90,2.90,250.0
1,2019-12-31 22:55:00+00:00,2.05,255.0,0.0,278.40,81.30,3.10,250.0
2,2019-12-31 23:00:00+00:00,2.10,250.0,0.0,278.45,80.70,3.30,250.0
3,2019-12-31 23:05:00+00:00,2.50,260.0,0.0,278.55,79.95,4.10,260.0
4,2019-12-31 23:10:00+00:00,2.90,270.0,0.0,278.65,79.20,4.90,270.0
...,...,...,...,...,...,...,...,...
420490,2023-12-30 23:40:00+00:00,3.40,170.0,0.0,279.75,95.00,4.80,160.0
420491,2023-12-30 23:45:00+00:00,3.40,170.0,0.0,279.75,94.90,4.75,165.0
420492,2023-12-30 23:50:00+00:00,3.40,170.0,0.0,279.75,94.80,4.70,170.0
420493,2023-12-30 23:55:00+00:00,3.40,170.0,0.0,279.70,94.85,5.25,170.0


**Converting UTC to UTC+1 or UTC+2**


Standard time:	UTC +1	Central European Time (CET)
Daylight saving time:	UTC +2	Central European Summertime (CEST)
![Screenshot 2024-02-19 at 10.19.15 AM.png](attachment:828188b3-b04a-4619-8b20-7a4f1014e0d3.png)


In [22]:
# Assuming df_interpolated is your Pandas DataFrame
weather_df_new = df_interpolated.copy()

def add_timedelta_based_on_month(dt):
    if dt.month in [4, 5, 6, 7, 8, 9, 10]:  # Summer months: April, May, June, July, August, September, October
        return dt + pd.Timedelta(hours=2)
    else:  # Winter months: November, December, January, February, March
        return dt + pd.Timedelta(hours=1)

# Apply the function to the 'date' column
weather_df_new['date'] = weather_df_new['date'].apply(add_timedelta_based_on_month)

# Remove the first row
weather_df_new = weather_df_new.iloc[1:]

# Print the head of the DataFrame

weather_df_new.set_index("date")

Unnamed: 0_level_0,wind_speed_m/s,wind_direction_degrees,radiation_global_J/m2,air_temperature_K,humidity_percent,wind_gust_max_m/s,wind_direction_gust_max_degrees
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-31 23:55:00+00:00,2.05,255.0,0.0,278.40,81.30,3.10,250.0
2020-01-01 00:00:00+00:00,2.10,250.0,0.0,278.45,80.70,3.30,250.0
2020-01-01 00:05:00+00:00,2.50,260.0,0.0,278.55,79.95,4.10,260.0
2020-01-01 00:10:00+00:00,2.90,270.0,0.0,278.65,79.20,4.90,270.0
2020-01-01 00:15:00+00:00,2.70,265.0,0.0,278.55,79.95,4.25,265.0
...,...,...,...,...,...,...,...
2023-12-31 00:40:00+00:00,3.40,170.0,0.0,279.75,95.00,4.80,160.0
2023-12-31 00:45:00+00:00,3.40,170.0,0.0,279.75,94.90,4.75,165.0
2023-12-31 00:50:00+00:00,3.40,170.0,0.0,279.75,94.80,4.70,170.0
2023-12-31 00:55:00+00:00,3.40,170.0,0.0,279.70,94.85,5.25,170.0


In [23]:
weather_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420494 entries, 1 to 420494
Data columns (total 8 columns):
 #   Column                           Non-Null Count   Dtype              
---  ------                           --------------   -----              
 0   date                             420494 non-null  datetime64[ns, UTC]
 1   wind_speed_m/s                   420494 non-null  float64            
 2   wind_direction_degrees           420494 non-null  float64            
 3   radiation_global_J/m2            420494 non-null  float64            
 4   air_temperature_K                420494 non-null  float64            
 5   humidity_percent                 420494 non-null  float64            
 6   wind_gust_max_m/s                420494 non-null  float64            
 7   wind_direction_gust_max_degrees  420494 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(7)
memory usage: 25.7 MB


In [24]:
weather_df_new['date'] = weather_df_new['date'].dt.tz_localize(None)  # Convert datetime with UTC timezone to naive datetime
weather_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420494 entries, 1 to 420494
Data columns (total 8 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             420494 non-null  datetime64[ns]
 1   wind_speed_m/s                   420494 non-null  float64       
 2   wind_direction_degrees           420494 non-null  float64       
 3   radiation_global_J/m2            420494 non-null  float64       
 4   air_temperature_K                420494 non-null  float64       
 5   humidity_percent                 420494 non-null  float64       
 6   wind_gust_max_m/s                420494 non-null  float64       
 7   wind_direction_gust_max_degrees  420494 non-null  float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 25.7 MB


**Below code is the polars version of converting df from utc to german time**

In [25]:
#def add_timedelta_based_on_month(dt):
#    if dt.month in [4, 5, 6, 7, 8, 9, 10]:  # Summer months: April, May, June, July, August, September, October
#        return dt + pd.Timedelta(hours=2)
#    else:  # Winter months: November, December, January, February, March
#        return dt + pd.Timedelta(hours=1)

In [26]:
#weather_df = new_df.clone()

#weather_df = weather_df.with_columns(
#    pl.col("date").apply(add_timedelta_based_on_month)
#)

#weather_df = weather_df.slice(1,None) #removing the first row
#weather_df.head()

**Saving weather_df as a csv file**

In [31]:
path: pathlib.Path = "/content/drive/MyDrive/ms_wind_curtailment_prediction/weather_data.csv"
weather_df_new.to_csv(path, sep=",")