In [10]:
# Modify the API script to resample data to daily instead of hourly
import requests
import pandas as pd
import datetime as dt

# Define API details
DMI_URL = 'https://dmigw.govcloud.dk/v2/metObs/collections/observation/items'
api_key = '56642815-d535-418b-bda2-30a8a8ef4999'

# Specify the desired start and end time
start_time = pd.Timestamp(2002, 1, 1)
end_time = pd.Timestamp(2024, 12, 31)
datetime_str = start_time.tz_localize('UTC').isoformat() + '/' + end_time.tz_localize('UTC').isoformat()

# Station ID
stationIds = ['06102']

# List of required parameters
parameterIds = [
    "temp_min_past1h", "temp_max_past1h", "temp_mean_past1h",
    "temp_grass_mean_past1h", "temp_soil_min_past1h", "temp_soil_max_past1h", "temp_soil_mean_past1h",
    "humidity_past1h", "pressure", "wind_dir_past1h", "wind_min_past1h",
    "wind_gust_always_past1h", "wind_speed_past1h", "precip_past1h", "precip_dur_past1h",
    "radia_glob_past1h", "sun_last1h_glob"
]

# Fetch data for all parameters
dfs = []
for station in stationIds:
    for parameter in parameterIds:
        # Define API query parameters
        params = {
            'api-key': api_key,
            'datetime': datetime_str,
            'stationId': station,
            'parameterId': parameter,
            'limit': '300000',  # Max limit
        }

        # Send GET request
        response = requests.get(DMI_URL, params=params)

        # Check response status
        if response.status_code == 200:
            json_data = response.json()

            # Extract data into DataFrame
            if 'features' in json_data and json_data['features']:
                dfi = pd.json_normalize(json_data['features'])
                dfi['time'] = pd.to_datetime(dfi['properties.observed'])
                dfi = dfi[['time', 'properties.value', 'properties.stationId', 'properties.parameterId']]
                dfi.columns = [c.replace('properties.', '') for c in dfi.columns]
                dfi = dfi[~dfi.duplicated()]  # Remove duplicates
                dfi = dfi.set_index(['parameterId', 'stationId', 'time'])
                dfi = dfi['value'].unstack(['stationId', 'parameterId'])
                dfs.append(dfi)

# Combine all data into a single DataFrame
if dfs:
    df = pd.concat(dfs, axis='columns').sort_index()

    # Flatten MultiIndex by removing station ID level and keeping only parameter names
    df.columns = df.columns.get_level_values(1)

    # Define aggregation methods for daily resampling
    aggregation_methods = {
        "temp_min_past1h": "min",  # Minimum temperature of the day
        "temp_max_past1h": "max",  # Maximum temperature of the day
        "temp_mean_past1h": "mean",  # Daily mean temperature
        "temp_grass_mean_past1h": "mean",  # Mean grass temperature
        "temp_soil_min_past1h": "min",  # Minimum soil temperature
        "temp_soil_max_past1h": "max",  # Maximum soil temperature
        "temp_soil_mean_past1h": "mean",  # Mean soil temperature
        "humidity_past1h": "mean",  # Average humidity of the day
        "pressure": "mean",  # Mean atmospheric pressure
        "wind_dir_past1h": "mean",  # Average wind direction
        "wind_speed_past1h": "mean",  # Mean wind speed
        "wind_gust_always_past1h": "max",  # Maximum wind gust
        "precip_past1h": "sum",  # Total daily precipitation
        "precip_dur_past1h": "sum",  # Total minutes of precipitation
        "radia_glob_past1h": "sum",  # Total daily global radiation
        "sun_last1h_glob": "sum",  # Total sunshine duration per day
    }

    # Resample the DataFrame to daily using defined aggregation rules
    df_daily = df.resample("D").agg(aggregation_methods)

    # Save as CSV
    df_daily.to_csv("dmi_weather_data_daily.csv")

else:
    print("No data retrieved. Check API response and station/parameter configuration.")


In [11]:
df_daily.head() # Show first rows

parameterId,temp_min_past1h,temp_max_past1h,temp_mean_past1h,temp_grass_mean_past1h,temp_soil_min_past1h,temp_soil_max_past1h,temp_soil_mean_past1h,humidity_past1h,pressure,wind_dir_past1h,wind_speed_past1h,wind_gust_always_past1h,precip_past1h,precip_dur_past1h,radia_glob_past1h,sun_last1h_glob
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2002-01-01 00:00:00+00:00,-9.3,6.4,1.25,-0.0625,-0.2,0.1,0.0375,88.708333,,281.666667,6.995833,,2.2,34.0,271.0,0.0
2002-01-02 00:00:00+00:00,-6.3,6.4,-0.8375,-1.970833,-0.1,0.0,-0.033333,77.583333,,110.208333,2.695833,,0.0,0.0,805.0,328.0
2002-01-03 00:00:00+00:00,-7.6,-1.0,-5.166667,-6.9375,-0.2,0.0,-0.1,87.458333,,158.625,1.195833,,0.0,0.0,739.0,278.5
2002-01-04 00:00:00+00:00,-9.3,-1.3,-6.025,-7.95,-0.7,-0.2,-0.333333,80.916667,,221.708333,0.9875,,0.0,0.0,876.0,355.0
2002-01-05 00:00:00+00:00,-10.6,3.7,-3.466667,-4.3375,-1.1,-0.4,-0.754167,92.458333,,239.75,1.854167,,0.1,1.0,659.0,268.5


In [12]:
# Check for missing values
missing_values = df_daily.isnull().sum()
print("Missing Data Count:\n", missing_values[missing_values > 0])

Missing Data Count:
 parameterId
temp_min_past1h             193
temp_max_past1h             193
temp_mean_past1h            193
temp_grass_mean_past1h      208
temp_soil_min_past1h        181
temp_soil_max_past1h        181
temp_soil_mean_past1h       181
humidity_past1h             203
pressure                   6299
wind_dir_past1h             186
wind_speed_past1h           186
wind_gust_always_past1h     801
dtype: int64


In [13]:
df_daily.count()

parameterId
temp_min_past1h            8208
temp_max_past1h            8208
temp_mean_past1h           8208
temp_grass_mean_past1h     8193
temp_soil_min_past1h       8220
temp_soil_max_past1h       8220
temp_soil_mean_past1h      8220
humidity_past1h            8198
pressure                   2102
wind_dir_past1h            8215
wind_speed_past1h          8215
wind_gust_always_past1h    7600
precip_past1h              8401
precip_dur_past1h          8401
radia_glob_past1h          8401
sun_last1h_glob            8401
dtype: int64

In [14]:
# Define threshold for missing data (e.g., remove columns with >50% missing)
missing_threshold = 0.50  # 50%

# Calculate missing data percentage
missing_percent = df_daily.isnull().mean()

# Remove columns with too much missing data
columns_to_drop = missing_percent[missing_percent > missing_threshold].index
df_cleaned = df_daily.drop(columns=columns_to_drop)

print(f"Removed columns due to excessive missing data: {list(columns_to_drop)}")

Removed columns due to excessive missing data: ['pressure']


In [15]:
df_cleaned.head() # Show first rows

parameterId,temp_min_past1h,temp_max_past1h,temp_mean_past1h,temp_grass_mean_past1h,temp_soil_min_past1h,temp_soil_max_past1h,temp_soil_mean_past1h,humidity_past1h,wind_dir_past1h,wind_speed_past1h,wind_gust_always_past1h,precip_past1h,precip_dur_past1h,radia_glob_past1h,sun_last1h_glob
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-01-01 00:00:00+00:00,-9.3,6.4,1.25,-0.0625,-0.2,0.1,0.0375,88.708333,281.666667,6.995833,,2.2,34.0,271.0,0.0
2002-01-02 00:00:00+00:00,-6.3,6.4,-0.8375,-1.970833,-0.1,0.0,-0.033333,77.583333,110.208333,2.695833,,0.0,0.0,805.0,328.0
2002-01-03 00:00:00+00:00,-7.6,-1.0,-5.166667,-6.9375,-0.2,0.0,-0.1,87.458333,158.625,1.195833,,0.0,0.0,739.0,278.5
2002-01-04 00:00:00+00:00,-9.3,-1.3,-6.025,-7.95,-0.7,-0.2,-0.333333,80.916667,221.708333,0.9875,,0.0,0.0,876.0,355.0
2002-01-05 00:00:00+00:00,-10.6,3.7,-3.466667,-4.3375,-1.1,-0.4,-0.754167,92.458333,239.75,1.854167,,0.1,1.0,659.0,268.5


In [16]:
# Step 1: Identify missing values before processing
missing_before = df_cleaned.isnull().sum()
print(f"Missing values before processing:\n{missing_before[missing_before > 0]}\n")

# Step 2: Apply linear interpolation for gaps up to 12 hours
df_cleaned = df_cleaned.interpolate(method='linear', limit=12, limit_direction='both')

# Step 3: Add helper columns for seasonal mean imputation
df_cleaned["hour"] = df_cleaned.index.hour
df_cleaned["dayofyear"] = df_cleaned.index.dayofyear
df_cleaned["month"] = df_cleaned.index.month
df_cleaned["day"] = df_cleaned.index.day
df_cleaned["year"] = df_cleaned.index.year  # Keep track of leap years

# **Step 4: Keep February 29th!**
# We no longer filter out Feb 29

# Step 5: Fill remaining missing values using seasonal means, handling leap years properly
df_cleaned = df_cleaned.groupby(["month", "day", "hour"]).transform(lambda x: x.fillna(x.mean()))

# Step 6: Remove helper columns after filling
df_cleaned.drop(columns=["hour", "dayofyear", "month", "day", "year"], errors="ignore", inplace=True)

# Step 7: Save the cleaned dataset to a new CSV file
df_cleaned.to_csv("dmi_weather_data_cleaned.csv")

# Step 8: Identify missing values after processing
missing_after = df_cleaned.isnull().sum()
print(f"Missing values after processing:\n{missing_after[missing_after > 0]}")
print("\nData cleaning complete!")


Missing values before processing:
parameterId
temp_min_past1h            193
temp_max_past1h            193
temp_mean_past1h           193
temp_grass_mean_past1h     208
temp_soil_min_past1h       181
temp_soil_max_past1h       181
temp_soil_mean_past1h      181
humidity_past1h            203
wind_dir_past1h            186
wind_speed_past1h          186
wind_gust_always_past1h    801
dtype: int64

Missing values after processing:
Series([], dtype: int64)

Data cleaning complete!
