In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
def list_directory_tree_with_os_walk(starting_directory):
    for root, directories, files in os.walk(starting_directory):
        print(f"Directory: {root}")
        for file in files:
            print(f"  File: {file}")

list_directory_tree_with_os_walk('.')

Directory: .
  File: modelling_oneLoc.ipynb
  File: my_first_submission.csv
  File: .DS_Store
  File: explore_locA.ipynb
  File: test.csv
  File: Readme.md
  File: modelling_mixedData.ipynb
  File: modelling_allLoc.ipynb
  File: sample_submission.csv
  File: read_files.ipynb
Directory: ./A
  File: X_train_observed.parquet
  File: train_targets.parquet
  File: X_train_estimated.parquet
  File: X_test_estimated.parquet
Directory: ./C
  File: X_train_observed.parquet
  File: train_targets.parquet
  File: X_train_estimated.parquet
  File: X_test_estimated.parquet
Directory: ./B
  File: X_train_observed.parquet
  File: train_targets.parquet
  File: X_train_estimated.parquet
  File: X_test_estimated.parquet


In [3]:
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

train_a['Location'] = 1
train_b['Location'] = 2
train_c['Location'] = 3

target = pd.concat([train_a, train_b, train_c], ignore_index=True)
target.reset_index(drop=True, inplace=True)
target.rename(columns={'time': 'date_forecast'}, inplace=True)
target

Unnamed: 0,date_forecast,pv_measurement,Location
0,2019-06-02 22:00:00,0.00,1
1,2019-06-02 23:00:00,0.00,1
2,2019-06-03 00:00:00,0.00,1
3,2019-06-03 01:00:00,0.00,1
4,2019-06-03 02:00:00,19.36,1
...,...,...,...
99083,2023-04-30 19:00:00,50.96,3
99084,2023-04-30 20:00:00,2.94,3
99085,2023-04-30 21:00:00,0.00,3
99086,2023-04-30 22:00:00,-0.00,3


In [4]:
locations = [(1, 'A/X_train_estimated.parquet'),
             (2, 'B/X_train_estimated.parquet'),
             (3, 'C/X_train_estimated.parquet')]

# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for location, file_path in locations:
    # Load the dataset
    df = pd.read_parquet(file_path)
    # Assuming 'date_forecast' is a datetime column
    df['hourly_timestamp'] = df['date_forecast'].dt.floor('H')
    # Group by 'hourly_timestamp' and calculate the mean for each group
    df = df.groupby('hourly_timestamp').mean().reset_index()
    # Drop the 'hourly_timestamp' column if you don't need it in the final DataFrame
    df.drop(columns=['hourly_timestamp'], inplace=True)
    df['date_forecast'] = df['date_forecast'].dt.floor('H')
    # Store the 'Location' information in a new column
    df['Location'] = location
    # Append the processed DataFrame to the list
    processed_dataframes.append(df)

# Concatenate all processed DataFrames into one
validation = pd.concat(processed_dataframes, ignore_index=True)
validation

Unnamed: 0,date_calc,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,Location
0,2022-10-27 07:00:18,2022-10-28 22:00:00,8.350,1.23300,2416.250000,0.000000,0.000,1734.949951,1.0,281.274994,0.000,0.000000,0.0,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1000.599976,0.0000,0.0,988.049988,994.049988,0.0,0.075,92.674995,1000.150024,,0.0,0.0,0.0,0.475,347.822754,-39.065250,0.000,284.675018,100.000000,20712.525391,0.700,-0.325,0.650,0.0,1
1,2022-10-27 07:00:18,2022-10-28 23:00:00,8.100,1.23400,4035.199951,0.000000,0.000,2059.875000,1.0,280.850006,0.000,0.000000,0.0,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,999.525024,0.0000,0.0,987.000000,993.000000,0.0,0.100,92.949997,999.075012,,0.0,0.0,0.0,0.700,96.748253,-39.382252,0.000,284.325012,100.000000,5624.174805,0.775,0.425,0.525,0.0,1
2,2022-10-28 07:00:05,2022-10-29 00:00:00,8.150,1.22975,1882.000000,0.000000,0.000,1675.050049,1.0,280.924988,0.000,0.000000,0.0,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,996.974976,0.1175,1.0,984.424988,990.424988,0.0,0.200,89.199997,996.474976,,0.0,0.0,0.0,1.175,24.241001,-38.040501,0.200,283.875000,100.000000,3240.425049,1.500,1.050,0.975,0.0,1
3,2022-10-28 07:00:05,2022-10-29 01:00:00,8.200,1.22850,1622.400024,0.000000,0.000,1547.550049,1.0,281.049988,0.000,0.000000,0.0,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,996.349976,0.1000,1.0,983.849976,989.849976,0.0,0.225,90.175003,995.849976,,0.0,0.0,0.0,0.875,41.830750,-34.390499,0.250,284.350006,100.000000,3243.500000,1.575,1.000,1.200,0.0,1
4,2022-10-28 07:00:05,2022-10-29 02:00:00,8.375,1.22700,1322.275024,0.000000,0.000,1053.724976,1.0,281.349976,0.000,0.000000,0.0,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,996.174988,0.2025,1.0,983.599976,989.574951,0.0,0.300,92.324997,995.599976,,0.0,0.0,0.0,1.000,57.993248,-29.268749,0.250,284.524994,100.000000,2528.449951,2.200,1.825,1.175,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13177,2023-04-29 07:00:05,2023-04-30 19:00:00,4.400,1.27550,1456.574951,84010.148438,4.175,551.224976,0.0,272.024994,2.775,54774.000000,0.0,9934.575195,97.724998,24.0,0.0,0.0,0.3,0.0,0.0,0.5,1.0,1014.900024,0.0000,0.0,999.049988,1005.275024,0.0,0.000,74.800003,1011.525024,,0.0,0.0,0.0,0.000,304.936493,-0.201500,0.000,274.924988,97.724998,25028.000000,4.075,3.600,1.875,0.0,3
13178,2023-04-29 07:00:05,2023-04-30 20:00:00,4.400,1.27850,1476.349976,2206.800049,0.000,564.099976,0.0,271.950012,0.000,4984.049805,0.0,0.000000,95.449997,24.0,0.0,0.0,0.3,0.0,0.0,0.0,1.0,1014.849976,0.0000,0.0,999.025024,1005.250000,0.0,0.000,76.974998,1011.549988,,0.0,0.0,0.0,0.000,318.620483,-5.204000,0.000,274.575012,95.850006,23995.599609,3.600,2.950,2.125,0.0,3
13179,2023-04-29 07:00:05,2023-04-30 21:00:00,4.400,1.27900,1516.300049,0.000000,0.000,578.700012,0.0,271.899994,0.000,0.000000,0.0,0.000000,93.925003,24.0,0.0,0.0,0.3,0.0,0.0,0.0,1.0,1014.650024,0.0000,0.0,998.900024,1005.125000,0.0,0.000,77.724998,1011.400024,,0.0,0.0,0.0,0.000,332.780243,-8.984500,0.025,274.399994,95.925003,23068.599609,3.600,2.625,2.400,0.0,3
13180,2023-04-29 07:00:05,2023-04-30 22:00:00,4.400,1.27975,1240.599976,0.000000,0.000,551.500000,0.0,271.950012,0.000,0.000000,0.0,0.000000,98.375000,24.0,0.0,0.0,0.3,0.0,0.0,0.0,1.0,1014.500000,0.0000,0.0,998.724976,1004.974976,0.0,0.000,79.400002,1011.224976,,0.0,0.0,0.0,0.075,347.373230,-11.270500,0.125,274.225006,99.425003,11856.700195,3.275,2.325,2.325,0.0,3


In [5]:
locations = [(1, 'A/X_train_observed.parquet'),
             (2, 'B/X_train_observed.parquet'),
             (3, 'C/X_train_observed.parquet')]

# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for location, file_path in locations:
    # Load the dataset
    df = pd.read_parquet(file_path)
    # Assuming 'date_forecast' is a datetime column
    df['hourly_timestamp'] = df['date_forecast'].dt.floor('H')
    # Group by 'hourly_timestamp' and calculate the mean for each group
    df = df.groupby('hourly_timestamp').mean().reset_index()
    # Drop the 'hourly_timestamp' column if you don't need it in the final DataFrame
    df.drop(columns=['hourly_timestamp'], inplace=True)
    df['date_forecast'] = df['date_forecast'].dt.floor('H')
    # Store the 'Location' information in a new column
    df['Location'] = location
    # Append the processed DataFrame to the list
    processed_dataframes.append(df)

# Concatenate all processed DataFrames into one
training = pd.concat(processed_dataframes, ignore_index=True)
training

Unnamed: 0,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,Location
0,2019-06-02 22:00:00,7.700,1.22825,1728.949951,0.000000,0.000,1728.949951,0.0,280.299988,0.000000,0.000000,0.00,0.000000,99.074997,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1006.299988,0.0,0.0,993.750000,999.775024,0.0,0.000,71.674995,1005.799988,,0.0,0.0,0.0,0.175,348.036743,-3.774250,0.000,286.225006,100.000000,40386.476562,3.600,-3.575,-0.500,0.0,1
1,2019-06-02 23:00:00,7.700,1.22350,1689.824951,0.000000,0.000,1689.824951,0.0,280.299988,0.000000,0.000000,0.00,0.000000,99.750000,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1005.200012,0.0,0.0,992.674988,998.650024,0.0,0.025,68.000000,1004.650024,,0.0,0.0,0.0,0.200,91.980751,-4.357250,0.000,286.899994,100.000000,33770.648438,3.350,-3.350,0.275,0.0,1
2,2019-06-03 00:00:00,7.875,1.21975,1563.224976,0.000000,0.000,1563.224976,0.0,280.649994,0.000000,0.000000,0.00,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1004.525024,0.0,0.0,992.000000,997.974976,0.0,0.100,67.949997,1003.950012,,0.0,0.0,0.0,0.400,14.934750,-3.309500,0.000,286.950012,100.000000,13595.500000,3.050,-2.950,0.750,0.0,1
3,2019-06-03 01:00:00,8.425,1.21800,1283.425049,208.649994,0.750,1283.425049,0.0,281.674988,0.300000,526.775024,0.00,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,1004.025024,0.0,0.0,991.500000,997.449951,0.0,0.125,73.875000,1003.449951,,0.0,0.0,0.0,0.550,28.630251,-0.822500,0.000,286.750000,100.000000,2321.850098,2.725,-2.600,0.875,0.0,1
4,2019-06-03 02:00:00,8.950,1.21800,1003.500000,32468.150391,23.100,1003.500000,0.0,282.500000,11.975000,22068.949219,0.15,282.975006,84.875000,6.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0,1003.099976,0.0,0.0,990.550049,996.500000,0.0,0.100,79.925003,1002.500000,,0.0,0.0,0.0,0.250,41.997501,3.051250,0.000,286.450012,99.224998,11634.799805,2.550,-2.350,0.925,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88103,2022-05-01 18:00:00,5.650,1.25125,1280.150024,387630.375000,58.125,921.000000,0.0,275.725006,35.700001,199808.343750,0.45,3454.600098,98.500000,24.0,0.0,0.0,0.0,0.0,0.0,1.00,0.5,1017.349976,0.0,0.0,1001.750000,1007.849976,0.0,0.000,65.000000,1013.950012,,0.0,0.0,0.0,0.000,291.603760,6.131250,0.050,280.424988,100.000000,38724.476562,3.100,3.075,0.400,0.0,3
88104,2022-05-01 19:00:00,5.350,1.25350,3248.175049,96742.523438,5.675,1823.900024,0.0,274.850006,3.750000,70994.523438,0.00,790.424988,85.050003,24.0,0.0,0.0,0.0,0.0,0.0,0.50,1.0,1016.700012,0.0,0.0,1001.150024,1007.275024,0.0,0.000,63.599998,1013.424988,,0.0,0.0,0.0,0.000,304.903015,0.223000,0.075,280.000000,100.000000,38617.699219,3.425,2.975,1.700,0.0,3
88105,2022-05-01 20:00:00,5.100,1.25600,5035.875000,3593.824951,0.000,2554.350098,0.0,274.174988,0.000000,6722.825195,0.00,0.000000,72.775002,24.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1015.674988,0.0,0.0,1000.099976,1006.250000,0.0,0.000,63.400002,1012.400024,,0.0,0.0,0.0,0.000,318.553497,-4.783250,0.000,279.600006,100.000000,38236.101562,4.200,2.900,3.000,0.0,3
88106,2022-05-01 21:00:00,4.875,1.25600,2237.000000,0.000000,0.000,1458.474976,0.0,273.524994,0.000000,0.000000,0.00,0.000000,97.500000,24.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1014.349976,0.0,0.0,998.799988,1004.924988,0.0,0.000,62.900002,1011.099976,,0.0,0.0,0.0,0.000,332.671509,-8.572001,0.075,279.250000,100.000000,38384.398438,4.850,3.250,3.600,0.0,3


In [6]:
locations = [(1, 'A/X_test_estimated.parquet'),
             (2, 'B/X_test_estimated.parquet'),
             (3, 'C/X_test_estimated.parquet')]

# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for location, file_path in locations:
    # Load the dataset
    df = pd.read_parquet(file_path)
    # Assuming 'date_forecast' is a datetime column
    df['hourly_timestamp'] = df['date_forecast'].dt.floor('H')
    # Group by 'hourly_timestamp' and calculate the mean for each group
    df = df.groupby('hourly_timestamp').mean().reset_index()
    # Drop the 'hourly_timestamp' column if you don't need it in the final DataFrame
    df.drop(columns=['hourly_timestamp'], inplace=True)
    df['date_forecast'] = df['date_forecast'].dt.floor('H')
    # Store the 'Location' information in a new column
    df['Location'] = location
    # Append the processed DataFrame to the list
    processed_dataframes.append(df)

# Concatenate all processed DataFrames into one
test = pd.concat(processed_dataframes, ignore_index=True)
test

Unnamed: 0,date_calc,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,Location
0,2023-04-30 07:00:04,2023-05-01 00:00:00,4.325,1.28675,912.700012,0.000000,0.000000,1061.550049,0.0,271.650024,0.000000,0.000000,0.000000,0.000000,74.949997,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1013.674988,0.0,0.0,1000.549988,1006.799988,0.0,0.0,80.275002,1013.099976,,0.0,0.0,0.0,0.0,16.026501,-10.541000,0.000,273.799988,74.949997,29907.500000,3.950,2.100,3.350,0.0,1
1,2023-04-30 07:00:04,2023-05-01 01:00:00,4.275,1.28600,1482.099976,0.000000,0.000000,1075.100098,0.0,271.450012,0.000000,0.000000,0.000000,0.000000,77.474998,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1013.150024,0.0,0.0,1000.049988,1006.299988,0.0,0.0,79.824997,1012.599976,,0.0,0.0,0.0,0.0,30.497250,-7.894500,0.000,273.799988,77.474998,29519.074219,3.825,1.925,3.300,0.0,1
2,2023-04-30 07:00:04,2023-05-01 02:00:00,4.150,1.28375,1791.300049,0.000000,0.000000,1200.400024,0.0,271.049988,0.000000,0.000000,0.000000,0.000000,88.099998,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1012.674988,0.0,0.0,999.500000,1005.799988,0.0,0.0,78.000000,1012.049988,,0.0,0.0,0.0,0.0,44.517250,-3.815500,0.000,273.849976,88.099998,31009.125000,3.650,1.750,3.200,0.0,1
3,2023-04-30 07:00:04,2023-05-01 03:00:00,4.025,1.28200,2312.875000,10124.424805,11.675000,1179.849976,0.0,270.649994,9.375000,16845.226562,2.100000,3765.350098,68.599998,6.0,0.0,0.0,0.0,0.0,0.0,0.75,0.5,1012.174988,0.0,0.0,998.974976,1005.224976,0.0,0.0,75.625000,1011.525024,,0.0,0.0,0.0,0.0,58.083000,1.412500,0.000,273.899994,68.599998,34552.500000,3.500,1.450,3.150,0.0,1
4,2023-04-30 07:00:04,2023-05-01 04:00:00,3.900,1.28100,2198.299805,141748.593750,76.875000,920.049988,0.0,270.375000,47.400002,102209.703125,25.450001,49571.199219,66.300003,6.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0,1011.724976,0.0,0.0,998.549988,1004.750000,0.0,0.0,74.224998,1011.049988,,0.0,0.0,0.0,0.0,71.341003,7.468500,0.000,273.924988,66.300003,35483.875000,3.325,1.300,3.050,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,2023-07-02 07:00:31,2023-07-03 19:00:00,8.400,1.19675,3493.649902,475981.375000,84.775002,1967.375000,0.0,281.700012,32.200001,162567.531250,13.450001,53064.921875,87.474998,24.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0,992.150024,0.0,0.0,977.575012,983.400024,0.0,0.0,71.650002,989.299988,,0.0,0.0,0.0,0.0,306.688995,8.131001,0.100,286.850006,88.175003,41007.898438,2.175,1.900,-1.075,0.0,3
2156,2023-07-02 07:00:31,2023-07-03 20:00:00,8.600,1.20000,3078.750000,183261.343750,24.549999,1449.500000,0.0,282.024994,13.875000,82875.351562,2.400000,28523.800781,78.074997,24.0,0.0,0.0,0.0,0.0,0.0,1.00,0.5,992.575012,0.0,0.0,977.974976,983.849976,0.0,0.0,75.324997,989.750000,,0.0,0.0,0.0,0.0,319.703003,3.239500,0.000,286.450012,78.949997,41315.949219,2.200,2.000,-0.925,0.0,3
2157,2023-07-02 07:00:31,2023-07-03 21:00:00,8.875,1.20350,2308.399902,36831.074219,1.225000,1543.650024,0.0,282.350006,1.250000,27210.250000,0.000000,4322.475098,79.550003,24.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,992.950012,0.0,0.0,978.250000,984.150024,0.0,0.0,80.425003,990.075012,,0.0,0.0,0.0,0.0,333.037231,-0.477000,0.000,285.950012,80.449997,41665.898438,2.250,2.050,-0.950,0.0,3
2158,2023-07-02 07:00:31,2023-07-03 22:00:00,9.000,1.20650,2000.449951,344.575012,0.000000,1725.949951,0.0,282.600006,0.000000,2242.149902,0.000000,0.000000,100.000000,24.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,993.275024,0.0,0.0,978.650024,984.549988,0.0,0.0,83.800003,990.474976,,0.0,0.0,0.0,0.0,346.682739,-2.787750,0.075,285.450012,100.000000,39007.601562,1.875,1.700,-0.775,0.0,3


In [7]:
training = pd.concat([training, validation], axis=0)
training

Unnamed: 0,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,Location,date_calc
0,2019-06-02 22:00:00,7.700,1.22825,1728.949951,0.000000,0.000,1728.949951,0.0,280.299988,0.000,0.000000,0.00,0.000000,99.074997,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1006.299988,0.0,0.0,993.750000,999.775024,0.0,0.000,71.674995,1005.799988,,0.0,0.0,0.0,0.175,348.036743,-3.77425,0.000,286.225006,100.000000,40386.476562,3.600,-3.575,-0.500,0.0,1,NaT
1,2019-06-02 23:00:00,7.700,1.22350,1689.824951,0.000000,0.000,1689.824951,0.0,280.299988,0.000,0.000000,0.00,0.000000,99.750000,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1005.200012,0.0,0.0,992.674988,998.650024,0.0,0.025,68.000000,1004.650024,,0.0,0.0,0.0,0.200,91.980751,-4.35725,0.000,286.899994,100.000000,33770.648438,3.350,-3.350,0.275,0.0,1,NaT
2,2019-06-03 00:00:00,7.875,1.21975,1563.224976,0.000000,0.000,1563.224976,0.0,280.649994,0.000,0.000000,0.00,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,1004.525024,0.0,0.0,992.000000,997.974976,0.0,0.100,67.949997,1003.950012,,0.0,0.0,0.0,0.400,14.934750,-3.30950,0.000,286.950012,100.000000,13595.500000,3.050,-2.950,0.750,0.0,1,NaT
3,2019-06-03 01:00:00,8.425,1.21800,1283.425049,208.649994,0.750,1283.425049,0.0,281.674988,0.300,526.775024,0.00,0.000000,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,1004.025024,0.0,0.0,991.500000,997.449951,0.0,0.125,73.875000,1003.449951,,0.0,0.0,0.0,0.550,28.630251,-0.82250,0.000,286.750000,100.000000,2321.850098,2.725,-2.600,0.875,0.0,1,NaT
4,2019-06-03 02:00:00,8.950,1.21800,1003.500000,32468.150391,23.100,1003.500000,0.0,282.500000,11.975,22068.949219,0.15,282.975006,84.875000,6.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0,1003.099976,0.0,0.0,990.550049,996.500000,0.0,0.100,79.925003,1002.500000,,0.0,0.0,0.0,0.250,41.997501,3.05125,0.000,286.450012,99.224998,11634.799805,2.550,-2.350,0.925,0.0,1,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13177,2023-04-30 19:00:00,4.400,1.27550,1456.574951,84010.148438,4.175,551.224976,0.0,272.024994,2.775,54774.000000,0.00,9934.575195,97.724998,24.0,0.0,0.0,0.3,0.0,0.0,0.50,1.0,1014.900024,0.0,0.0,999.049988,1005.275024,0.0,0.000,74.800003,1011.525024,,0.0,0.0,0.0,0.000,304.936493,-0.20150,0.000,274.924988,97.724998,25028.000000,4.075,3.600,1.875,0.0,3,2023-04-29 07:00:05
13178,2023-04-30 20:00:00,4.400,1.27850,1476.349976,2206.800049,0.000,564.099976,0.0,271.950012,0.000,4984.049805,0.00,0.000000,95.449997,24.0,0.0,0.0,0.3,0.0,0.0,0.00,1.0,1014.849976,0.0,0.0,999.025024,1005.250000,0.0,0.000,76.974998,1011.549988,,0.0,0.0,0.0,0.000,318.620483,-5.20400,0.000,274.575012,95.850006,23995.599609,3.600,2.950,2.125,0.0,3,2023-04-29 07:00:05
13179,2023-04-30 21:00:00,4.400,1.27900,1516.300049,0.000000,0.000,578.700012,0.0,271.899994,0.000,0.000000,0.00,0.000000,93.925003,24.0,0.0,0.0,0.3,0.0,0.0,0.00,1.0,1014.650024,0.0,0.0,998.900024,1005.125000,0.0,0.000,77.724998,1011.400024,,0.0,0.0,0.0,0.000,332.780243,-8.98450,0.025,274.399994,95.925003,23068.599609,3.600,2.625,2.400,0.0,3,2023-04-29 07:00:05
13180,2023-04-30 22:00:00,4.400,1.27975,1240.599976,0.000000,0.000,551.500000,0.0,271.950012,0.000,0.000000,0.00,0.000000,98.375000,24.0,0.0,0.0,0.3,0.0,0.0,0.00,1.0,1014.500000,0.0,0.0,998.724976,1004.974976,0.0,0.000,79.400002,1011.224976,,0.0,0.0,0.0,0.075,347.373230,-11.27050,0.125,274.225006,99.425003,11856.700195,3.275,2.325,2.325,0.0,3,2023-04-29 07:00:05


In [8]:
training = training.merge(target[['Location', 'date_forecast', 'pv_measurement']], 
                        how='left', 
                        on=['Location', 'date_forecast'], 
                        suffixes=('', '_target'))

# Fill NaN values in 'pv_measurement' column with 0 if needed
#training['pv_measurement'].fillna(0, inplace=True)
training = training.dropna(subset=['pv_measurement'])

In [9]:
training.columns

Index(['date_forecast', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W',
       'cloud_base_agl:m', 'dew_or_rime:idx', 'dew_point_2m:K',
       'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'elevation:m', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx',
       'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx',
       'snow_melt_10min:mm', 'snow_water:kgm2', 'sun_azimuth:d',
       'sun_elevation:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms',

In [10]:
correlated_variables = [
    "air_density_2m:kgm3",
    "dew_point_2m:K",
    "t_1000hPa:K",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "diffuse_rad_1h:J",
    "sun_elevation:d",
    "direct_rad_1h:J",
    "total_cloud_cover:p",
    "fresh_snow_12h:cm",
    "fresh_snow_3h:cm",
    "is_in_shadow:idx",
    "pressure_100m:hPa",
    "pressure_50m:hPa",
    "sfc_pressure:hPa"
]

In [11]:
weird_plots = [
    "elevation:m",
    "is_day:idx",
    "snow_depth:cm",
    "snow_drift:idx",
    "snow_melt_10min:mm",
    "sun_azimuth:d",
    "wind_speed_w_1000hPa:ms",
    "ceiling_height_agl:m",
    "cloud_base_agl:m",
    "snow_density:kgm3"
]

In [12]:
from sklearn.model_selection import train_test_split

# Remove rows with NaN values
merged_data = training.dropna()
merged_data = merged_data.select_dtypes(include=['number'])

# Split the dataset into features (X) and the target variable (y)
X = merged_data.drop(columns=['pv_measurement'])  # Features
X.drop(columns=weird_plots, inplace=True)
X.drop(columns=correlated_variables, inplace=True)
y = merged_data['pv_measurement']  # Target variable

# Split the data into training and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 2: Create and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=10, random_state=10)  # You can adjust hyperparameters
rf_model.fit(X_train, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = rf_model.predict(X_test)

# Calculate the mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on validation data: {mse:.2f}')

Mean Squared Error on validation data: 9857.85


In [14]:
test = test.drop(columns=['date_calc'])
test = test.drop(columns=['date_forecast'])
test.drop(columns=weird_plots, inplace=True)
test.drop(columns=correlated_variables, inplace=True)
test.fillna(0, inplace=True)

y_pred_test = rf_model.predict(test)

In [15]:
test2 = pd.read_csv('test.csv')
test2

Unnamed: 0,id,time,prediction,location
0,0,2023-05-01 00:00:00,0,A
1,1,2023-05-01 01:00:00,0,A
2,2,2023-05-01 02:00:00,0,A
3,3,2023-05-01 03:00:00,0,A
4,4,2023-05-01 04:00:00,0,A
...,...,...,...,...
2155,2155,2023-07-03 19:00:00,0,C
2156,2156,2023-07-03 20:00:00,0,C
2157,2157,2023-07-03 21:00:00,0,C
2158,2158,2023-07-03 22:00:00,0,C


In [16]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

Unnamed: 0,id,prediction
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
2155,2155,0
2156,2156,0
2157,2157,0
2158,2158,0


In [17]:
# Example, let the predictions be random values
test2['prediction'] = y_pred_test
sample_submission = sample_submission[['id']].merge(test2[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)