In [2]:
import climatedata_functions as climf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta, datetime
from tqdm import tqdm


# Joining the weather data

This notebook joins weather data sampled at four times of day (0:00, 6:00, 12:00, and 18:00) over seven days. In the input files, each parameter ('t' for temperature, 'si10' for wind, and 'dswrf' for radiation) is stored in one column per time of day, and each cell of such a column contains seven values (the day of sampling, then 1, 2, 3, 4, 5, and 6 days before sampling). This notebook combines these values into a single column per parameter that contains all 28 values for all dates and time points, beginning with the data from 6 days before sampling (0:00, 6:00, 12:00, 18:00).

## Temperature

In [3]:
# Columns that are not needed for the temperature series (old index columns,
# grid helpers and the 01:00 temperature column).
features = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1','longitude_trans','x_grid','y_grid','temp_01']

# Sample metadata; the unneeded columns are removed after all files are read.
metadata = pd.read_csv("../data/metadata_temp_12_01_06_complete_without_a.csv")

# Separate files that provide the 't_0' and 't_18' columns merged in below.
temp_00 = pd.read_csv("../data/temp_00.csv")
temp_18 = pd.read_csv("../data/temp_18.csv")

metadata = metadata.drop(columns=features)

In [4]:
# Attach the 0:00 and 18:00 temperature columns by sample id (each uid must
# occur exactly once on both sides) and give the columns already present in
# the metadata the same 't_<hour>' naming.
metadata = (
    metadata
    .merge(temp_00[['uid', 't_0']], how="inner", on='uid', validate="1:1")
    .merge(temp_18[['uid', 't_18']], how="inner", on='uid', validate="1:1")
    .rename(columns={'temp_06': 't_6', 'temp_12': 't_12'})
)

# The time-point columns are passed through the project helper that, going by
# its name, turns the stringified lists from the CSV files into Python lists.
features = ['t_0', 't_6', 't_12', 't_18']
temp = climf.convert_str_to_list(metadata, features)
temp.head(1)

Unnamed: 0,uid,latitude,longitude,date,split,t_12,t_6,t_0,t_18
0,aabm,39.080319,-86.430867,2018-05-14,train,"[287.3912, 287.32434, 287.29132, 287.193, 287....","[287.36844, 287.32245, 287.33054, 287.1744, 28...","[287.33194, 287.3078, 287.20935, 287.0335, 286...","[287.39413, 287.3707, 287.28082, 287.17728, 28..."


In [5]:
# Persist the merged per-time-point temperature table.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
temp.to_csv('../data/temperature.csv', index=False)

In [None]:
# Temperature: join the four time-of-day columns of every row into one series
# and store its string representation in the new 'temp' column.
#
# The values are collected in row order and assigned in one step. This matches
# the earlier per-uid assignment because 'uid' is unique in `temp` (enforced by
# the validate="1:1" merges above), and it avoids scanning the whole frame once
# per row. The per-row debug print was removed so the progress bar stays readable.
temp['temp'] = [
    str(climf.join_time_values(row.t_0, row.t_6, row.t_12, row.t_18))
    for row in tqdm(temp.itertuples(), total=len(temp))
]

In [7]:
# The per-time-point columns are no longer needed once the joined 'temp'
# column exists; keep only the joined series and write it to disk.
time_point_columns = ['t_12', 't_6', 't_0', 't_18']
temp = temp.drop(columns=time_point_columns)
temp.to_csv('../data/temperature_series.csv', index=False)
temp.head(1)

Unnamed: 0,uid,latitude,longitude,date,split,temp
0,aabm,39.080319,-86.430867,2018-05-14,train,"[286.6784, 286.74725, 286.69934, 286.7568, 286..."


### Join with data from 14 and 15 days prior to sampling

In [8]:
# Columns of the earlier-period file that are not used below.
features = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'temp_01', 'temp_12', 'temp_06', 'dswrf_0']

# Temperature values from the earlier period (two values per time point,
# as the preview below shows), without the unused columns.
temp_before = pd.read_csv("../data/t_18_14_15_d_before.csv").drop(columns=features)
temp_before.head(1)

Unnamed: 0,uid,latitude,longitude,date,split,longitude_trans,x_grid,y_grid,t_0,t_6,t_12,t_18
0,aabm,39.080319,-86.430867,2018-05-14,train,273.569133,570,1217,"[286.12405, 286.0968]","[286.211, 286.15918]","[286.21497, 286.1375]","[286.18774, 286.1701]"


In [9]:
# Bring the earlier-period time-point columns and the grid coordinates into
# `temp`, matching rows one-to-one on the sample id.
columns_from_before = ['uid', 't_0', 't_6', 't_12', 't_18', 'x_grid', 'y_grid']
temp = temp.merge(
    temp_before[columns_from_before],
    how="inner",
    on='uid',
    validate="1:1",
)
temp.head(1)

Unnamed: 0,uid,latitude,longitude,date,split,temp,t_0,t_6,t_12,t_18,x_grid,y_grid
0,aabm,39.080319,-86.430867,2018-05-14,train,"[286.6784, 286.74725, 286.69934, 286.7568, 286...","[286.12405, 286.0968]","[286.211, 286.15918]","[286.21497, 286.1375]","[286.18774, 286.1701]",570,1217


In [10]:
# There was one erroneous line in the earlier-period data; the values for its
# sampling date are downloaded again here and written back into `temp`.
from datetime import timedelta
import warnings

# NOTE(review): this silences every warning for the rest of the kernel session,
# not only those raised in this cell.
warnings.simplefilter(action='ignore')

days = 2   # how many days to go back (starting 14 days before sampling)
hour = 12  # which hour of the day (UTC time!); hours of interest: 6, 12, 18, 24
# Parameter and layer to fetch, e.g. ':TMP:surface' (surface temperature) or
# ':DSWRF:surface'. Available parameters and layers:
# https://www.nco.ncep.noaa.gov/pmb/products/hrrr/hrrr.t00z.wrfsfcf00.grib2.shtml
param_layer = ':TMP:surface'
# Name of the column in the metadata and of the variable in the grib2 data:
# 't' for temperature, 'si10' for wind 10 m above ground, 'dswrf' for radiation.
forecast_param = 't'
target_column = forecast_param + '_' + str(hour)

# Sampling date of the erroneous row (hard-coded row position 4853).
start_date = temp.date.iloc[4853]
# Index labels of all rows that share this sampling date.
date_index_list = temp.index[temp.date == start_date]
# One list with `days` slots per affected row to collect the fetched values.
temp_list = [[0] * days for _ in range(len(date_index_list))]
start_date = climf.get_start_date(start_date)  # convert to a date/time object
print(start_date)

for x in range(days):
    # x = x*2 (if we want to take only every second day)
    day_date = start_date - timedelta(days=x + 14)
    # Array for the requested date/hour plus a flag that is true when no data
    # could be provided (in that case NaN is stored instead).
    ds, stop = climf.get_ds_aws_array(day_date, hour, param_layer, forecast_param)
    for count, index in enumerate(date_index_list):
        x_grid = temp.x_grid[index]
        y_grid = temp.y_grid[index]
        if stop == True:
            temp_list[count][x] = np.nan
        else:
            temp_list[count][x] = ds[x_grid][y_grid]

# Write the collected values back once all days are fetched. `.at` assigns
# directly into `temp`; the former chained form `temp[col].loc[idx] = ...`
# may operate on a temporary copy and leave `temp` unchanged.
for count, index in enumerate(date_index_list):
    temp.at[index, target_column] = temp_list[count]


2020-09-15


In [11]:
# Run the time-point columns through both project converters in sequence
# (first to strings, then to lists) so all rows go through the same conversion.
features = ['t_0', 't_6', 't_12', 't_18']
temp = climf.convert_str_to_list(
    climf.convert_to_str(temp, features),
    features,
)
temp.head(1)

Unnamed: 0,uid,latitude,longitude,date,split,temp,t_0,t_6,t_12,t_18,x_grid,y_grid
0,aabm,39.080319,-86.430867,2018-05-14,train,"[286.6784, 286.74725, 286.69934, 286.7568, 286...","[286.12405, 286.0968]","[286.211, 286.15918]","[286.21497, 286.1375]","[286.18774, 286.1701]",570,1217


In [None]:
# Temperature (earlier period): join the four time-of-day columns of every row
# into one series and store its string representation in 'temp_14_15d_before'.
#
# The values are collected in row order and assigned in one step. This matches
# the earlier per-uid assignment because 'uid' is unique in `temp` (enforced by
# the validate="1:1" merge above), and it avoids scanning the whole frame once
# per row. The per-row debug print was removed so the progress bar stays readable.
temp['temp_14_15d_before'] = [
    str(climf.join_time_values(row.t_0, row.t_6, row.t_12, row.t_18))
    for row in tqdm(temp.itertuples(), total=len(temp))
]
temp.head(3)

In [13]:
# Remove the per-time-point and grid helper columns, then write the table with
# both joined series. This overwrites the temperature_series.csv saved earlier.
helper_columns = ['t_12', 't_6', 't_0', 't_18', 'x_grid', 'y_grid']
temp = temp.drop(columns=helper_columns)
temp.to_csv('../data/temperature_series.csv', index=False)


## Wind

In [14]:
# One CSV per time of day (0:00, 6:00, 12:00, 18:00) with the wind values.
wind_0, wind_6, wind_12, wind_18 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/wind_0.csv",
        "../data/metadata_si10_6_complete.csv",
        "../data/metadata_si10_12_complete.csv",
        "../data/metadata_si10_18_complete.csv",
    )
)

In [15]:
# Start from the 0:00 table and attach the other three time-point columns by
# sample id; every uid must occur exactly once on both sides of each merge.
wind = (
    wind_0
    .merge(wind_6[['uid', 'si10_6']], how="inner", on='uid', validate="1:1")
    .merge(wind_12[['uid', 'si10_12']], how="inner", on='uid', validate="1:1")
    .merge(wind_18[['uid', 'si10_18']], how="inner", on='uid', validate="1:1")
)

# Pass the time-point columns through the project string-to-list converter.
features = ['si10_0','si10_6','si10_12','si10_18']
wind = climf.convert_str_to_list(wind, features)
wind.head(2)

Unnamed: 0,uid,latitude,longitude,date,split,longitude_trans,x_grid,y_grid,si10_0,si10_6,si10_12,si10_18
0,aabm,39.080319,-86.430867,2018-05-14,train,273.569133,570,1217,"[3.2827246, 3.181132, 4.5628977, 1.469874, 5.1...","[2.754265, 2.4434078, 4.9966908, 1.8574135, 4....","[3.8764026, 4.698638, 4.848489, 2.2207417, 2.2...","[3.8426483, 3.9012308, 5.519078, 5.944332, 4.4..."
1,aabn,36.5597,-121.51,2016-08-31,test,238.49,550,192,"[nan, nan, 7.3650117, 6.7087016, 6.459569, nan...","[4.7222066, 5.3414426, 4.505594, 3.556739, nan...","[4.2186155, 4.8103733, 4.7934113, 3.5959115, 3...","[nan, 6.190797, nan, 5.934439, 6.4784923, 6.44..."


In [None]:
# Wind: join the four time-of-day columns of every row into one series and
# store its string representation in the new 'wind' column.
#
# The values are collected in row order and assigned in one step. This matches
# the earlier per-uid assignment because 'uid' is unique in `wind` (enforced by
# the validate="1:1" merges above), and it avoids scanning the whole frame once
# per row.
wind['wind'] = [
    str(climf.join_time_values(row.si10_0, row.si10_6, row.si10_12, row.si10_18))
    for row in tqdm(wind.itertuples(), total=len(wind))
]

In [44]:
# Only the joined 'wind' column is kept; the per-time-point columns are dropped
# before the table is written to disk.
wind_time_point_columns = ['si10_0','si10_6','si10_12','si10_18']
wind = wind.drop(columns=wind_time_point_columns)
wind.to_csv('../data/wind_series.csv', index=False)

## Radiation

In [17]:
# One CSV per available time of day with the radiation ('dswrf') values.
rad_0, rad_6, rad_12, rad_17, rad_18 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/metadata_dswrf_0.csv",
        "../data/metadata_dswrf_6.csv",
        "../data/metadata_dswrf_12.csv",
        "../data/metadata_dswrf_17.csv",
        "../data/metadata_dswrf_18.csv",
    )
)


In [18]:
# Start from the 17:00 table and attach the 0:00 and 18:00 columns by sample
# id; every uid must occur exactly once on both sides of each merge.
rad = (
    rad_17
    .merge(rad_0[['uid', 'dswrf_0']], how="inner", on='uid', validate="1:1")
    .merge(rad_18[['uid', 'dswrf_18']], how="inner", on='uid', validate="1:1")
)
# TODO: add later, once the data is ready:
#   rad = rad.merge(rad_6[['uid','dswrf_6']] , how="inner", on='uid' , validate="1:1")
#   rad = rad.merge(rad_12[['uid','dswrf_12']] , how="inner", on='uid' , validate="1:1")

# Run the columns through both project converters in sequence (to strings,
# then to lists). Add later: 'dswrf_12', 'dswrf_6'.
features = ['dswrf_0','dswrf_17','dswrf_18']
rad = climf.convert_str_to_list(
    climf.convert_to_str(rad, features),
    features,
)
rad.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,uid,latitude,longitude,date,split,longitude_trans,x_grid,y_grid,dswrf_17,dswrf_0,dswrf_18
0,0,0,aabm,39.080319,-86.430867,2018-05-14,train,273.569133,570,1217,"[944.0, 946.0, 147.0, 961.0, 963.0, 391.0, 994.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[902.0, 243.0, 274.0, 911.0, 916.0, 905.0, 948.0]"


In [None]:
# Radiation: join the three available time-of-day columns (0:00, 17:00, 18:00)
# of every row into one series and store its string representation in the new
# 'rad_0_17_18' column.
# TODO: switch to the four-column join once the 12:00 data is available.
#
# The values are collected in row order and assigned in one step. This matches
# the earlier per-uid assignment because 'uid' is unique in `rad` (enforced by
# the validate="1:1" merges above), and it avoids scanning the whole frame once
# per row.
rad['rad_0_17_18'] = [
    str(climf.join_time_values_three(row.dswrf_0, row.dswrf_17, row.dswrf_18))
    for row in tqdm(rad.itertuples(), total=len(rad))
]

In [63]:
# Grid helpers and leftover index columns are not part of the final table.
features = ['x_grid', 'y_grid','longitude_trans', 'Unnamed: 0.1', 'Unnamed: 0' ]
rad = rad.drop(columns=features)
rad.head()

Unnamed: 0,uid,latitude,longitude,date,split,dswrf_17,dswrf_0,dswrf_18,rad_0_17_18
0,aabm,39.080319,-86.430867,2018-05-14,train,"[944.0, 946.0, 147.0, 961.0, 963.0, 391.0, 994.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[902.0, 243.0, 274.0, 911.0, 916.0, 905.0, 948.0]","[0.0, 994.0, 948.0, 0.0, 391.0, 905.0, 0.0, 96..."
1,aabn,36.5597,-121.51,2016-08-31,test,"[nan, 771.0, 630.0, 748.2203, 597.0, 781.0, 56...","[nan, nan, 265.8, 274.1, 265.6, nan, nan]","[nan, 875.0, nan, 854.0, 850.0, 809.0, nan]","[nan, 565.0, nan, nan, 781.0, 809.0, 265.6, 59..."
2,aacd,35.875083,-78.878434,2020-11-19,train,"[574.4, 585.9, 583.6, 588.1, 114.4, 558.1, 380.8]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[482.8, 471.6, 476.4, 496.7, 109.3, 492.2, 393.0]","[0.0, 380.8, 393.0, 0.0, 558.1, 492.2, 0.0, 11..."
3,aaee,35.487,-79.062133,2016-08-24,train,"[nan, nan, 966.0, 941.0, 80.0, 938.0, 473.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[nan, nan, 890.0, 377.0, nan, 868.0, 875.0]","[0.0, 473.0, 875.0, 0.0, 938.0, 868.0, 0.0, 80..."
4,aaff,38.049471,-99.827001,2019-07-23,train,"[991.0, 981.0, 848.0, 982.0, 976.0, 972.0, 975.0]","[126.3, 122.6, 126.7, 128.4, 129.3, 128.7, 129.4]","[1008.0, 975.0, 916.0, 996.0, 988.0, 987.0, 99...","[129.4, 975.0, 990.0, 128.7, 972.0, 987.0, 129..."


In [64]:
# Write the radiation table (per-time-point columns plus the joined series).
rad.to_csv(path_or_buf='../data/radiation.csv', index=False)