In [1]:
import datetime
import json
import os
import typing

import pandas as pd
import numpy as np
import tensorflow as tf
import tqdm

def prepare_dataloader(
        dataframe: pd.DataFrame,
        target_datetimes: typing.List[datetime.datetime],
        stations: typing.Dict[typing.AnyStr, typing.Tuple[float, float, float]],
        target_time_offsets: typing.List[datetime.timedelta],
        config: typing.Dict[typing.AnyStr, typing.Any],
) -> tf.data.Dataset:
    """This function should be modified in order to prepare & return your own data loader.
    Note that you can use either the netCDF or HDF5 data. Each iteration over your data loader should return a
    2-element tuple containing the tensor that should be provided to the model as input, and the target values. In
    this specific case, you will not be able to provide the latter since the dataframe contains no GHI, and we are
    only interested in predictions, not training. Therefore, you must return a placeholder (or ``None``) as the second
    tuple element.
    Reminder: the dataframe contains imagery paths for every possible timestamp requested in ``target_datetimes``.
    However, we expect that you will use some of the "past" imagery (i.e. imagery at T<=0) for any T in
    ``target_datetimes``, but you should NEVER rely on "future" imagery to generate predictions (for T>0). We
    will be inspecting data loader implementations to ensure this is the case, and those who "cheat" will be
    dramatically penalized.
    See https://github.com/mila-iqia/ift6759/tree/master/projects/project1/evaluation.md for more information.
    Args:
        dataframe: a pandas dataframe that provides the netCDF file path (or HDF5 file path and offset) for all
            relevant timestamp values over the test period.
        target_datetimes: a list of timestamps that your data loader should use to provide imagery for your model.
            The ordering of this list is important, as each element corresponds to a sequence of GHI values
            to predict. By definition, the GHI values must be provided for the offsets given by ``target_time_offsets``
            which are added to each timestamp (T=0) in this datetimes list.
        stations: a map of station names of interest paired with their coordinates (latitude, longitude, elevation).
        target_time_offsets: the list of timedeltas to predict GHIs for (by definition: [T=0, T+1h, T+3h, T+6h]).
        config: configuration dictionary holding any extra parameters that might be required by the user. These
            parameters are loaded automatically if the user provided a JSON file in their submission. Submitting
            such a JSON file is completely optional, and this argument can be ignored if not needed.
    Returns:
        A ``tf.data.Dataset`` object that can be used to produce input tensors for your model. One tensor
        must correspond to one sequence of past imagery data. The tensors must be generated in the order given
        by ``target_sequences``.
    """
    ################################## MODIFY BELOW ##################################
    # WE ARE PROVIDING YOU WITH A DUMMY DATA GENERATOR FOR DEMONSTRATION PURPOSES.
    # MODIFY EVERYTHINGIN IN THIS BLOCK AS YOU SEE FIT

    def dummy_data_generator():
        """
        Generate dummy data for the model, only for example purposes.
        """
        batch_size = 32
        image_dim = (64, 64)
        n_channels = 5
        output_seq_len = 4

        for i in range(0, len(target_datetimes), batch_size):
            batch_of_datetimes = target_datetimes[i:i+batch_size]
            samples = tf.random.uniform(shape=(
                len(batch_of_datetimes), image_dim[0], image_dim[1], n_channels
            ))
            targets = tf.zeros(shape=(
                len(batch_of_datetimes), output_seq_len
            ))
            # Remember that you do not have access to the targets.
            # Your dataloader should handle this accordingly.
            yield (samples, samples), targets

    data_loader = tf.data.Dataset.from_generator(
        dummy_data_generator, (tf.float32, tf.float32)
    )

    ################################### MODIFY ABOVE ##################################

    return data_loader

In [2]:

import pickle
import json


def read_configuration_file(filename):
    with open(filename) as json_file:
        configuration = json.load(json_file)
    
    catalog_path = configuration['dataframe_path']
    catalog = pickle.load(open(catalog_path,"rb"))
    stations = configuration['stations']
    target_datetimes = configuration['target_datetimes']
    target_time_offsets = configuration['target_time_offsets']
    #catalog = catalog[catalog.index.isin(target_datetimes)]
    return (catalog, target_datetimes, stations, target_time_offsets)
    #g = prepare_dataloader(catalog, target_datetimes, stations, target_time_offsets, None)
    #from src import data

In [3]:
catalog, target_datetimes, stations, target_time_offsets = read_configuration_file('tests/data/dummy_test_cfg.json')

In [4]:
dl = prepare_dataloader(catalog, target_datetimes, stations, target_time_offsets, None)
isinstance(dl, tf.data.Dataset)

True

In [5]:
target_time_offsets

['P0DT0H0M0S', 'P0DT1H0M0S', 'P0DT3H0M0S', 'P0DT6H0M0S']

In [8]:
catalog.index

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 00:15:00',
               '2015-01-01 00:30:00', '2015-01-01 00:45:00',
               '2015-01-01 01:00:00', '2015-01-01 01:15:00',
               '2015-01-01 01:30:00', '2015-01-01 01:45:00',
               '2015-01-01 02:00:00', '2015-01-01 02:15:00',
               ...
               '2015-12-31 21:30:00', '2015-12-31 21:45:00',
               '2015-12-31 22:00:00', '2015-12-31 22:15:00',
               '2015-12-31 22:30:00', '2015-12-31 22:45:00',
               '2015-12-31 23:00:00', '2015-12-31 23:15:00',
               '2015-12-31 23:30:00', '2015-12-31 23:45:00'],
              dtype='datetime64[ns]', name='iso-datetime', length=35040, freq=None)

In [6]:
target_datetimes[10]

'2015-01-06T16:00:00'

In [7]:
catalog = pickle.load(open("tests/data/dummy_test_catalog.pkl","rb"))

In [73]:
catalog[catalog.index.isin(['2015-01-01 00:30:00', '2015-01-01 00:45:00',
               '2015-01-01 01:00:00', '2015-01-01 01:15:00'])]

Unnamed: 0_level_0,ncdf_path,hdf5_8bit_path,hdf5_8bit_offset,hdf5_16bit_path,hdf5_16bit_offset,BND_DAYTIME,BND_CLEARSKY_GHI,TBL_DAYTIME,TBL_CLEARSKY_GHI,DRA_DAYTIME,DRA_CLEARSKY_GHI,FPK_DAYTIME,FPK_CLEARSKY_GHI,GWN_DAYTIME,GWN_CLEARSKY_GHI,PSU_DAYTIME,PSU_CLEARSKY_GHI,SXF_DAYTIME,SXF_CLEARSKY_GHI
iso-datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 00:30:00,,/project/cq-training-1/project1/data/hdf5v7_8b...,66,/project/cq-training-1/project1/data/hdf5v5_16...,66,0,0.0,0,0.0,1,1.445237,0,0.0,0,0.0,0,0.0,0,0.0
2015-01-01 00:45:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,67,/project/cq-training-1/project1/data/hdf5v5_16...,67,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2015-01-01 01:00:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,68,/project/cq-training-1/project1/data/hdf5v5_16...,68,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2015-01-01 01:15:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,69,/project/cq-training-1/project1/data/hdf5v5_16...,69,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [84]:
catalog = pickle.load(open("tests/data/catalog-test.pkl","rb"))
catalog.index

DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 00:15:00',
               '2010-01-01 00:30:00', '2010-01-01 00:45:00',
               '2010-01-01 01:00:00', '2010-01-01 01:15:00',
               '2010-01-01 01:30:00', '2010-01-01 01:45:00',
               '2010-01-01 02:00:00', '2010-01-01 02:15:00',
               ...
               '2010-11-14 19:15:00', '2015-02-15 03:30:00',
               '2014-07-21 14:30:00', '2013-12-31 11:45:00',
               '2011-11-22 17:30:00', '2010-08-15 23:00:00',
               '2010-05-11 19:00:00', '2013-02-15 14:15:00',
               '2010-03-03 01:00:00', '2011-02-08 17:45:00'],
              dtype='datetime64[ns]', name='iso-datetime', length=2100, freq=None)

In [70]:
bli = [Timestamp('2015-01-01 02:30:00'),
 Timestamp('2015-01-01 02:45:00'),
 Timestamp('2015-01-01 03:00:00'),
 Timestamp('2015-01-01 03:15:00'),
 Timestamp('2015-01-01 03:30:00'),
 Timestamp('2015-01-01 03:45:00'),
 Timestamp('2015-01-01 04:00:00')]

NameError: name 'Timestamp' is not defined

In [96]:

blip = np.zeros((10,10))
blip[0,1] = 2
blip

array([[0., 2., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [79]:
target_datetimes = ['2014-07-21 14:30:00', '2013-12-31 11:45:00', '2011-11-22 17:30:00', '2010-08-15 23:00:00']

In [97]:
catalog[catalog.index.isin(target_datetimes)]

Unnamed: 0_level_0,ncdf_path,hdf5_8bit_path,hdf5_8bit_offset,hdf5_16bit_path,hdf5_16bit_offset,BND_DAYTIME,BND_CLEARSKY_GHI,BND_CLOUDINESS,BND_GHI,TBL_DAYTIME,...,GWN_CLOUDINESS,GWN_GHI,PSU_DAYTIME,PSU_CLEARSKY_GHI,PSU_CLOUDINESS,PSU_GHI,SXF_DAYTIME,SXF_CLEARSKY_GHI,SXF_CLOUDINESS,SXF_GHI
iso-datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-07-21 14:30:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,26,/project/cq-training-1/project1/data/hdf5v5_16...,26,1,599.918596,clear,586.106667,1,...,clear,559.646667,1,707.57424,cloudy,123.353333,1,505.626071,cloudy,236.726667
2013-12-31 11:45:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,15,/project/cq-training-1/project1/data/hdf5v5_16...,15,0,0.0,night,-3.353333,0,...,night,-2.74,0,0.0,night,-2.22,0,0.0,night,-23.68
2011-11-22 17:30:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,38,/project/cq-training-1/project1/data/hdf5v5_16...,38,1,491.948725,cloudy,35.62,1,...,cloudy,47.093333,1,470.043666,cloudy,39.246667,1,409.632339,cloudy,132.22
2010-08-15 23:00:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,60,/project/cq-training-1/project1/data/hdf5v5_16...,60,1,245.150706,clear,304.24,1,...,clear,312.386667,1,111.478788,variable,116.246667,1,363.082551,variable,302.82


In [None]:

    def test_load_metadata_target_datetimes(self):
        loader = MetadataLoader(CATALOG_PATH)
        target_datetimes = ['2014-07-21 14:30:00', '2013-12-31 11:45:00', '2011-11-22 17:30:00', '2010-08-15 23:00:00']
        target_offsets = [26, 15, 38, 60]
        metadata = loader.load(A_STATION, night_time=True, target_datetimes=target_datetimes)
        i=0
        for datapoint in metadata:
            self.assertEqual(datapoint.image_offset, target_offsets[i])
            i=i+1


In [155]:
from datetime import datetime
datetime.strptime("2014-07-21 14:30:00", "%Y-%m-%d %H:%M:%S")

datetime.datetime(2014, 7, 21, 14, 30)

In [21]:
catalog.loc[catalog.index.tolist()[0]]["BND_DAYTIME"]

0

In [29]:
target_timestamps = catalog.index.tolist()
target_timestamps

[Timestamp('2010-01-01 00:00:00'),
 Timestamp('2010-01-01 00:15:00'),
 Timestamp('2010-01-01 00:30:00'),
 Timestamp('2010-01-01 00:45:00'),
 Timestamp('2010-01-01 01:00:00'),
 Timestamp('2010-01-01 01:15:00'),
 Timestamp('2010-01-01 01:30:00'),
 Timestamp('2010-01-01 01:45:00'),
 Timestamp('2010-01-01 02:00:00'),
 Timestamp('2010-01-01 02:15:00'),
 Timestamp('2010-01-01 02:30:00'),
 Timestamp('2010-01-01 02:45:00'),
 Timestamp('2010-01-01 03:00:00'),
 Timestamp('2010-01-01 03:15:00'),
 Timestamp('2010-01-01 03:30:00'),
 Timestamp('2010-01-01 03:45:00'),
 Timestamp('2010-01-01 04:00:00'),
 Timestamp('2010-01-01 04:15:00'),
 Timestamp('2010-01-01 04:30:00'),
 Timestamp('2010-01-01 04:45:00'),
 Timestamp('2010-01-01 05:00:00'),
 Timestamp('2010-01-01 05:15:00'),
 Timestamp('2010-01-01 05:30:00'),
 Timestamp('2010-01-01 05:45:00'),
 Timestamp('2010-01-01 06:00:00'),
 Timestamp('2010-01-01 06:15:00'),
 Timestamp('2010-01-01 06:30:00'),
 Timestamp('2010-01-01 06:45:00'),
 Timestamp('2010-01-

In [33]:
catalog = pickle.load(open("tests/data/catalog-test.pkl","rb"))

In [32]:
catalog[catalog["BND_DAYTIME"]==1].loc[pd.Timestamp("2010-06-19 22:15:00")]

'/project/cq-training-1/project1/data/netcdf/GOES2010/GridSat-CONUS.goes13.2010.06.19.2215.v01.nc'

In [37]:
catalog.loc(pd.Timestamp("2015-02-13 16:00:00"))

ValueError: No axis named 2015-02-13 16:00:00 for object type <class 'pandas.core.frame.DataFrame'>

In [38]:
catalog

Unnamed: 0_level_0,ncdf_path,hdf5_8bit_path,hdf5_8bit_offset,hdf5_16bit_path,hdf5_16bit_offset,BND_DAYTIME,BND_CLEARSKY_GHI,BND_CLOUDINESS,BND_GHI,TBL_DAYTIME,...,GWN_CLOUDINESS,GWN_GHI,PSU_DAYTIME,PSU_CLEARSKY_GHI,PSU_CLOUDINESS,PSU_GHI,SXF_DAYTIME,SXF_CLEARSKY_GHI,SXF_CLOUDINESS,SXF_GHI
iso-datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 00:00:00,,,,,,0,0.000000,night,-4.100000,0,...,night,-7.500000,0,0.000000,night,-1.500000,0,0.000000,night,-3.800000
2010-01-01 00:15:00,,,,,,0,0.000000,night,-4.100000,0,...,night,-7.493333,0,0.000000,night,-1.446667,0,0.000000,night,-3.893333
2010-01-01 00:30:00,,,,,,0,0.000000,night,-4.100000,0,...,night,-7.420000,0,0.000000,night,-1.766667,0,0.000000,night,-4.166667
2010-01-01 00:45:00,,,,,,0,0.000000,night,-4.100000,0,...,night,-7.533333,0,0.000000,night,-2.033333,0,0.000000,night,-4.040000
2010-01-01 01:00:00,,,,,,0,0.000000,night,-4.100000,0,...,night,-7.586667,0,0.000000,night,-2.006667,0,0.000000,night,-4.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-08-15 23:00:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,60,/project/cq-training-1/project1/data/hdf5v5_16...,60,1,245.150706,clear,304.240000,1,...,clear,312.386667,1,111.478788,variable,116.246667,1,363.082551,variable,302.820000
2010-05-11 19:00:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,44,/project/cq-training-1/project1/data/hdf5v5_16...,44,1,863.057654,variable,722.440000,1,...,variable,871.813333,1,829.535316,cloudy,94.206667,1,897.140557,cloudy,204.586667
2013-02-15 14:15:00,/project/cq-training-1/project1/data/netcdf/GO...,/project/cq-training-1/project1/data/hdf5v7_8b...,25,/project/cq-training-1/project1/data/hdf5v5_16...,25,1,207.685968,cloudy,57.306667,1,...,variable,244.513333,1,338.500992,cloudy,120.926667,1,73.395188,slightly cloudy,44.460000
2010-03-03 01:00:00,,/project/cq-training-1/project1/data/hdf5v7_8b...,68,/project/cq-training-1/project1/data/hdf5v5_16...,68,0,0.000000,night,-3.500000,0,...,night,-4.686667,0,0.000000,night,-2.346667,0,0.000000,night,-5.253333


In [108]:
import pvlib
from pvlib import clearsky, atmosphere, solarposition
from pvlib.location import Location
from pvlib.iotools import read_tmy3
stations

{'BND': [40.05192, -88.37309, 230],
 'TBL': [40.12498, -105.2368, 1689],
 'DRA': [36.62373, -116.01947, 1007],
 'FPK': [48.30783, -105.1017, 634],
 'GWN': [34.2547, -89.8729, 98],
 'PSU': [40.72012, -77.93085, 376],
 'SXF': [43.73403, -96.62328, 473]}

In [212]:
bnd = Location(latitude = stations["BND"][0], longitude = stations["BND"][1], altitude=stations["BND"][2])
ghi = bnd.get_clearsky( pd.date_range(start='2010-01-01 13:00:00', periods=7, freq="1H"))["ghi"]

In [214]:
catalog[catalog.index.isin(ghi.index)][["BND_GHI"]]

Unnamed: 0_level_0,BND_GHI
iso-datetime,Unnamed: 1_level_1
2010-01-01 13:00:00,-3.58
2010-01-01 14:00:00,29.106667
2010-01-01 15:00:00,211.873333
2010-01-01 16:00:00,356.273333
2010-01-01 17:00:00,452.686667
2010-01-01 18:00:00,480.426667
2010-01-01 19:00:00,481.046667


In [9]:
ghi = bnd.get_clearsky( pd.date_range(start='2012-10-17 18:00:00', periods=7, freq="1H"))["ghi"]
ghi.index.tolist()

NameError: name 'bnd' is not defined

In [198]:
catalog.index.isin(ghi.index.tolist()).sum()

0

In [126]:
 pd.date_range(start='2014-07-21 14:30:00', end='2014-07-21 20:30:00', periods=1 )

DatetimeIndex(['2014-07-21 14:30:00'], dtype='datetime64[ns]', freq=None)

In [121]:
pd.DataFrame(target_datetimes).set_index(0)

2014-07-21 14:30:00
2013-12-31 11:45:00
2011-11-22 17:30:00
2010-08-15 23:00:00


In [178]:
class AutoNumber(IntEnum):
    def __new__(cls):
        value = len(cls.__members__)
        obj = int.__new__(cls)
        obj._value_ = value
        return obj

In [179]:
from enum import IntEnum
class Targets(AutoNumber):
    """Mapping for the targets to their location in the target tensor."""
    GHI_T = ()
    GHI_T_1h = ()
    GHI_T_3h = ()
    GHI_T_6h = ()

In [180]:
len(Targets)

4

In [183]:
Targets.GHI_T_3h

<Targets.GHI_T_3h: 2>