In [2]:
# import packages 

import numpy as np
import os
import datetime
import time
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# --- Locations on disk: everything hangs off <working directory>/data ---
project_path = os.getcwd()
data_folder = os.path.join(project_path, 'data')
pv_data_path = os.path.join(data_folder, 'pv_data', 'pv_output_valid.pkl')
pred_folder = os.path.join(data_folder, 'data_forecast')
data_path = os.path.join(data_folder, 'video_prediction_224.h5')

# strftime/strptime pattern of the timestamp encoded in image file names
image_name_format = '%Y%m%d%H%M%S'

# --- Operating parameters ---
stack_height = 15            # minutes of history stacked per sample
forecast_horizon = 15        # minutes ahead that the forecast targets
sampling_interval_all = [2]  # candidate sampling intervals
output_img_shape = [224, 224, 3]

# Time window as a half-open interval [start_date, end_date): 2017 data only
start_date = datetime.datetime(year=2017, month=1, day=1)  # inclusive
end_date = datetime.datetime(year=2018, month=1, day=1)    # exclusive

In [3]:
# Hand-picked held-out days, given as (year, month, day): ten sunny, ten cloudy
sunny_day = [(2017,9,15),(2017,10,6),(2017,10,22),(2018,2,16),(2018,6,12),(2018,6,23),(2019,1,25),(2019,6,23),(2019,7,14),(2019,10,14)]
cloudy_day = [(2017,6,24),(2017,9,20),(2017,10,11),(2018,1,25),(2018,3,9),(2018,10,4),(2019,5,27),(2019,6,28),(2019,8,10),(2019,10,19)]

# Each tuple becomes a datetime at midnight of that day
sunny_datetime = [datetime.datetime(*ymd) for ymd in sunny_day]
cloudy_datetime = [datetime.datetime(*ymd) for ymd in cloudy_day]

# Sunny days first, cloudy days after
test_dates = [*sunny_datetime, *cloudy_datetime]

In [4]:
# Display the combined list of test days (sunny days followed by cloudy days)
test_dates

[datetime.datetime(2017, 9, 15, 0, 0),
 datetime.datetime(2017, 10, 6, 0, 0),
 datetime.datetime(2017, 10, 22, 0, 0),
 datetime.datetime(2018, 2, 16, 0, 0),
 datetime.datetime(2018, 6, 12, 0, 0),
 datetime.datetime(2018, 6, 23, 0, 0),
 datetime.datetime(2019, 1, 25, 0, 0),
 datetime.datetime(2019, 6, 23, 0, 0),
 datetime.datetime(2019, 7, 14, 0, 0),
 datetime.datetime(2019, 10, 14, 0, 0),
 datetime.datetime(2017, 6, 24, 0, 0),
 datetime.datetime(2017, 9, 20, 0, 0),
 datetime.datetime(2017, 10, 11, 0, 0),
 datetime.datetime(2018, 1, 25, 0, 0),
 datetime.datetime(2018, 3, 9, 0, 0),
 datetime.datetime(2018, 10, 4, 0, 0),
 datetime.datetime(2019, 5, 27, 0, 0),
 datetime.datetime(2019, 6, 28, 0, 0),
 datetime.datetime(2019, 8, 10, 0, 0),
 datetime.datetime(2019, 10, 19, 0, 0)]

In [5]:
def find_idx_with_dates(all_times,test_dates):
    """Return the positions in `all_times` that fall on any of the given days.

    Parameters
    ----------
    all_times : array-like of datetimes
        Timestamps to search; must support element-wise comparison with a
        `datetime.datetime`.
    test_dates : iterable of datetime.datetime
        Start of each day to select (midnight).

    Returns
    -------
    list of int
        Positions whose timestamp lies in the half-open interval
        [day, day + 1 day) for some day in `test_dates`, grouped in the order
        the days are listed. Empty when `test_dates` is empty.
    """
    idx = []
    one_day = datetime.timedelta(days=1)
    for test_day in test_dates:
        # The lower bound is inclusive so that a timestamp exactly at midnight
        # belongs to its own day rather than silently falling out of the set.
        in_day = (all_times >= test_day) & (all_times < test_day + one_day)
        idx.extend(np.nonzero(in_day)[0].tolist())
    return idx

# These two functions do the same thing; one is for NumPy arrays, the other for pandas Series.

def find_time_within_nparray(time_array,time_point):
    """Locate `time_point` in the sorted sequence `time_array`.

    Returns the position of the exact match, or None when `time_point`
    is not present (including when it lies past the last element).
    """
    insert_pos = np.searchsorted(time_array, time_point)

    # searchsorted returns len(time_array) when the point sorts after every
    # element; only an in-range slot can hold an exact match.
    in_range = insert_pos != len(time_array)
    if in_range and time_array[insert_pos] == time_point:
        return insert_pos
    return None

def find_time_within_pdseries(time_array,time_point):
    """Locate `time_point` in a sorted pandas Series (or other sorted sequence).

    Parameters
    ----------
    time_array : pandas.Series or array-like
        Sorted values to search.
    time_point : scalar
        Value to look for.

    Returns
    -------
    int or None
        Position of the exact match, or None when `time_point` is absent
        (including when it sorts after every element).
    """
    probable_idx = np.searchsorted(time_array, time_point)

    # The point sorts after every element: no match possible
    if probable_idx == len(time_array):
        return None

    # searchsorted yields a *position*, so the candidate must be read
    # positionally. Plain `series[i]` is a label lookup and fails (or reads the
    # wrong element) whenever the Series index is not the default 0..n-1 range.
    positional = getattr(time_array, 'iloc', time_array)
    if positional[probable_idx] == time_point:
        return probable_idx
    return None

In [6]:
def store_trainval_test(all_times,image_log,pv_log,pv_pred,pred_folder):
    """Split the samples into a test set and a trainval set and save both as .npy files.

    Samples whose timestamp falls on one of the module-level `test_dates`
    (selected with `find_idx_with_dates`) form the test set; every other sample
    goes to the trainval set.

    Parameters
    ----------
    all_times : array of datetimes, one per sample
    image_log : array of image stacks, indexed by sample along the first axis
    pv_log : array of PV histories, indexed by sample along the first axis
    pv_pred : array of PV targets, one per sample
    pred_folder : str
        Destination folder; created if it does not exist yet.

    Writes `<name>_trainval.npy` and `<name>_test.npy` for each of
    image_log, pv_log, pv_pred and times. Returns None.
    """
    ## Splitting into Trainval and Test set
    idx_test = find_idx_with_dates(all_times,test_dates)

    # Everything not selected for testing becomes the trainval set
    mask_trainval = np.ones_like(pv_pred,dtype = bool)
    mask_trainval[idx_test] = False

    splits = (
        ('trainval', mask_trainval),
        ('test', idx_test),
    )
    sources = {
        'times': all_times,
        'image_log': image_log,
        'pv_log': pv_log,
        'pv_pred': pv_pred,
    }
    subsets = {
        split: {name: values[selector] for name, values in sources.items()}
        for split, selector in splits
    }

    for split, _ in splits:
        for name in ('times', 'image_log', 'pv_log', 'pv_pred'):
            print("{0}_{1}.shape".format(name, split), subsets[split][name].shape)

    ## Storing information
    # np.save does not create missing directories, so make sure the target
    # exists first (an empty path means "current directory" and needs nothing).
    if pred_folder:
        os.makedirs(pred_folder, exist_ok=True)

    for split, _ in splits:
        for name in ('image_log', 'pv_log', 'pv_pred', 'times'):
            target = os.path.join(pred_folder, '{0}_{1}.npy'.format(name, split))
            np.save(target, subsets[split][name])

In [6]:
# Load the high-frequency data.
# The images here are the ones that have a corresponding PV value.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
expanded_dir = os.path.join(data_folder, 'data_expanded')
all_times = np.load(os.path.join(expanded_dir, 'all_times_highfreq.npy'), allow_pickle=True)
all_images = np.load(os.path.join(expanded_dir, 'all_images_highfreq.npy'), allow_pickle=True)
pv_data = np.load(pv_data_path, allow_pickle=True)
# Alternative PV source (disabled):
#pv_data = np.load(os.path.join(data_folder,'data_expanded','pv_outputs_highfreq.npy'), allow_pickle=True)

# Optional restriction to [start_date, end_date) (disabled):
#relevant_mask = (all_times>=start_date)&(all_times<end_date)
#all_times = all_times[relevant_mask]
#all_images = all_images[relevant_mask]
#pv_data = pv_data[start_date:end_date]

# One timestamp per image, so this is the number of images
n_images = all_times.shape[0]

In [7]:
# Quick look at the loaded PV data
print(pv_data)

2017-01-03 08:14:20    0.010697
2017-01-03 08:14:30    0.023494
2017-01-03 08:14:40    0.036292
2017-01-03 08:14:50    0.049089
2017-01-03 08:15:00    0.061887
                         ...   
2019-10-26 17:58:40    0.020961
2019-10-26 17:58:50    0.015540
2019-10-26 17:59:00    0.010119
2019-10-26 17:59:10    0.005942
2019-10-26 17:59:20    0.001765
Length: 3887473, dtype: float64


In [8]:
# Number of entries along the first axis of all_times
print(n_images)

363375


In [9]:
# Inspect the index of the PV data (used below for timestamp lookups)
print(pv_data.index)

DatetimeIndex(['2017-01-03 08:14:20', '2017-01-03 08:14:30',
               '2017-01-03 08:14:40', '2017-01-03 08:14:50',
               '2017-01-03 08:15:00', '2017-01-03 08:15:10',
               '2017-01-03 08:15:20', '2017-01-03 08:15:30',
               '2017-01-03 08:15:40', '2017-01-03 08:15:50',
               ...
               '2019-10-26 17:57:50', '2019-10-26 17:58:00',
               '2019-10-26 17:58:10', '2019-10-26 17:58:20',
               '2019-10-26 17:58:30', '2019-10-26 17:58:40',
               '2019-10-26 17:58:50', '2019-10-26 17:59:00',
               '2019-10-26 17:59:10', '2019-10-26 17:59:20'],
              dtype='datetime64[ns]', length=3887473, freq=None)


In [11]:
# Create forecast training data file
#
# For every accepted image timestamp t this cell stores:
#   image_log[k] : the images at t, t-1min, ..., t-stack_height min
#   pv_log[k]    : the PV values at those same times
#   pv_pred[k]   : the PV value at t + forecast_horizon minutes
# A timestamp is accepted when it is far enough from the previously accepted one
# and every one of the lookups above finds an exact match.
import h5py

sampling_interval = 2   # minutes between two accepted samples
batch_size = 7000       # images assembled in memory before each write to disk
resume_idx = 0          # non-zero: image index at which to continue an interrupted run

# NOTE(review): the configuration cell defines `data_path` (inside data_folder) for a
# file of this name, but the output is written relative to the working directory.
# Confirm which location downstream readers expect before changing it.
h5_filename = 'video_prediction_224.h5'

# 'w' truncates the file, which would erase the datasets a resumed run needs,
# so the file is opened in append mode whenever a resume index is given.
h5_mode = 'a' if resume_idx else 'w'

# An image qualifies as a new sample once it is more than
# (sampling_interval minutes - 1 second) later than the last accepted sample.
sampling_interval_td = datetime.timedelta(minutes = sampling_interval) - datetime.timedelta(seconds=1)
forecast_td = datetime.timedelta(minutes=forecast_horizon)
stack_len = stack_height + 1

with h5py.File(h5_filename, h5_mode) as f:

    resuming = bool(resume_idx) and 'image_log' in f

    if resuming:
        # Resume mode - datasets already exist
        image_log_ds = f['image_log']
        pv_log_ds = f['pv_log']
        pv_pred_ds = f['pv_pred']
        # Progress markers are written at the end of every batch (see below).
        # Without them the write position is unknown and rows would be overwritten.
        if 'n_valid' not in f.attrs or 'last_valid_index' not in f.attrs:
            raise RuntimeError(
                "Cannot resume: '{0}' carries no progress attributes".format(h5_filename))
        curr_size = int(f.attrs['n_valid'])
        last_valid_index = int(f.attrs['last_valid_index'])
        print(f"Resuming from existing datasets. Current size: {image_log_ds.shape[0]}")
    else:
        # First run - create new datasets, pre-allocated for the worst case
        # in which every image yields a valid sample
        image_log_ds = f.create_dataset(
            'image_log',
            shape=(n_images, stack_len, *output_img_shape),
            compression = "gzip",
            dtype='uint8'
        )
        pv_log_ds = f.create_dataset(
            'pv_log',
            shape=(n_images, stack_len),
            dtype='float64'
        )
        pv_pred_ds = f.create_dataset(
            'pv_pred',
            shape=(n_images,),
            dtype='float64'
        )
        curr_size = 0
        last_valid_index = 0
        print("Creating new datasets")

    for b in range(resume_idx if resuming else 0, n_images, batch_size):
        current_batch_size = min(batch_size, n_images-b)

        # Per-batch buffers; rows of rejected samples are dropped before writing
        image_log_batch = np.zeros([current_batch_size,stack_len]+output_img_shape,dtype = 'uint8')
        pv_log_batch = np.zeros((current_batch_size,stack_len))
        pv_pred_batch = np.zeros(current_batch_size)
        validity_mask = np.ones(current_batch_size,dtype = bool)

        for i in range(current_batch_size):
            count = b+i   # position in the full arrays; i is the position in this batch

            # Between two sampling points: discard
            if not (all_times[count] - all_times[last_valid_index] > sampling_interval_td):
                validity_mask[i] = False
                continue

            # Collecting ground truth for the predicted value
            pv_pred_idx = find_time_within_nparray(pv_data.index, all_times[count] + forecast_td)
            if pv_pred_idx is None:
                # No point assembling the stack for a sample that is already rejected
                validity_mask[i] = False
                continue
            pv_pred_batch[i] = pv_data.iloc[pv_pred_idx]

            # Collecting image log and PV log, newest entry first
            for j in range(stack_len):
                log_time = all_times[count] - datetime.timedelta(minutes = j)

                log_time_idx = find_time_within_nparray(all_times,log_time)
                if log_time_idx is None:
                    validity_mask[i] = False
                    break
                image_log_batch[i,j] = all_images[log_time_idx]

                pv_log_idx = find_time_within_nparray(pv_data.index,log_time)
                if pv_log_idx is None:
                    validity_mask[i] = False
                    break
                pv_log_batch[i,j] = pv_data.iloc[pv_log_idx]

            if validity_mask[i]:
                # Must be the global position: it is used to index all_times above.
                # Storing the batch-local i breaks the interval check from the
                # second batch onwards.
                last_valid_index = count

        # Prompt progress of current work (the last batch may be shorter)
        print('processed {0}/{1} images'.format(b+current_batch_size,len(all_times)))

        # Only pick out the valid time points
        image_log_batch = image_log_batch[validity_mask]
        pv_log_batch = pv_log_batch[validity_mask]
        pv_pred_batch = pv_pred_batch[validity_mask]

        # Store information, appending right after the rows written so far
        n_valid_batch = int(validity_mask.sum())
        if n_valid_batch:
            image_log_ds[curr_size:curr_size+n_valid_batch] = image_log_batch
            pv_log_ds[curr_size:curr_size+n_valid_batch] = pv_log_batch
            pv_pred_ds[curr_size:curr_size+n_valid_batch] = pv_pred_batch
        curr_size += n_valid_batch

        # The datasets are pre-allocated to n_images rows, so record how many rows
        # actually hold data, plus what is needed to resume after an interruption.
        f.attrs['n_valid'] = curr_size
        f.attrs['last_valid_index'] = last_valid_index
        f.attrs['next_resume_idx'] = b + current_batch_size

        print('For sampling frequency: ',sampling_interval,' minutes')
        f.flush()

    pred_folder_child = os.path.join(pred_folder,'frequency_'+str(sampling_interval))
    #store_trainval_test(all_times,image_log,pv_log,pv_pred,pred_folder_child)

Creating new datasets
processed 7000/363375 images
For sampling frequency:  2  minutes
processed 14000/363375 images
For sampling frequency:  2  minutes
processed 21000/363375 images
For sampling frequency:  2  minutes
processed 28000/363375 images
For sampling frequency:  2  minutes
processed 35000/363375 images
For sampling frequency:  2  minutes
processed 42000/363375 images
For sampling frequency:  2  minutes
processed 49000/363375 images
For sampling frequency:  2  minutes
processed 56000/363375 images
For sampling frequency:  2  minutes
processed 63000/363375 images
For sampling frequency:  2  minutes
processed 70000/363375 images
For sampling frequency:  2  minutes
processed 77000/363375 images
For sampling frequency:  2  minutes
processed 84000/363375 images
For sampling frequency:  2  minutes
processed 91000/363375 images
For sampling frequency:  2  minutes
processed 98000/363375 images
For sampling frequency:  2  minutes
processed 105000/363375 images
For sampling frequency: 

In [12]:
# Size of the most recently processed batch (the final batch may be shorter than batch_size)
print(current_batch_size)

6375
