In [1]:
import math
import time

import numpy as np
import pandas as pd
import torch

from refit_loader.data_loader import REFIT_Loader

In [52]:
from datetime import timezone

class Sequence2PointGenerator(torch.utils.data.Dataset):
    """
    Sequence-to-point dataset over an indexed frame.

    Every sample pairs a window of `sequence_length` consecutive 'aggregate' readings with the value
    of the last column of `data` at the row the window is centred on (exactly centred for odd lengths).
    The aggregate signal is zero-padded with floor(sequence_length / 2) values on both ends so that
    every row of `data` yields a full-length window.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'aggregate' column; the last column is used as the target. Index entries must
        provide `timetuple()` (e.g. a DatetimeIndex).
    sequence_length : int, optional
        Window length. When omitted, MODEL_CONFIG['SEQUENCE_LENGTH'] is used.

    Raises
    ------
    ValueError
        If the resolved sequence length is smaller than 1.
    """
    def __init__(self, data, sequence_length=None):
        super().__init__()

        if sequence_length is None:
            sequence_length = MODEL_CONFIG['SEQUENCE_LENGTH']
        self.sequence_length = int(sequence_length)
        if self.sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")

        half_window = self.sequence_length // 2
        self.time = data.index
        # Pad the raw values instead of concatenating Series: this keeps the dtype of the
        # 'aggregate' column and avoids a mixed integer/datetime index on the padded signal.
        padded = np.pad(data['aggregate'].to_numpy(), (half_window, half_window),
                        mode='constant', constant_values=0)
        # Kept as a Series so the attribute type matches what callers have seen so far.
        self.X = pd.Series(padded)
        self.y = data[data.columns[-1]]

    def __len__(self):
        """Number of samples, i.e. one per row of the source frame."""
        return len(self.y)

    def __getitem__(self, index):
        """
        Return `(timestamp, window, target)` for row `index`.

        timestamp : 0-d array with the epoch seconds of the row, computed with `time.mktime`
                    (i.e. the index value is interpreted as local time).
        window    : 1-d array with `sequence_length` padded aggregate readings.
        target    : 1-d array of length 1 with the target value of the row.

        Raises IndexError for positions outside the dataset; negative positions count from the end.
        """
        n_samples = len(self.y)
        if index < 0:
            index += n_samples
        if not 0 <= index < n_samples:
            raise IndexError("Sequence2PointGenerator index out of range")

        timestamp = time.mktime(self.time[index].timetuple())
        window = np.array(self.X.iloc[index:index + self.sequence_length])
        target = np.array(self.y.iloc[[index]])
        return np.array(timestamp), window, target

In [53]:
class Seq2PointDataLoader():
    """
    Loads REFIT data for one target appliance and exposes PyTorch dataloaders for training, validation and testing.

    A REFIT_Loader object fetches the data of `target_appliance` for the requested houses. The data is resampled
    with the SAMPLING_PERIOD, FILL_VALUE and WINDOW_LIMIT entries of DATASET_CONFIG, optionally reduced with
    `subset_data(subset_days)`, wrapped in Sequence2PointGenerator datasets and served through the public
    attributes `train_dataloader`, `validation_dataloader` and `test_dataloader`.

    Parameters
    ----------
    target_appliance : str
        Appliance name passed to `REFIT_Loader().get_appliance_data`.
    target_houses : dict
        Mapping with the keys 'TRAIN', 'VALIDATE' and 'TEST', each holding a list of houses.
        When the three lists are equal, the data of the first TRAIN house is split by row position
        according to `proportion`. Otherwise the first house of each list is used as is
        (only the first entry of every list is consumed).
    proportion : dict
        'train_percent' and 'validate_percent' fractions used for the same-house split; the remainder is the test set.
    subset_days : optional
        When truthy, forwarded to `subset_data` and the loader's `active_data` is used instead of `data`.

    Notes
    -----
    The constructor does not raise: any failure is printed and the three dataloader attributes stay `None`.
    """
    def __init__(self, target_appliance='kettle', target_houses=None, proportion={'train_percent':0.7, 'validate_percent':0.2}, subset_days=None):
        # Define the public attributes up front so they exist even when loading fails below.
        self.train_dataloader = None
        self.validation_dataloader = None
        self.test_dataloader = None

        try:
            if target_houses is None:
                raise ValueError("target_houses must map 'TRAIN', 'VALIDATE' and 'TEST' to lists of houses")

            self.__target_appliance = target_appliance
            self.__target_houses = target_houses
            # Copy so that later changes to the caller's dict (or to the shared default) cannot leak in.
            self.__proportion = dict(proportion)
            self.__subset_days = subset_days

            same_house = self.__same_house_approach()
            if same_house:
                houses = self.__target_houses['TRAIN']
            else:
                houses = [house
                          for split in ('TRAIN', 'VALIDATE', 'TEST')
                          for house in self.__target_houses[split]]

            self.__appliance_obj = REFIT_Loader().get_appliance_data(appliance=self.__target_appliance, houses=houses)
            self.__appliance_obj.resample(sampling_period=DATASET_CONFIG['SAMPLING_PERIOD'],
                                          fill_value=float(DATASET_CONFIG['FILL_VALUE']),
                                          window_limit=float(DATASET_CONFIG['WINDOW_LIMIT']))

            if self.__subset_days:
                self.__appliance_obj.subset_data(self.__subset_days)
                house_frames = self.__appliance_obj.active_data
            else:
                house_frames = self.__appliance_obj.data

            if same_house:
                self.__train_df, self.__val_df, self.__test_df = self.__get_proportioned_data(
                    house_frames[self.__target_houses['TRAIN'][0]])
            else:
                self.__train_df = house_frames[self.__target_houses['TRAIN'][0]]
                self.__val_df = house_frames[self.__target_houses['VALIDATE'][0]]
                self.__test_df = house_frames[self.__target_houses['TEST'][0]]

        except Exception as e:
            print("Error occured in initialization of Seq2PointDataLoader class due to ", e)

        else:
            # Only build the loaders when the frames were actually prepared; building them
            # after a failed load would just raise a second, misleading error.
            self.__create_dataloaders()


    def __get_proportioned_data(self, tmp_df):
        """
        Split `tmp_df` by row position into (train, validate, test) frames.

        The first floor(train_percent * n) rows form the training set, the rows up to
        floor((train_percent + validate_percent) * n) the validation set and the rest the test set.
        Positional slicing is used on purpose: label slicing includes both end points, which would
        put each boundary row into two neighbouring splits.
        """
        try:
            n_rows = len(tmp_df)
            train_stop = math.floor(self.__proportion['train_percent'] * n_rows)
            val_stop = math.floor((self.__proportion['train_percent'] + self.__proportion['validate_percent']) * n_rows)
            return tmp_df.iloc[:train_stop], tmp_df.iloc[train_stop:val_stop], tmp_df.iloc[val_stop:]

        except Exception as e:
            print("Error occured in __get_proportioned_data method due to ", e)
            # Re-raise so the constructor reports the real cause instead of an unpacking error on None.
            raise


    def __same_house_approach(self):
        """
        Return True when the TRAIN, VALIDATE and TEST house lists are all equal, False otherwise.
        """
        try:
            train_houses = self.__target_houses['TRAIN']
            return train_houses == self.__target_houses['VALIDATE'] and train_houses == self.__target_houses['TEST']

        except Exception as e:
            print("Error occured in __same_house_approach method due to ", e)
            raise


    @staticmethod
    def __make_loader(generator, batch_size):
        """Wrap `generator` in a single-process DataLoader that keeps the samples in dataset order."""
        return torch.utils.data.DataLoader(dataset=generator,
                                           batch_size=batch_size,  # how many samples per batch
                                           num_workers=0,          # load in the main process
                                           shuffle=False)          # keep dataset order


    def __create_dataloaders(self):
        """
        Build the Sequence2PointGenerator datasets and the matching dataloaders for the three splits.

        Batch sizes are read from TRAINING_CONFIG ('TRAIN_BATCH_SIZE', 'VALIDATION_BATCH_SIZE', 'TEST_BATCH_SIZE').
        Failures are printed and not raised.
        """
        try:
            self.__train_generator = Sequence2PointGenerator(self.__train_df)
            self.train_dataloader = self.__make_loader(self.__train_generator, TRAINING_CONFIG['TRAIN_BATCH_SIZE'])

            self.__validation_generator = Sequence2PointGenerator(self.__val_df)
            self.validation_dataloader = self.__make_loader(self.__validation_generator, TRAINING_CONFIG['VALIDATION_BATCH_SIZE'])

            self.__test_generator = Sequence2PointGenerator(self.__test_df)
            self.test_dataloader = self.__make_loader(self.__test_generator, TRAINING_CONFIG['TEST_BATCH_SIZE'])

        except Exception as e:
            print("Error occured in create_dataloaders method due to ", e)

In [54]:
# Synthetic frame at 1-minute resolution covering 2022-11-21 00:00 up to and including 2022-11-22 00:00.
# 'aggregate' and 'y' both hold the row number, so windows and targets are easy to check by eye.
# The 'min' alias is used because the single-letter 'T' alias is deprecated in recent pandas releases.
dtes = pd.date_range(start='2022-11-21', end='2022-11-22', freq='min')
df = pd.DataFrame({'time': dtes, 'aggregate': np.arange(len(dtes)), 'y': np.arange(len(dtes))}).set_index('time')

In [55]:
# Show the synthetic frame built above (rendered as the cell output).
df

Unnamed: 0_level_0,aggregate,y
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-11-21 00:00:00,0,0
2022-11-21 00:01:00,1,1
2022-11-21 00:02:00,2,2
2022-11-21 00:03:00,3,3
2022-11-21 00:04:00,4,4
...,...,...
2022-11-21 23:56:00,1436,1436
2022-11-21 23:57:00,1437,1437
2022-11-21 23:58:00,1438,1438
2022-11-21 23:59:00,1439,1439


In [56]:
# Epoch seconds of the first index entry when the naive timestamp is tagged as UTC.
# NOTE(review): this expression is not the last statement of the cell, so its value is discarded.
df.index[0].replace(tzinfo=timezone.utc).timestamp()
# `time` is needed by Sequence2PointGenerator.__getitem__ and by the cells that follow.
import time

In [50]:
# Epoch seconds of the first index entry via time.mktime, which interprets the time tuple
# as local time, so the value depends on the timezone of the machine running the notebook.
time.mktime(df.index[0].timetuple())

1668985200.0

In [57]:
from utils.configuration import get_config_from_json
import builtins
# Publish the model configuration through `builtins` so that the bare name MODEL_CONFIG
# resolves inside Sequence2PointGenerator.__init__ (which reads MODEL_CONFIG['SEQUENCE_LENGTH']).
builtins.MODEL_CONFIG = get_config_from_json(description="Model Parameters", config_file="configs/model_config.json")
# Wrap the synthetic frame in the sequence-to-point dataset.
gen = Sequence2PointGenerator(df)

In [58]:
# One sample per row of `df`: Sequence2PointGenerator.__len__ returns the length of the target column.
len(gen)

1441

In [59]:
# Single-sample batches in dataset order, so each batch can be matched to one row of `df`.
loader = torch.utils.data.DataLoader(dataset=gen, 
                                      batch_size=1,
                                      num_workers=0, # how many subprocesses to use for data loading (0 = load in the main process)
                                      shuffle=False) # do not shuffle: keep the samples in dataset order

In [60]:
# NOTE(review): `timedelta` is imported but not used in this cell.
from datetime import datetime, timedelta

# Round-trip check: epoch seconds produced by time.mktime (local time) are converted back with
# datetime.fromtimestamp (also local time) and formatted, to compare against the original index value.
unix_ts = time.mktime(df.index[0].timetuple())
dt = (datetime.fromtimestamp(unix_ts)).strftime('%Y-%m-%d %H:%M:%S')
print(dt)

2022-11-21 00:00:00


In [61]:
# Walk through every batch of the loader and print the batch number, the raw epoch seconds
# returned by the dataset and the same instant formatted as a local date-time string.
for indx, (timestep, x_value, y_value) in enumerate(loader):
    print(indx)
    print(timestep.item())
    # NOTE(review): the chained assignment also rebinds the notebook-level name `dt` on every iteration.
    t = dt = (datetime.fromtimestamp(timestep.item())).strftime('%Y-%m-%d %H:%M:%S')
    print(t)
    

0
1668985200.0
2022-11-21 00:00:00
1
1668985260.0
2022-11-21 00:01:00
2
1668985320.0
2022-11-21 00:02:00
3
1668985380.0
2022-11-21 00:03:00
4
1668985440.0
2022-11-21 00:04:00
5
1668985500.0
2022-11-21 00:05:00
6
1668985560.0
2022-11-21 00:06:00
7
1668985620.0
2022-11-21 00:07:00
8
1668985680.0
2022-11-21 00:08:00
9
1668985740.0
2022-11-21 00:09:00
10
1668985800.0
2022-11-21 00:10:00
11
1668985860.0
2022-11-21 00:11:00
12
1668985920.0
2022-11-21 00:12:00
13
1668985980.0
2022-11-21 00:13:00
14
1668986040.0
2022-11-21 00:14:00
15
1668986100.0
2022-11-21 00:15:00
16
1668986160.0
2022-11-21 00:16:00
17
1668986220.0
2022-11-21 00:17:00
18
1668986280.0
2022-11-21 00:18:00
19
1668986340.0
2022-11-21 00:19:00
20
1668986400.0
2022-11-21 00:20:00
21
1668986460.0
2022-11-21 00:21:00
22
1668986520.0
2022-11-21 00:22:00
23
1668986580.0
2022-11-21 00:23:00
24
1668986640.0
2022-11-21 00:24:00
25
1668986700.0
2022-11-21 00:25:00
26
1668986760.0
2022-11-21 00:26:00
27
1668986820.0
2022-11-21 00:27:00
28