In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [None]:
def process_data(data, date_col, reading_col, window):
  """Add time-of-day, weekday and rolling-statistic features to a reading series.

  The frame is modified IN PLACE (existing callers rely on this): `date_col`
  is converted to datetimes, `reading_col` is renamed to 'E', and these
  columns are added:

    I      half-hour slot of the day: hour * 2, plus 1 when minute >= 30
    D      weekday of the timestamp (Monday = 0)
    H      1 when D > 4 (Saturday/Sunday), otherwise 0
    Avg    mean of the readings stored for the same slot over the previous
           `window` days
    Std    population standard deviation (ddof=0) of those same readings
    Avg_w  mean of the last `window` rows, the current row included
    Std_w  population standard deviation of those same rows

  Rows without enough history keep 0.0 in the corresponding columns.

  A "day" is a block of 48 consecutive rows (row position // 48), not a
  calendar date, so Avg/Std only line up with real days when the series has
  exactly 48 rows per day.
  NOTE(review): confirm the input files are gap-free half-hourly series.

  Args:
    data: DataFrame holding at least `date_col` and `reading_col`.
    date_col: name of the timestamp column.
    reading_col: name of the numeric reading column.
    window: number of previous days (Avg/Std) and of rows (Avg_w/Std_w)
      to aggregate over; must be >= 1.

  Returns:
    A new DataFrame with the columns
    [date_col, 'E', 'I', 'D', 'H', 'Avg', 'Std', 'Avg_w', 'Std_w'].

  Raises:
    ValueError: if `window` is smaller than 1 or `date_col` has missing
      timestamps.
  """
  if window < 1:
    raise ValueError("window must be >= 1, got {!r}".format(window))

  readings_per_day = 48  # one reading per half hour

  # Parse and validate before touching `data`, so a failure leaves it unchanged.
  timestamps = pd.to_datetime(data[date_col])
  if timestamps.isna().any():
    raise ValueError("column {!r} contains missing timestamps".format(date_col))
  data[date_col] = timestamps

  n_rows = len(data)

  # Work on plain arrays: independent of the frame's index labels and far
  # cheaper than one `.loc` write per cell.
  # Minutes are binned into their half hour, so off-grid timestamps get a
  # well-defined slot instead of reusing the previous row's.
  slot = (timestamps.dt.hour * 2 + timestamps.dt.minute // 30).to_numpy().astype('int64')
  weekday = timestamps.dt.weekday.to_numpy().astype('int64')
  readings = data[reading_col].to_numpy(dtype=float)

  # history[d, s] holds the reading stored for slot s of day-block d; cells
  # that are never written stay 0.
  n_days = int(np.ceil(n_rows / readings_per_day))
  history = np.zeros((n_days, readings_per_day))

  # Float buffers from the start: no int -> float upcast on assignment.
  avg = np.zeros(n_rows)
  std = np.zeros(n_rows)
  avg_w = np.zeros(n_rows)
  std_w = np.zeros(n_rows)

  for pos in range(n_rows):
    day = pos // readings_per_day
    s = slot[pos]

    # Same-slot statistics over the previous `window` day-blocks. Computed
    # before storing the current reading so the row never sees itself.
    if day >= window:
      past = history[day - window:day, s]
      avg[pos] = np.mean(past)
      std[pos] = np.std(past)

    history[day, s] = readings[pos]

    # Statistics over the trailing `window` rows, current row included.
    if pos >= window - 1:
      recent = readings[pos + 1 - window:pos + 1]
      avg_w[pos] = np.mean(recent)
      std_w[pos] = np.std(recent)

  data['I'] = slot
  data['D'] = weekday
  data['H'] = (weekday > 4).astype('int64')
  data['Avg'] = avg
  data['Std'] = std
  data['Avg_w'] = avg_w
  data['Std_w'] = std_w

  data.rename(columns={reading_col: 'E'}, inplace=True)
  return data[[date_col, 'E', 'I', 'D', 'H', 'Avg', 'Std', 'Avg_w', 'Std_w']].copy()

In [None]:
# Run feature extraction over every CSV in src_dir and write the result,
# under the same file name, to dest_dir.
src_dir = '../data/sampled'
dest_dir = '../data/processed'

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(dest_dir, exist_ok=True)

# sorted() makes the processing order deterministic; os.listdir order is arbitrary.
for file in sorted(os.listdir(src_dir)):

  # skip anything that is not a .csv file
  if(os.path.splitext(file)[-1] != '.csv'):
    continue

  path = os.path.join(src_dir, file)
  data = pd.read_csv(path)

  # process_data adds the feature columns to `data` in place and renames the
  # reading column to 'E'.
  process_data(data, 'DATETIME', 'USAGE_kW', 2)

  # Note: the frame's index is written as the first (unnamed) CSV column.
  path = os.path.join(dest_dir, file)
  data.to_csv(path)