## Importing necessary packages

In [1]:
import time
import random
from typing import List
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

## Loading data into dataframes

The dataset is an Excel file with 3 sheets, so each sheet is loaded into its
own dataframe.

In [4]:
# Read the 'meter_list' sheet of the workbook into a dataframe
meter_table = pd.read_excel(
    io='gorilla_test_data.xlsx',
    sheet_name='meter_list',
)
# Show the whole table
meter_table

Unnamed: 0,meter_id,aq_kwh,exit_zone
0,14676236,28978,EA1
1,34509937,78324,SO1
2,50264822,265667,NT1
3,88357331,484399,SE2


Note that every **meter ID** is linked to an **exit zone** and to an
**annual quantity** (AQ, in kWh).

In [10]:
# Summarise meter_table: column names, non-null counts, dtypes and memory usage
meter_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   meter_id   4 non-null      int64 
 1   aq_kwh     4 non-null      int64 
 2   exit_zone  4 non-null      object
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


The **meter_id** and **aq_kwh** columns are integers; **exit_zone** is a string.

In [5]:
# Build a lookup {meter_id -> exit_zone} by pairing the two columns
# position by position (one entry per row of meter_table).
meter_zone_dict = dict(zip(meter_table['meter_id'], meter_table['exit_zone']))
print(meter_zone_dict)

{14676236: 'EA1', 34509937: 'SO1', 50264822: 'NT1', 88357331: 'SE2'}


In [6]:
# Build a lookup {meter_id -> aq_kwh} by pairing the two columns
# position by position (one entry per row of meter_table).
meter_aq_dict = dict(zip(meter_table['meter_id'], meter_table['aq_kwh']))
print(meter_aq_dict)

{14676236: 28978, 34509937: 78324, 50264822: 265667, 88357331: 484399}


In [3]:
# Read the 'rate_table' sheet of the workbook into a dataframe
rate_table = pd.read_excel(
    io='gorilla_test_data.xlsx',
    sheet_name='rate_table',
)
# Preview the first 5 rows
rate_table.head()

Unnamed: 0,date,exit_zone,aq_min_kwh,aq_max_kwh,rate_p_per_kwh
0,2020-04-01,EA1,0,73200.0,0.2652
1,2020-04-01,EA1,73200,732000.0,0.198
2,2020-04-01,EA1,732000,,0.2875
3,2020-04-01,EA2,0,73200.0,0.297
4,2020-04-01,EA2,73200,732000.0,0.1524


In [8]:
# Summarise rate_table: column names, non-null counts, dtypes and memory usage
rate_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140 entries, 0 to 1139
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            1140 non-null   datetime64[ns]
 1   exit_zone       1140 non-null   object        
 2   aq_min_kwh      1140 non-null   int64         
 3   aq_max_kwh      760 non-null    float64       
 4   rate_p_per_kwh  1140 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 44.7+ KB


In [19]:
# True if at least one row of rate_table repeats an earlier row
# (duplicated() with default arguments compares all columns)
rate_table.duplicated().any()

False

In [7]:
# Read the 'forecast_table' sheet of the workbook into a dataframe
forecast_table = pd.read_excel(
    io='gorilla_test_data.xlsx',
    sheet_name='forecast_table',
)
# Preview the first 5 rows
forecast_table.head()

Unnamed: 0,meter_id,date,kwh
0,14676236,2020-06-01,22.070768
1,14676236,2020-06-02,19.17072
2,14676236,2020-06-03,23.555111
3,14676236,2020-06-04,18.220712
4,14676236,2020-06-05,14.196134


In [16]:
# Summarise forecast_table: column names, non-null counts, dtypes and memory usage
forecast_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3412 entries, 0 to 3411
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   meter_id  3412 non-null   int64         
 1   date      3412 non-null   datetime64[ns]
 2   kwh       3412 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 80.1 KB


Notice that the **date** column is already a datetime, not a string.

In [18]:
# True if at least one row of forecast_table repeats an earlier row
# (duplicated() with default arguments compares all columns)
forecast_table.duplicated().any()

False