In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
# Locations and file names of the inputs used by this notebook.
# Each file is opened later via os.path.join(<directory>, <file name>).
RAW_DATA_DIR = "Data//RAW"                # directory holding the raw sanctioned-load workbook
PROCESSED_DATA = "Data//PROCESSED"        # directory holding the processed CSV inputs (also the export target)
# Billing-unit workbook; disabled and not referenced in the visible cells.
#BILLING_UNITS_DETAILS = "BillingUnitDetails.xlsx"
SANCTIONED_LOAD_HISTORY = "SanctionLoadHistory.xlsx"          # read with pd.read_excel from RAW_DATA_DIR
MONTHLY_CONSUMPTION = "MonthlyConsumptionStandardisedDF.csv"  # read with pd.read_csv from PROCESSED_DATA
CUSTOMER_MASTER_TABLE = "CustomerMasterDF.csv"                # read with pd.read_csv from PROCESSED_DATA

In [7]:
# Read the sanctioned load history table.
# Example record, consumer number 2000026928:
#   29.10.2015 - 23.03.2016 : 10.78 KVA
SanctionedLoadHistoryDF = pd.read_excel(os.path.join(RAW_DATA_DIR, SANCTIONED_LOAD_HISTORY))

# Replace the workbook headers (positionally) with the names used in the rest of the notebook.
NewVariableNames = ['ConsumerNumber', 'ContractAcc', 'ConsumerName', 'HouseNo', 'Street', 'RateCategory',
                    'Voltage', 'Installation', 'LoadUoM', 'Load', 'ValidFrom', 'ValidTo']
SanctionedLoadHistoryDF.columns = NewVariableNames

# Open-ended records carry the sentinel 31.12.9999, which lies outside the range a pandas
# Timestamp can represent; cap it at 31.12.2025 before parsing.
SanctionedLoadHistoryDF.loc[SanctionedLoadHistoryDF['ValidTo'] == '31.12.9999', 'ValidTo'] = '31.12.2025'

# Parse both validity dates (dd.mm.yyyy) and snap each one to the first day of ITS OWN month.
# Going through a monthly period keeps a date that already falls on the 1st in the same month;
# adding MonthBegin(-1) instead would push such a date back by a whole month.
# The resulting timestamps are at midnight, so no separate normalisation step is required.
SanctionedLoadHistoryDF['ValidFrom'] = (
    pd.to_datetime(SanctionedLoadHistoryDF['ValidFrom'], format='%d.%m.%Y')
    .dt.to_period('M')
    .dt.to_timestamp()
)
SanctionedLoadHistoryDF['ValidTo'] = (
    pd.to_datetime(SanctionedLoadHistoryDF['ValidTo'], format='%d.%m.%Y')
    .dt.to_period('M')
    .dt.to_timestamp()
)

# Convert every sanctioned load to kilowatts.
#   C.LOAD.BHP : 1 horsepower = 0.745699872 kW
#   C.LOAD.KVA : multiplied by 0.9 (presumably an assumed power factor -- TODO confirm)
#   C.LOAD.KW  : already in kW
KW_PER_LOAD_UNIT = {
    "C.LOAD.BHP": 0.745699872,
    "C.LOAD.KVA": 0.9,
    "C.LOAD.KW": 1.0,
}
kw_factor = SanctionedLoadHistoryDF['LoadUoM'].map(KW_PER_LOAD_UNIT)
# Rows with an unrecognised unit keep the value 0, as before.
# NOTE(review): a 0 kW load makes the LoadFactor division further down produce inf -- confirm
# whether such rows should be excluded instead.
SanctionedLoadHistoryDF['LoadinKW'] = (SanctionedLoadHistoryDF['Load'] * kw_factor).where(kw_factor.notna(), 0.0)

# Keep only the variables needed downstream.
select_cols = ['ConsumerNumber', 'RateCategory', 'LoadinKW', 'ValidFrom', 'ValidTo']
SanctionedLoadHistoryDF = SanctionedLoadHistoryDF[select_cols].copy()

# Get the monthly consumption data.
MonthlyConsumptionDF = pd.read_csv(os.path.join(PROCESSED_DATA, MONTHLY_CONSUMPTION))

# kVAh -> kWh with the same 0.9 factor used for the sanctioned load (kVA -> kW).
# A single-bracket selection yields a Series, which is what a column assignment expects.
MonthlyConsumptionDF['Consumption_KWH'] = 0.9 * MonthlyConsumptionDF['Consumption_KVAH']

# Parse the month column. The '%m/%d/%y' layout is tried first; if any non-missing value does
# not match it, the whole column is parsed with pandas' format inference instead. This is the
# explicit form of the former errors="ignore" + DatetimeIndex fallback (errors="ignore" is
# deprecated in recent pandas).
raw_month = MonthlyConsumptionDF['MonthofConsumption']
parsed_month = pd.to_datetime(raw_month, format='%m/%d/%y', errors='coerce')
if (parsed_month.isna() & raw_month.notna()).any():
    parsed_month = pd.to_datetime(raw_month)
# Remove any time-of-day component.
MonthlyConsumptionDF['MonthofConsumption'] = parsed_month.dt.normalize()

# Keep the months from January 2015 (inclusive) up to, but not including, January 2018.
time_idx = (MonthlyConsumptionDF.MonthofConsumption >= '2015-01-01') & \
           (MonthlyConsumptionDF.MonthofConsumption < '2018-01-01')
MonthlyConsumptionDF = MonthlyConsumptionDF.loc[time_idx]

# Add the sanctioned load history: pair every month with all load records of the consumer ...
MonthlyConsumptionDF = pd.merge(MonthlyConsumptionDF,
                                SanctionedLoadHistoryDF,
                                on=['ConsumerNumber'],
                                how='left')

# ... and keep only the pairs whose month lies inside the record's validity window (bounds
# inclusive). Months without any matching record have missing bounds and drop out here too.
select_idx = MonthlyConsumptionDF.MonthofConsumption.between(MonthlyConsumptionDF.ValidFrom,
                                                             MonthlyConsumptionDF.ValidTo)
MonthlyConsumptionDF = MonthlyConsumptionDF.loc[select_idx]

# Several records can cover the same month; keep the largest sanctioned load.
# NOTE(review): the consumption columns are grouping keys, and groupby drops rows whose key is
# missing by default -- confirm that discarding months with missing consumption is intended.
MonthlyConsumptionDF = (
    MonthlyConsumptionDF
    .groupby(['ConsumerNumber', 'MonthofConsumption', 'Consumption_KVAH', 'Consumption_KWH'])
    .agg({'LoadinKW': 'max'})
    .reset_index()
)

# Customer master: keep the four attributes used below and parse the move-in date (yyyy-mm-dd).
CustomerMasterDF = (
    pd.read_csv(os.path.join(PROCESSED_DATA, CUSTOMER_MASTER_TABLE))
    .loc[:, ['ConsumerNumber', 'MoveInDate', 'StreetType', 'ConsumerType']]
    .assign(MoveInDate=lambda master: pd.to_datetime(master['MoveInDate'], format='%Y-%m-%d'))
)

# Attach each consumer's move-in date to the monthly consumption rows.
MonthlyConsumptionDF = MonthlyConsumptionDF.merge(
    CustomerMasterDF[['ConsumerNumber', 'MoveInDate']],
    on=['ConsumerNumber'],
    how='left',
)

# Build a consumer x month grid of kWh consumption (one column per month) ...
# The aggregation is named by string: passing the numpy callable is deprecated in recent pandas.
MonthlyTimeseriesDF = pd.pivot_table(
    MonthlyConsumptionDF,
    values='Consumption_KWH',
    index=['ConsumerNumber', 'MoveInDate'],
    columns=['MonthofConsumption'],
    aggfunc='sum',
).reset_index()

# ... and unpivot it back to long form, so every (consumer, month) cell of the grid becomes a
# row; cells with no data in the grid carry a missing Consumption_kWh.
MonthlyTimeseriesDF = pd.melt(
    MonthlyTimeseriesDF,
    id_vars=['ConsumerNumber', 'MoveInDate'],
    var_name='MonthofConsumption',
    value_name='Consumption_kWh',
)

# Number of calendar days in the month of consumption (used as the hours base of the load factor).
MonthlyTimeseriesDF['Days'] = pd.to_datetime(MonthlyTimeseriesDF['MonthofConsumption']).dt.days_in_month

# Attach the sanctioned load that was valid in each month of the time series.
# Make sure both date columns are real, time-free timestamps before comparing them.
MonthlyTimeseriesDF = MonthlyTimeseriesDF.assign(
    MoveInDate=pd.to_datetime(MonthlyTimeseriesDF['MoveInDate']).dt.normalize(),
    MonthofConsumption=pd.to_datetime(MonthlyTimeseriesDF['MonthofConsumption']).dt.normalize(),
)

# Pair every month with all load records of the same consumer, then keep the pairs whose month
# lies inside the record's validity window (both bounds inclusive). Consumers or months without
# a covering record drop out, so this is an inner join in effect.
load_windows = SanctionedLoadHistoryDF[['ConsumerNumber', 'LoadinKW', 'ValidFrom', 'ValidTo']]
MonthlyTimeseriesDF = MonthlyTimeseriesDF.merge(load_windows, on='ConsumerNumber', how='inner')
in_validity_window = MonthlyTimeseriesDF['MonthofConsumption'].between(
    MonthlyTimeseriesDF['ValidFrom'], MonthlyTimeseriesDF['ValidTo']
)

# Perform the duplicate check: when several load records cover the same month, keep the
# largest value of LoadinKW.
# NOTE(review): Consumption_kWh is a grouping key and groupby drops rows whose key is missing
# by default, so grid months without consumption disappear here -- confirm this is intended.
MonthlyTimeseriesDF = (
    MonthlyTimeseriesDF.loc[in_validity_window]
    .groupby(['ConsumerNumber', 'MoveInDate', 'MonthofConsumption', 'Consumption_kWh', 'Days'])
    .agg({'LoadinKW': 'max'})
    .reset_index()
)

# Enrich the time series with the customer's street and consumer type, derive the load factor
# and order the rows by consumer and month.
# LoadFactor (%) = 100 * kWh consumed / (24 h * days in month * sanctioned load in kW)
MonthlyTimeseriesDF = (
    MonthlyTimeseriesDF
    .merge(CustomerMasterDF[['ConsumerNumber', 'StreetType', 'ConsumerType']],
           on='ConsumerNumber',
           how='left')
    .assign(LoadFactor=lambda ts: 100 * ts['Consumption_kWh'] / (24 * ts['Days'] * ts['LoadinKW']))
    .sort_values(['ConsumerNumber', 'MonthofConsumption'], ascending=[True, True])
)

# Preview the first 40 rows.
MonthlyTimeseriesDF.head(40)

Unnamed: 0,ConsumerNumber,MoveInDate,MonthofConsumption,Consumption_kWh,Days,LoadinKW,StreetType,ConsumerType,LoadFactor
0,2000000871,2005-04-01,2015-01-01,862.11,31,25.0,RURAL,Domestic,4.635
1,2000000871,2005-04-01,2015-02-01,810.744828,28,25.0,RURAL,Domestic,4.825862
2,2000000871,2005-04-01,2015-03-01,1059.583744,31,25.0,RURAL,Domestic,5.696687
3,2000000871,2005-04-01,2015-04-01,1290.214286,30,25.0,RURAL,Domestic,7.167857
4,2000000871,2005-04-01,2015-05-01,1636.367143,31,25.0,RURAL,Domestic,8.797673
5,2000000871,2005-04-01,2015-06-01,1346.805,30,25.0,RURAL,Domestic,7.48225
6,2000000871,2005-04-01,2015-07-01,1407.015,31,25.0,RURAL,Domestic,7.564597
7,2000000871,2005-04-01,2015-08-01,1809.433636,31,25.0,RURAL,Domestic,9.728138
8,2000000871,2005-04-01,2015-09-01,1437.136364,30,25.0,RURAL,Domestic,7.984091
9,2000000871,2005-04-01,2015-10-01,1147.5,31,25.0,RURAL,Domestic,6.169355


In [6]:
# Write the final monthly table to the processed-data directory, without the row index.
# NOTE(review): this cell shows execution count 6 while the processing cell above shows 7,
# i.e. it was last run before the latest processing run -- re-run it after a clean
# Restart & Run All so the file on disk matches the table shown above.
MonthlyTimeseriesDF.to_csv(
    path_or_buf=os.path.join(PROCESSED_DATA, "MonthlyConsumptionData.csv"),
    index=False,
)