# Normalizing and Creating a General Data Set
- Consumption needs to be normalized to a common unit, MWh
    - [We can find energy conversions here](https://www.eia.gov/tools/faqs/faq.php?id=667&t=6)
    - The end result in `total_consumption` will be in MWh
    - This matches my expectation that powering an entire state takes millions of MWh per month
- Balance is actually pretty clean; we just have to aggregate it to daily values
- Weather is also relatively clean; I left some things unchanged, depending on what the downstream processing needs

In [59]:
import pandas as pd
import altair as alt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Notebook-wide display and behaviour settings.
# Allow up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500
# Opt in to the future pandas behaviour that disables silent downcasting.
pd.options.future.no_silent_downcasting = True
# Use the "vegafusion" data transformer for altair charts.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

## Data Transforms

In [40]:
# Earliest date of interest (intended cutoff for the date filters in later cells).
start_date = pd.to_datetime('2016-01-01')

# Fuel -> electricity conversion factors, in MWh per reported unit of fuel.
# The kWh-per-physical-unit figures are the ones referenced from the EIA FAQ:
# https://www.eia.gov/tools/faqs/faq.php?id=667&t=6
# Every factor assumes the quantity is reported in thousands of the physical
# unit, so the "x 1000 units" and the "kWh -> MWh (/ 1000)" steps cancel.
# NOTE(review): the reporting units are inferred from the original unit notes;
# confirm them against the `units` column of the consumption file.

# 0.88 kWh/pound * 2000 pound/ton -> MWh per thousand tons of coal.
coal = 0.88 * 2000
# 0.13 kWh/cubic foot -> MWh per thousand cubic feet of natural gas.
# NOTE(review): if the data is in thousand Mcf this should be 130 -- confirm.
nat_gas = 0.13
# 12.90 kWh/gallon * 42 gallon/barrel -> MWh per thousand barrels.
# A barrel holds 42 gallons, so the per-gallon yield is multiplied (not
# divided) by 42 to obtain the per-barrel yield.
petrol_l = 12.90 * 42
# 1.18 kWh/pound * 2000 pound/ton -> MWh per thousand tons of petroleum coke.
petrol_c = 1.18 * 2000

In [54]:
# Numeric columns of the balance sheet that are cleaned here and aggregated later.
bcols = [
    'demand',
    'net_generation',
    'total_interchange',
    'net_generation_coal',
    'net_generation_natural_gas',
    'net_generation_nuclear',
    'net_generation_hydropower_and_pumped_storage',
    'net_generation_solar',
    'net_generation_wind',
    'net_generation_other_fuel_sources',
]

# Load the balance sheet with `data_date` parsed as datetimes.
balance = pd.read_csv("../../data/balance_sheet.csv", parse_dates=['data_date'])  # megawatts

# Strip commas (presumably thousands separators) from the values, then cast
# every balance column to integers.
without_commas = balance[bcols].replace(',', '', regex=True)
balance[bcols] = without_commas.astype(int)

In [111]:
# Load the consumption export. The column header sits on the fifth line of the
# file (header=4); rows with any missing value and the two metadata columns
# are discarded.
consumption = (
    pd.read_csv("../../data/consumption_mi.csv", header=4)
    .dropna(axis=0)
    .drop(['units', 'source key'], axis=1)
)

# Keep only the text after the ': ' separator and replace spaces with
# underscores, e.g. '<prefix>: petroleum liquids' -> 'petroleum_liquids'.
consumption['description'] = consumption['description'].apply(
    lambda label: label.split(': ')[1].replace(' ', '_')
)

# Pivot so that each row is one period and each column is one fuel: transpose,
# promote the description row to column labels, then turn the old column
# labels (parsed below with the '%b %Y' format) into a `date` column.
consumption = consumption.T
consumption.columns = consumption.iloc[0, :]
consumption = consumption.drop('description', axis=0).reset_index(names='date')
consumption['date'] = pd.to_datetime(consumption['date'], format='%b %Y')

# 'NM' is a non-numeric marker in the source data; it is treated as zero so
# the column can be cast to float.
consumption['petroleum_liquids'] = consumption['petroleum_liquids'].replace('NM', 0.0)

# Total consumption in MWh: each fuel quantity times its conversion factor.
consumption['total_consumption'] = (
    consumption['coal'].astype(float) * coal
    + consumption['petroleum_liquids'].astype(float) * petrol_l
    + consumption['petroleum_coke'].astype(float) * petrol_c
    + consumption['natural_gas'].astype(float) * nat_gas
)

# Export the periods from `start_date` onwards. The filter previously used an
# undefined name `date`, which raises NameError on a clean run of the notebook.
consumption[consumption['date'] >= start_date].to_csv("../../data/targets.csv")

In [91]:
# Weather measurement columns that are aggregated later.
# NOTE(review): these look like NOAA daily-summary element codes -- confirm
# against the data provider's documentation.
wcols = [
    'DAPR', 'MDPR', 'PRCP',
    'SN52', 'SN53', 'SNOW', 'SNWD',
    'SX52', 'SX53',
    'TMAX', 'TMIN', 'TOBS',
    'WESD', 'WSFG',
    'WT01', 'WT03', 'WT04', 'WT05', 'WT06', 'WT11',
]

# Load the weather report with `DATE` parsed as datetimes.
weather = pd.read_csv("../../data/WeatherReport.csv", parse_dates=['DATE'])

In [92]:
# Collapse the balance sheet to one row per calendar day: average every
# balance column within each (day, month, year) group and round to whole units.
date_parts = ['day', 'month', 'year']
stamp = balance.data_date.dt
b = (
    balance.groupby([stamp.day, stamp.month, stamp.year])[bcols]
    .mean()
    .round(0)
    .rename_axis(date_parts)
    .reset_index()
)
# Rebuild a single datetime column from the three key columns and keep it as
# the first column, followed by the aggregated balance columns.
b['date'] = pd.to_datetime(b[date_parts])
b = b[['date'] + bcols]

In [99]:
# Keep only observations from `start_date` onwards. The filter previously used
# an undefined name `date`, which raises NameError on a clean notebook run.
recent = weather[weather['DATE'] >= start_date]

# Collapse to one row per calendar day: average every weather column within
# each (day, month, year) group and round to whole units. The group keys come
# from the filtered frame itself rather than relying on index alignment with
# the unfiltered frame.
# NOTE(review): rounding to 0 decimals may discard detail in small-valued
# columns -- confirm this is intended.
w = (
    recent.groupby([recent.DATE.dt.day, recent.DATE.dt.month, recent.DATE.dt.year])[wcols]
    .mean()
    .round(0)
)
w.index = w.index.set_names(['day', 'month', 'year'])
w = w.reset_index()
# Rebuild a single datetime column from the key columns, then drop the parts.
w.insert(1, 'date', pd.to_datetime(w[['day', 'month', 'year']]))
w = w.drop(['day', 'month', 'year'], axis=1)

In [110]:
# Join the daily balance and weather frames on `date` (default merge, so only
# dates present in both are kept), order chronologically, and write the
# feature set to disk.
(
    b.merge(w, on='date')
    .sort_values('date')
    .to_csv("../../data/features.csv")
)