In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

In [2]:
# Show up to 40 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 40)

I received some Excel files from my father-in-law containing information about electricity consumption in Ontario. He looked up the data [on the web](http://www.webroots.ca/static/ontarioelectricity/ontarioelectricity.html) three times daily and manually entered it into a spreadsheet. There are three files in xlsx format. File 1 contains observations from 2018, file 2 contains observations from 2019, and file 3 contains 1000 observations starting from August 2018 (approximately 13 months).

Loaded files in LibreOffice Calc to preview data. 

Most date entries are in dd-mmm format but a few are in dd-mmm-yy.

Column headers span two rows: row 1 is name and row 2 is units. Header in file 2 is an image instead of text.

File 3 contains two sheets with data. Looks like 1st sheet is a copy of the 2018 data and 2nd sheet is combined data from 2018-08-25 through 2019-09-30.

I see at least one data entry error (cell F12 in file 1).

Files 1 and 3 contain summary statistics:
Three rows of stats at the bottom of the sheet: min, max, and avg of each column.
The two rightmost columns show average daily totals per month, and name of the month.

![](images/spreadsheet.png)

Fixed headers to contain both name and units in one row.

Set the format of all the date columns to yyyy-mm-dd.

Some of the numbers in the data contain commas (cell E6 and all of column R). Fixed by setting the Excel number format to 0.0 for percent columns and 0 for MW columns.

Deleted summary statistics.

Exported all four sheets as CSV.

Diffed the exported csv files. The extra sheet in file 3 is an unchanged copy of the data from file 1. Generated checksums to verify.

In [3]:
# Compare the CSV exported from file 1 with the extra sheet exported from file 3;
# diff prints nothing when the two files are identical.
!diff data/electricity1.csv data/electricity3a.csv
# Print a checksum and byte count for every CSV in data/ to confirm the match.
# NOTE(review): the recorded output lists data/electricity4.csv, which this
# notebook only writes in a later cell -- presumably left over from an earlier
# run; confirm the notebook still works from a clean data/ directory.
!cksum data/*.csv

1070850776 26838 data/electricity1.csv
4245893866 90971 data/electricity2.csv
3911391547 96667 data/electricity3.csv
1070850776 26838 data/electricity3a.csv
2750527202 121891 data/electricity4.csv


In [4]:
# Load the exported sheets in one pass:
#   df1 -> 2018 data
#   df2 -> 2019 data
#   df3 -> ~1000 observations starting August 2018
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/electricity1.csv',
        'data/electricity2.csv',
        'data/electricity3.csv',
    )
)

In [5]:
# Stack the 2018 and 2019 frames into a single dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead. ignore_index=True gives the combined frame a
# unique 0..n-1 index rather than repeating each source frame's row labels;
# the exported CSV is unaffected because the index is not written.
df4 = pd.concat([df1, df2], ignore_index=True)
df4.to_csv('data/electricity4.csv', index=False) # all data in one file

In [6]:
# Report (rows, columns) for each frame, in order df1..df4.
for frame in (df1, df2, df3, df4):
    print(frame.shape)

(293, 18)
(1018, 18)
(1075, 18)
(1311, 18)


In [7]:
# Separate the prediction target from the feature columns, then hold out a
# test set. random_state is fixed so the same rows land in train/test on every
# Restart & Run All; without it the split changes on each execution.
target = 'Total MW'
X = df4.drop(target, axis=1)
y = df4[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Per-column preprocessing for the feature frame.
#
# 'Total MW' is intentionally NOT listed here: it is the target and is dropped
# from X before the split, so naming it in the mapper would make
# mapper.fit_transform(X_train) fail on a missing column.

# Columns passed through unchanged for now.
# TODO: encode these, e.g. [CategoricalImputer(), LabelBinarizer()].
passthrough_columns = ['Date', 'Day', 'Hour']

# Numeric generation columns: impute missing values, then standardize.
numeric_columns = [
    'Nuclear %', 'Nuclear MW',
    'Hydro %', 'Hydro MW',
    'Gas %', 'Gas MW',
    'Wind %', 'Wind MW',
    'Solar %', 'Solar MW',
    'Biofuel %', 'Biofuel MW',
    'Nuc.+Hyd. %', 'Nuc.+Hyd. MW',
]

mapper = DataFrameMapper(
    [([column], None) for column in passthrough_columns]
    # Fresh transformer instances per column so each one is fitted independently.
    + [([column], [SimpleImputer(), StandardScaler()]) for column in numeric_columns],
    df_out=True
)

In [9]:
# Disabled: fit the mapper on the training features, then apply the fitted
# mapper to the test features.
# NOTE(review): before re-enabling, confirm that every column the mapper names
# exists in X_train (the target column is dropped from X before the split).
#Z_train = mapper.fit_transform(X_train)
#Z_test = mapper.transform(X_test)