In [1]:
import os
import pandas as pd
import numpy as np
from common.utils import load_data, extract_data
# Display settings: pandas renders floats with a thousands separator and two
# decimals; NumPy prints arrays with two decimals.
# os.chdir("../../")  # left disabled; presumably only needed when the notebook
#                     # is run from a nested folder -- confirm
pd.set_option("display.float_format", "{:,.2f}".format)
np.set_printoptions(precision=2)

# Folder handed to load_data. NOTE(review): extract_data is imported but never
# called here, so the zipped data is presumably already extracted to CSV -- confirm.
data_dir = "data"
ts_data = load_data(data_dir)

# Preview the first ten rows of the loaded frame.
ts_data.head(10)

Unnamed: 0,load,temp
2012-01-01 00:00:00,2698.0,32.0
2012-01-01 01:00:00,2558.0,32.67
2012-01-01 02:00:00,2444.0,30.0
2012-01-01 03:00:00,2402.0,31.0
2012-01-01 04:00:00,2403.0,32.0
2012-01-01 05:00:00,2453.0,31.33
2012-01-01 06:00:00,2560.0,30.0
2012-01-01 07:00:00,2719.0,29.0
2012-01-01 08:00:00,2916.0,29.0
2012-01-01 09:00:00,3105.0,33.33


In [2]:
from scipy import stats

# Impute missing temperatures with the most frequent observed value.
# NaNs are dropped before taking the mode: depending on the SciPy version,
# stats.mode may otherwise propagate NaN or count NaN as a candidate value,
# either of which could turn the fill below into a no-op.
temp_mode = stats.mode(ts_data['temp'].dropna()).mode.item()
ts_data['temp'] = ts_data['temp'].fillna(temp_mode)

# Count the remaining missing values per column.
ts_data.isnull().sum()

load    0
temp    0
dtype: int64

In [3]:
# Normalization example for the load series. The scaler needs its input as a
# matrix of rows and columns, so the 1-D `load` column taken from ts_data is
# reshaped into a matrix with a single column. The scaler itself is fitted in
# the next cell.

from pandas import Series
from sklearn.preprocessing import MinMaxScaler

# Raw load values as an (n_rows, 1) array; -1 lets NumPy infer the row count.
values = ts_data['load'].values.reshape(-1, 1)

# Show the reshaped array.
values

array([[2698.],
       [2558.],
       [2444.],
       ...,
       [3671.],
       [3499.],
       [3345.]])

In [4]:
# Fit the min-max scaler (target range 0..1) on the single-column load matrix.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(values)
# data_min_ / data_max_ hold one entry per feature (a single feature here).
# Index that entry explicitly: formatting a 1-element array with %f relies on
# an implicit array-to-scalar conversion that NumPy has deprecated, which is
# what produced the warning previously emitted by this cell.
print('Min: %f, Max: %f' % (scaler.data_min_[0], scaler.data_max_[0]))


Min: 1979.000000, Max: 5224.000000


  print('Min: %f, Max: %f' % (scaler.data_min_, scaler.data_max_))


In [5]:
# Apply the fitted scaler to the load matrix and print the first 5 rows.
normalized = scaler.transform(values)
# Iterate over a slice rather than indexing 0..4, so the preview also works
# (printing fewer rows) when the array has fewer than five rows.
for row in normalized[:5]:
    print(row)

[0.22]
[0.18]
[0.14]
[0.13]
[0.13]


In [7]:
# Map the scaled values back to the original scale and print the first 5 rows;
# they should reproduce the raw load values shown earlier.
inversed = scaler.inverse_transform(normalized)
# Slice-based iteration avoids an IndexError when fewer than five rows exist.
for row in inversed[:5]:
    print(row)

[2698.]
[2558.]
[2444.]
[2402.]
[2403.]


In [8]:
# Standardize time series data
from sklearn.preprocessing import StandardScaler
from math import sqrt

In [9]:
# Prepare the data for standardization: the load column as an (n_rows, 1) matrix.
values = ts_data['load'].values.reshape(-1, 1)
# Fit the standardization. Note that this rebinds `scaler`, replacing the
# MinMaxScaler fitted earlier in the notebook.
scaler = StandardScaler()
scaler = scaler.fit(values)
# mean_ and var_ hold one entry per feature (a single feature here). Index that
# entry explicitly: passing a 1-element array to %f / math.sqrt relies on an
# implicit array-to-scalar conversion that NumPy has deprecated.
print('Mean: %f, StandardDeviation: %f' % (scaler.mean_[0], sqrt(scaler.var_[0])))

Mean: 3303.769199, StandardDeviation: 564.568521


  print('Mean: %f, StandardDeviation: %f' % (scaler.mean_, sqrt(scaler.var_)))


In [11]:
# Apply the scaler fitted in the previous cell and print the first 5 rows.
# NOTE: with the StandardScaler these are standardized values (zero mean, unit
# variance), not min-max normalized ones; the name `normalized` is kept because
# the following cell reads it.
normalized = scaler.transform(values)
# Slice-based iteration avoids an IndexError when fewer than five rows exist.
for row in normalized[:5]:
    print(row)

[-1.07]
[-1.32]
[-1.52]
[-1.6]
[-1.6]


In [12]:
# Invert the scaling and print the first 5 rows; they should reproduce the raw
# load values.
inversed = scaler.inverse_transform(normalized)
# Slice-based iteration avoids an IndexError when fewer than five rows exist.
for row in inversed[:5]:
    print(row)

[2698.]
[2558.]
[2444.]
[2402.]
[2403.]
