# Heating plant

We have a dataset from a real heating plant located in a medium-sized city in Europe. The heating plant heats water and distributes the heat around the city. Our goal is to predict temperature of the returning water based on actual and historical power settings of the plant, output water temperature and the outside temperatures measured at different locations in the city.

## Data import

In [1]:
import pandas as pd

#power of sectors 1 and 2 (in megawatts)
power12 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/power12.csv', sep=',')
#power of sector 3 (in megawatts)
power3 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/power3.csv', sep=',')
#power of sector 4 (in megawatts)
power4 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/power4.csv', sep=',')


#temperatures at location #1
temp1 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp1.csv', sep=',')
#temperatures at location #2
temp2 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp2.csv', sep=',')
#temperatures at location #3
temp3 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp3.csv', sep=',')
#temperatures at location #4
temp4 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp4.csv', sep=',')
#temperatures at location #5
temp5 = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp5.csv', sep=',')


#input water temperatures
temp_in = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp_in.csv', sep=',')
#output water temperatures
temp_out = pd.read_csv('https://raw.githubusercontent.com/mlcollege/rbi/master/specializations/Time-Series-Analysis/data/temp_out.csv', sep=',')

## Data cleaning and preparation




### TASK 1: Merge all files into one data frame

Don't forget to convert the 'ts' column to time stamp type using 'pandas.to_datetime()'.

In [2]:
#merge powers of all sectors
data = power12[['ts','power12']].merge(power3[['ts','power3']], on='ts').merge(power4[['ts','power4']], on='ts')
data.head()

Unnamed: 0,ts,power12,power3,power4
0,2018-12-31 20:00:00+00:00,53.46134,87.796478,105.498682
1,2018-12-31 21:00:00+00:00,54.46413,74.151459,105.017897
2,2018-12-31 22:00:00+00:00,54.191047,70.918662,105.600836
3,2018-12-31 23:00:00+00:00,54.243952,69.949829,106.664194
4,2019-01-01 00:00:00+00:00,53.277664,68.645953,106.001002


In [3]:
#merge temperatures at all locations
data = data.merge(temp1[['ts', 'temp1']], on='ts')
data = data.merge(temp2[['ts', 'temp2']], on='ts')
data = data.merge(temp3[['ts', 'temp3']], on='ts')
data = data.merge(temp4[['ts', 'temp4']], on='ts')
data = data.merge(temp5[['ts', 'temp5']], on='ts')
data.head()

Unnamed: 0,ts,power12,power3,power4,temp1,temp2,temp3,temp4,temp5
0,2018-12-31 20:00:00+00:00,53.46134,87.796478,105.498682,3.011382,4.319544,3.913138,4.772481,3.598558
1,2018-12-31 21:00:00+00:00,54.46413,74.151459,105.017897,3.43944,4.909789,4.428925,5.561646,4.411865
2,2018-12-31 22:00:00+00:00,54.191047,70.918662,105.600836,3.637795,5.066491,4.603145,5.854397,4.684823
3,2018-12-31 23:00:00+00:00,54.243952,69.949829,106.664194,3.694362,5.293024,4.542157,5.981506,4.747482
4,2019-01-01 00:00:00+00:00,53.277664,68.645953,106.001002,3.73937,5.414588,4.739108,6.081391,4.883813


In [4]:
#merge data with input and output temperatures
data = data.merge(temp_in[['ts', 'temp_in']], on='ts')
data = data.merge(temp_out[['ts', 'temp_out']], on='ts')
data.head()

Unnamed: 0,ts,power12,power3,power4,temp1,temp2,temp3,temp4,temp5,temp_in,temp_out
0,2018-12-31 20:00:00+00:00,53.46134,87.796478,105.498682,3.011382,4.319544,3.913138,4.772481,3.598558,45.34601,79.611295
1,2018-12-31 21:00:00+00:00,54.46413,74.151459,105.017897,3.43944,4.909789,4.428925,5.561646,4.411865,45.350819,80.416664
2,2018-12-31 22:00:00+00:00,54.191047,70.918662,105.600836,3.637795,5.066491,4.603145,5.854397,4.684823,45.396163,80.992841
3,2018-12-31 23:00:00+00:00,54.243952,69.949829,106.664194,3.694362,5.293024,4.542157,5.981506,4.747482,45.40238,81.630635
4,2019-01-01 00:00:00+00:00,53.277664,68.645953,106.001002,3.73937,5.414588,4.739108,6.081391,4.883813,45.440688,81.685051


In [5]:
data['ts'] = pd.to_datetime(data['ts'])

### Task 2: Clean data
Check whether the data files are aligned, if there are some outliers, missing data, etc. Based on your findings suggest and implement a solution.

We are going to remove invalid records and split the data frame at gap borders.

In [6]:
print (len(data))

13829


In [7]:
data = data[data['temp_in'] != 0.0]
print (len(data))

13104


In [8]:
from datetime import timedelta

#identify gaps in the data
data['gap'] = (data['ts'] - data['ts'].shift(1)) != timedelta(hours=1)

In [9]:
# create a list of data frames based on the gap positions

attributes = ['power12', 'power3', 'power4', 'temp1', 'temp2', 'temp3', 'temp4', 'temp5', 'temp_in', 'temp_out']

dflist = []

start = 0
for stop in range(1, len(data)):
    if data.iloc[stop]['gap']:
        dflist.append(data[start:stop][attributes])
        start = stop
len(dflist)

23

##Task 3: Create time series data



In [10]:
w = 5 #window size
s = 1 #step
X_all = []
y_all = []

In [11]:
for df in dflist:
    for i in range(0, len(df)-w-1, s):
        X_all.append(df[i:i+w].values)
        y_all.append(df.iloc[i+w]['temp_in'])

In [12]:
import numpy as np

X_all = np.array(X_all)
y_all = np.array(y_all)

print (X_all.shape)
print (y_all.shape)

(12976, 5, 10)
(12976,)


##Task 4: Split data into validation and train data sets and train an ML model

Hints: 
 - Before training an ML model, suggest and implement some baseline
 - As the data is not big enough, try to use cross validation
 - A recurrent neural network is not necessarily the best model for this task

We are going to implement a GBT on short sequences evaluated using cross validation.

In [13]:
def rmse(predictions, true_values):
    return np.sqrt(((predictions - true_values) ** 2).mean())

In [14]:
import random
from sklearn.ensemble import GradientBoostingRegressor


mean_baseline = []
last_values_baseline = []
ml_model = []
iters = 20
train_size = int(0.9*len(X_all))

X_all_flat = X_all.reshape(X_all.shape[0], -1)
y_all_baseline = X_all_flat[:, -2]

for i in range(iters):
    all_data = list(zip(X_all_flat, y_all, y_all_baseline))
    random.shuffle(all_data)
    X_all_flat_rand, y_all_rand, y_all_rand_baseline = zip(*all_data)
    X_train = np.array(X_all_flat_rand[:train_size])
    y_train = np.array(y_all_rand[:train_size])
    X_test = np.array(X_all_flat_rand[train_size:])
    y_test = np.array(y_all_rand[train_size:])
    y_baseline = np.array(y_all_rand_baseline[train_size:])
    
    print ("Training iteration {}.".format(i+1))
    regr = GradientBoostingRegressor(n_estimators=500)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    
    m = np.mean(y_train)
    y_mean = np.array([m for i in range(len(y_test))])
    
    mean_baseline.append(rmse(y_mean, y_test))
    last_values_baseline.append(rmse(y_baseline, y_test))
    ml_model.append(rmse(y_pred, y_test))

Training iteration 1.
Training iteration 2.
Training iteration 3.
Training iteration 4.
Training iteration 5.
Training iteration 6.
Training iteration 7.
Training iteration 8.
Training iteration 9.
Training iteration 10.
Training iteration 11.
Training iteration 12.
Training iteration 13.
Training iteration 14.
Training iteration 15.
Training iteration 16.
Training iteration 17.
Training iteration 18.
Training iteration 19.
Training iteration 20.


In [15]:
print ("Mean baseline: {} +- {}.".format(np.mean(mean_baseline), np.std(mean_baseline)))
print ("Last values baseline: {} +- {}.".format(np.mean(last_values_baseline), np.std(last_values_baseline)))
print ("ML model: {} +- {}.".format(np.mean(ml_model), np.std(ml_model)))

Mean baseline: 3.712397637097331 +- 0.10373306678078355.
Last values baseline: 0.9189555404941198 +- 0.27558580279085165.
ML model: 0.8754540195163033 +- 0.21807511212629327.
