Code preliminaries

In [1]:
import  os

import numpy as np
import pandas as pd


from create_df_larger import read_df_larger
df, dfc, all_homes, appliance_min, national_average = read_df_larger()

df = df.rename(columns={'house_num_rooms':'num_rooms',
                        'num_occupants':'total_occupants',
                        'difference_ratio_min_max':'ratio_difference_min_max'})
K_min, K_max = 1,6
F_min, F_max=1,8

from all_functions import *
from features_larger import *

In [2]:
appliance="hvac"
month=6
all_homes = df['%s_%d' %(appliance, month)].dropna().index

Selecting a small subset of the data.

In [3]:
subset = df.head(15)

In [4]:
subset_hvac = subset[['%s_%d' %(appliance, month) for month in range(5, 11)]].dropna()

Considering only the HVAC consumption from these few homes

In [6]:
subset_hvac

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,242.395737,543.781433,761.394043,902.418823,610.035828,395.169098
59,77.613205,197.951508,457.950378,700.443054,244.340973,81.064682
94,19.836948,66.621819,126.121681,155.045364,92.979385,33.39275
101,191.965454,325.858032,462.708923,485.186005,350.491455,192.567566
222,478.951691,896.467285,1077.890137,1139.740479,887.044739,584.328979
364,236.925064,535.010376,736.468933,850.080811,642.858459,378.799713
545,350.414551,778.204224,1065.381104,1120.907593,788.107544,375.30304
624,72.737854,191.232803,307.978149,389.598602,239.580887,22.099518
661,221.594315,648.794067,942.087769,1069.414429,756.477905,399.598877
871,84.128731,231.698425,403.408813,514.838928,319.413025,140.866745


Description of the various attributes. Columns represent energy usage in months 5 through 10.

In [7]:
subset_hvac.describe()

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
count,14.0,14.0,14.0,14.0,14.0,14.0
mean,211.506496,419.086451,636.733641,737.874454,520.313531,264.495624
std,154.569877,246.81185,351.783626,377.517354,327.900609,171.895097
min,19.836948,66.621819,126.121681,155.045364,92.979385,22.099518
25%,79.242086,206.388237,354.939331,455.239014,291.055527,129.153437
50%,219.673515,386.361679,533.972839,691.79361,401.385147,261.240059
75%,241.028069,541.588669,896.914337,1027.665527,728.073044,391.076752
max,512.822205,896.467285,1300.884033,1507.603027,1297.922485,584.328979


Now, making one entry missing (Home 26, month #7) by putting NaN. True energy for this <home, appliance> is 761 units.

In [8]:
subset_hvac.loc[26, 'hvac_7']=np.NAN

Normalising each column to be on the range (0, 1)

In [9]:
col_max = subset_hvac.max()
col_min = subset_hvac.min()

In [10]:
col_max

hvac_5      512.822205
hvac_6      896.467285
hvac_7     1300.884033
hvac_8     1507.603027
hvac_9     1297.922485
hvac_10     584.328979
dtype: float64

Just checking if the normalisation is working correctly or not

In [11]:
(subset_hvac['hvac_5']-col_min['hvac_5'])/(col_max['hvac_5']-col_min['hvac_5'])

26      0.451451
59      0.117197
94      0.000000
101     0.349155
222     0.931295
364     0.440354
545     0.670563
624     0.107307
661     0.409256
871     0.130413
946     0.401464
1169    0.002558
1283    1.000000
1310    0.432097
Name: hvac_5, dtype: float64

Seems to be working great. All the numbers are in the range(0, 1). Now, doing this for the entire matrix.

In [12]:
subset_hvac_normalised = subset_hvac.copy()
for col in subset_hvac.columns:
    subset_hvac_normalised[col] = (subset_hvac[col]-col_min[col])/(col_max[col]-col_min[col])

In [13]:
subset_hvac_normalised

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,0.451451,0.574998,,0.552563,0.429113,0.663554
59,0.117197,0.158258,0.282465,0.403234,0.125617,0.104877
94,0.0,0.0,0.0,0.0,0.0,0.020087
101,0.349155,0.312391,0.286515,0.244086,0.213713,0.3032
222,0.931295,1.0,0.81018,0.728024,0.659007,1.0
364,0.440354,0.564429,0.51955,0.513868,0.456353,0.634439
545,0.670563,0.857488,0.799531,0.714101,0.576897,0.62822
624,0.107307,0.150162,0.154803,0.173415,0.121667,0.0
661,0.409256,0.701543,0.69458,0.67603,0.550647,0.671433
871,0.130413,0.198925,0.236037,0.26601,0.187921,0.211243


Employing matrix factorisation now

In [14]:
import numpy
numpy.random.seed(0)
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j]>0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = numpy.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T

Choosing K=2 latent features

In [15]:
R = subset_hvac_normalised.values
N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)

In [16]:
pred_df = pd.DataFrame(numpy.dot(nP, nQ.T), index=subset_hvac_normalised.index, columns=subset_hvac_normalised.columns)

Reconstructed matrix on (0, 1) scale

In [18]:
pred_df

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,0.525125,0.54611,0.56449,0.539838,0.469831,0.557974
59,0.195456,0.203097,0.212085,0.204079,0.176932,0.206261
94,0.213156,0.219759,0.251548,0.254802,0.214037,0.210379
101,0.271361,0.280902,0.306959,0.303246,0.258664,0.27737
222,0.849052,0.887698,0.857489,0.784963,0.702192,0.941851
364,0.51743,0.540505,0.528151,0.487253,0.433737,0.569973
545,0.675099,0.700375,0.745619,0.725707,0.624736,0.703016
624,0.134283,0.13935,0.14785,0.143617,0.123787,0.140166
661,0.57123,0.589567,0.666605,0.670882,0.565774,0.569187
871,0.204976,0.213451,0.217021,0.205434,0.179937,0.220186


In [19]:
pred_26_7 = pred_df.loc[26, 'hvac_7']

In [20]:
(col_max['hvac_7']-col_min['hvac_7'])*pred_26_7 + col_min['hvac_7']

789.26356979335708

Great, our method predicts energy usage to be 790 units when true consumption was 760 units. Error of 30 units only, which means the method worked really great here.

Now, I'll try to remove more entries from the matrix. Specifically, I'll remove an entry from the same month for a different home. I'll remove the entry for home #1169. Since, this entry is not the lowest one, I don't need to normalise again from scratch. The true energy for this home is ~330 units.

In [21]:
subset_hvac_normalised.loc[1169, 'hvac_7'] = np.NAN

Confirming that we have only 2 missing values in the matrix.

In [26]:
assert(subset_hvac_normalised.isnull().sum().sum()==2)

In [27]:
numpy.random.seed(0)
R = subset_hvac_normalised.values
N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)

In [28]:
pred_df = pd.DataFrame(numpy.dot(nP, nQ.T), index=subset_hvac_normalised.index, columns=subset_hvac_normalised.columns)

In [29]:
pred_df

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,0.525252,0.546342,0.565182,0.540056,0.470007,0.558244
59,0.195344,0.20301,0.212359,0.204149,0.176959,0.206131
94,0.213187,0.219816,0.252969,0.255115,0.214288,0.210422
101,0.271078,0.280661,0.307573,0.302925,0.258419,0.277217
222,0.849419,0.888274,0.856032,0.785051,0.702246,0.942496
364,0.517585,0.540781,0.527469,0.487283,0.433748,0.570288
545,0.674622,0.699993,0.746862,0.725564,0.624575,0.702636
624,0.134171,0.139255,0.148077,0.1436,0.123757,0.140059
661,0.570436,0.588821,0.668958,0.670553,0.565474,0.568465
871,0.204926,0.213444,0.216974,0.205323,0.17985,0.220218


In [30]:
pred_1169_7 = pred_df.loc[1169, 'hvac_7']

In [31]:
(col_max['hvac_7']-col_min['hvac_7'])*pred_1169_7 + col_min['hvac_7']

320.40357004031023

We predict the usage to be 320 units when the actual consumption is about 330 units. Again, fairly good recommendation!

Now, making the matrix much more sparse. 

In [48]:
subset_hvac_sparse = subset_hvac.copy()

In [49]:
for col in subset_hvac_sparse.columns:
    for row in subset_hvac_sparse.index:
        if numpy.random.random()>0.6:
            subset_hvac_sparse.loc[row, col]=np.NaN

In [51]:
subset_hvac_sparse.isnull().sum().sum()

35

We now have 35 missing entries in the marix.

Month wise #missing entries is 

In [53]:
subset_hvac_sparse.isnull().sum()

hvac_5     7
hvac_6     5
hvac_7     3
hvac_8     6
hvac_9     8
hvac_10    6
dtype: int64

Home wise #missing entries is

In [55]:
subset_hvac_sparse.isnull().sum(axis=1).describe()

count    14.000000
mean      2.500000
std       1.506397
min       0.000000
25%       2.000000
50%       3.000000
75%       3.000000
max       5.000000
dtype: float64

In [56]:
subset_hvac_sparse

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,242.395737,543.781433,,902.418823,,
59,,,457.950378,,,
94,,66.621819,126.121681,,92.979385,33.39275
101,191.965454,,462.708923,,350.491455,
222,,,,,887.044739,
364,236.925064,535.010376,736.468933,850.080811,642.858459,378.799713
545,350.414551,,1065.381104,1120.907593,,375.30304
624,,,307.978149,389.598602,,22.099518
661,221.594315,648.794067,,,756.477905,
871,84.128731,231.698425,403.408813,514.838928,,140.866745


Now, normalising this matrix.

In [57]:
col_max = subset_hvac_sparse.max()
col_min = subset_hvac_sparse.min()

In [58]:
subset_hvac_sparse_normalised = subset_hvac_sparse.copy()
for col in subset_hvac.columns:
    subset_hvac_sparse_normalised[col] = (subset_hvac_sparse[col]-col_min[col])/(col_max[col]-col_min[col])

In [59]:
subset_hvac_sparse_normalised

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,0.59435,0.819619,,0.458692,,
59,,,0.282465,,,
94,,0.0,0.0,,0.0,0.026268
101,0.404966,,0.286515,,0.324296,
222,,,,,1.0,
364,0.573806,0.804553,0.51955,0.411879,0.692486,0.829691
545,1.0,,0.799531,0.65412,,0.821558
624,,,0.154803,0.0,,0.0
661,0.516233,1.0,,,0.835572,
871,0.0,0.283553,0.236037,0.112021,,0.276255


In [60]:
numpy.random.seed(0)
R = subset_hvac_sparse_normalised.values
N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)

In [61]:
pred_df = pd.DataFrame(numpy.dot(nP, nQ.T), index=subset_hvac_normalised.index, columns=subset_hvac_normalised.columns)

In [62]:
pred_df

Unnamed: 0,hvac_5,hvac_6,hvac_7,hvac_8,hvac_9,hvac_10
26,0.626751,0.725728,0.574429,0.507956,0.723713,0.715155
59,0.446145,0.471976,0.372976,0.25586,0.509112,0.500437
94,0.166016,0.232958,0.184941,0.231032,0.197225,0.197316
101,0.331219,0.432536,0.343024,0.384554,0.389111,0.387425
222,0.801248,0.779461,0.614957,0.297987,0.905082,0.885557
364,0.650821,0.702224,0.555131,0.40575,0.744536,0.732676
545,0.817796,0.958538,0.75886,0.690261,0.945887,0.935392
624,0.121631,0.140518,0.111219,0.097817,0.140404,0.138725
661,0.642869,0.87037,0.690618,0.819481,0.759418,0.757931
871,0.283462,0.285528,0.225425,0.128576,0.321522,0.31518


Now, let us pick up a few entries. Home #59 had 4/5 missing entries!

In [69]:
home_pred = {}
home = 59
home_pred[home] = {}
for month in range(5, 11):
    home_pred[59][month] = (col_max['hvac_%d' %month]-col_min['hvac_%d' %month])*pred_df.loc[home,'hvac_%d'%month] +col_min['hvac_%d' %month]

In [74]:
pred_59 = pd.DataFrame(home_pred).squeeze()
gt_59 = subset_hvac.ix[59]
gt_59.index=pred_59.index

Here's the GT and prediction for this home. 

In [76]:
pd.DataFrame({"gt":gt_59, "pred":pred_59})

Unnamed: 0,gt,pred
5,77.613205,202.93085
6,197.951508,341.393123
7,457.950378,564.280025
8,700.443054,675.651405
9,244.340973,497.247409
10,81.064682,237.247047


In [2]:
!pwd

/Users/nipunbatra/git/enerscale/code


Given that we had 1/6 records for home #59. Our predictions aren't all that bad.