# Scikit-learn

In [None]:
import utils
import pandas as pd
import glob

# glob.glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# order in which files are handed to the loader identical on every run.
filenames = sorted(glob.glob('data/neighborhood/outputFiles/*.csv'))
# utils.read_dir is a project helper that receives the full list of CSV paths.
# NOTE(review): presumably returns a single time-indexed DataFrame — confirm.
df = utils.read_dir(filenames)
df.shape

We need the values from a single house.

In [None]:
# Collect every column name that ends with the ':house_0' suffix.
cols = list(filter(lambda name: name.endswith(':house_0'), df.columns))
cols

Clean up the columns a bit.

In [None]:
# Narrow the frame to the selected columns, then remove the 'house:' and
# ':house_0' substrings from every remaining column name.
df = df[cols]
df.columns = list(map(
    lambda name: name.replace('house:', '').replace(':house_0', ''),
    df.columns,
))

# Show the cleaned-up column names.
list(df.columns)

What are we looking at?

In [None]:
# Plot hourly means of the two temperatures and the HVAC power.
# In pandas >= 0.18 `resample` returns a lazy Resampler, so an explicit
# aggregation (`.mean()`) is required before the result can be plotted.
# The three columns are selected first so only the plotted series are aggregated.
(
    df[['air_temperature', 'outdoor_temperature', 'hvac_power.real']]
    .resample('60min')
    .mean()
    # hvac_power.real is drawn against its own secondary y-axis.
    .plot(secondary_y='hvac_power.real')
)

## Modeling

We want to predict the `hvac_power.real` required to maintain a given temperature, using these inputs:

- air_temperature
- previous air_temperature
- outdoor_temperature


We can use this simple model to optimize a home's smart thermostat.

In [None]:
# Aggregate to hourly means. In pandas >= 0.18 `resample` alone yields a
# Resampler, which supports neither column assignment nor dropna, so the
# aggregation must be explicit.
# NOTE(review): if df can hold non-numeric columns, newer pandas may need
# `.mean(numeric_only=True)` here — confirm against the loaded data.
tmp = df.resample('60min').mean()
# Lag feature: the air temperature of the preceding hourly row.
tmp['previous_air_temperature'] = tmp['air_temperature'].shift(1)
# Drop rows with any missing value (this includes the first row, whose lag is
# NaN). Reassign instead of mutating in place so the step reads as a pipeline.
tmp = tmp.dropna()

Let's create a simple linear model.

In [None]:
from sklearn.linear_model import Ridge

# Target: the HVAC power column.
y = tmp['hvac_power.real']
# Features: current, outdoor and previous air temperature, as a plain ndarray.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
X = tmp[['air_temperature','outdoor_temperature', 'previous_air_temperature']].to_numpy()

# The `normalize` keyword was removed from Ridge in scikit-learn 1.2. The
# original passed normalize=False, which was the default, so omitting it keeps
# the same model. The intercept is deliberately not fitted.
model = Ridge(fit_intercept=False)
model.fit(X,y)

In [None]:
# Predictions for the same feature matrix that is scored below.
y_pred = model.predict(X)
# Print the model's score on (X, y).
fit_score = model.score(X, y)
print(fit_score)

In [None]:
import matplotlib.pyplot as plt

# Scatter the observed target (x-axis) against the model's predictions (y-axis).
plt.plot(y, y_pred, 'o')
# Label the axes and title the figure so it can be read on its own.
plt.xlabel('Observed hvac_power.real')
plt.ylabel('Predicted hvac_power.real')
plt.title('Ridge model: predicted vs. observed');