In [2]:
import sys
# Print the interpreter version so the recorded outputs can be tied to a Python release.
print(sys.version)

3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]


In [3]:
from collections import namedtuple
import rx
import rxsci as rs
import rxsci.container.csv as csv
import distogram

# for train and plot
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
def namedtuple_to_df(d, columns=None):
    """Convert a namedtuple into a DataFrame with one row per field.

    Args:
        d: Object exposing ``_asdict()`` (e.g. a namedtuple instance).
        columns: Optional list of column labels forwarded to
            ``pd.DataFrame.from_dict``; when omitted pandas picks the
            default labels.

    Returns:
        A DataFrame whose index holds the field names and whose values are
        the corresponding field values.
    """
    fields = d._asdict()
    return pd.DataFrame.from_dict(fields, orient='index', columns=columns)

# Initialize Dataset

In [5]:
# Location of the CSV dataset passed to every csv.load_from_file call in this notebook.
# NOTE(review): absolute path — presumably a container/volume mount; adjust when running elsewhere.
dataset_path = '/opt/dataset/HomeC.csv'

In [6]:
# Line parser for the dataset: one (field name, type name) pair per CSV column.
# The field names declared here are the attributes read from each parsed row
# in the pipelines below (e.g. house_overall, temperature, pressure, wind_speed).
# NOTE(review): entries are presumably matched to CSV columns by position, so
# the order must follow the file's column order — confirm against the rxsci docs.
parser = csv.create_line_parser(
    dtype=[
        ('time', 'int'),
        ('use', 'float'),
        ('gen', 'float'),
        ('house_overall', 'float'),
        ('dishwasher', 'float'),
        ('furnace1', 'float'),
        ('furnace2', 'float'),
        ('home_office', 'float'),
        ('fridge', 'float'),
        ('wine_cellar', 'float'),
        ('garage_door', 'float'),
        ('kitchen_12', 'float'),
        ('kitchen_14', 'float'),
        ('kitchen_38', 'float'),
        ('barn', 'float'),
        ('well', 'float'),
        ('microwave', 'float'),
        ('living_room', 'float'),
        ('solar', 'float'),
        ('temperature', 'float'),
        ('icon', 'str'),
        ('humidity', 'float'),
        ('visibility', 'float'),
        ('summary', 'str'),
        ('apparent_temperature', 'float'),
        ('pressure', 'float'),
        ('wind_speed', 'float'),
        # NOTE(review): kept as 'str' while the other weather measurements are
        # 'float' — presumably some rows hold non-numeric values; confirm.
        ('cloud_cover', 'str'),
        ('wind_bearing', 'float'),
        ('precip_intensity', 'float'),
        ('dew_point', 'float'),
        ('precip_probability', 'float'),
    ]
)

# Explore

In [20]:
%%time
# Single pass over the CSV: keep only house_overall from each parsed row and
# accumulate those values into one distribution, reused by the describe and
# histogram cells that follow.
dist = (
    csv.load_from_file(dataset_path, parser)
    .pipe(
        rs.ops.map(lambda row: row.house_overall),
        rs.math.dist.update(reduce=True),
    )
    .run()
)

CPU times: user 3.49 s, sys: 47.3 ms, total: 3.54 s
Wall time: 3.54 s


## Overall consumption

In [21]:
print("house overall consumption statistics:")

# Wrap the already-computed distribution in a one-item observable so the
# describe operator can be applied to it.
d = rx.just(dist).pipe(rs.math.dist.describe()).run()

# Last expression of the cell, so the resulting table is displayed.
namedtuple_to_df(d)

house overall consumption statistics:


Unnamed: 0,0
min,0.0
max,14.714567
mean,0.858962
stddev,1.057455
p25,0.367093
p50,0.568153
p75,0.973012


In [22]:
# Derive the histogram from the distribution and load it into a two-column
# (bin, count) DataFrame that plotly can consume.
df_hist = (
    rx.just(dist)
    .pipe(
        rs.math.dist.histogram(),
        rs.ops.map(lambda bins: pd.DataFrame(np.array(bins), columns=["bin", "count"])),
    )
    .run()
)

fig = px.bar(df_hist, x="bin", y="count", title="house consumption")
fig.update_layout(height=300)
fig.show()

## Multiple variable exploration

In [23]:
%%time
def _field_dist(field):
    """Build the sub-pipeline that accumulates the distribution of one row field."""
    return rx.pipe(
        rs.ops.map(lambda row: getattr(row, field)),
        rs.math.dist.update(reduce=True),
    )


# One pass over the CSV: tee_map hands the rows to each sub-pipeline, so `dist`
# ends up holding one distribution per field, in the order listed here.
dist = csv.load_from_file(dataset_path, parser).pipe(
    rs.ops.tee_map(
        _field_dist('house_overall'),  # graph 1
        _field_dist('temperature'),    # graph 2
        _field_dist('pressure'),       # graph 3
        _field_dist('wind_speed'),     # graph 4
    )
).run()

CPU times: user 5.04 s, sys: 15.3 ms, total: 5.06 s
Wall time: 5.06 s


In [24]:
# Labels for the four distributions held in `dist`, in the same order.
cols = ['house_overall', 'temperature', 'pressure', 'wind_speed']

for i, col in enumerate(cols):
    # Summary statistics table for this variable.
    d = rx.just(dist[i]).pipe(rs.math.dist.describe()).run()
    display(namedtuple_to_df(d, columns=[col]))

    # Histogram of the same distribution as a (bin, count) DataFrame.
    df_hist = rx.just(dist[i]).pipe(
        rs.math.dist.histogram(),
        rs.ops.map(lambda bins: pd.DataFrame(np.array(bins), columns=["bin", "count"])),
    ).run()

    fig = px.bar(df_hist, x="bin", y="count", title=col)
    fig.update_layout(height=300)
    fig.show()

Unnamed: 0,house_overall
min,0.0
max,14.714567
mean,0.858962
stddev,1.057455
p25,0.367093
p50,0.568153
p75,0.973012


Unnamed: 0,temperature
min,-11.45
max,93.72
mean,50.745252
stddev,19.101295
p25,35.774367
p50,50.377799
p75,66.263104


Unnamed: 0,pressure
min,986.4
max,1042.46
mean,1016.301625
stddev,7.893601
p25,1011.288848
p50,1016.528537
p75,1021.475115


Unnamed: 0,wind_speed
min,0.0
max,22.91
mean,6.649936
stddev,3.98222
p25,3.665239
p50,5.923372
p75,8.936889


# Feature engineering

In [1]:
%%time
# Needs the imports cell to have run first in this kernel (namedtuple, rx, rs, csv).
Features = namedtuple(
    'Features',
    ['label', 'pspeed_ratio', 'temperature', 'temperature_stddev'],
)
epsilon = 1e-5  # keeps the pspeed_ratio denominator non-zero when wind_speed is 0

data = csv.load_from_file(dataset_path, parser)


df = data.pipe(
    # Per-row features; temperature_stddev is only a placeholder here and is
    # overwritten once the rolling statistic is available.
    rs.ops.map(lambda row: Features(
        label=row.house_overall,
        pspeed_ratio=row.pressure / (row.wind_speed + epsilon),
        temperature=row.temperature,
        temperature_stddev=0.0,
    )),
    rs.state.with_memory_store(rx.pipe(
        rs.data.roll(
            window=6 * 60, stride=60,  # 6 hours window with 1 hour stride
            pipeline=rs.ops.tee_map(
                # Branch 0: the last Features item of the window.
                rx.pipe(
                    rs.ops.last(),
                ),
                # Branch 1: standard deviation of temperature over the window.
                rx.pipe(
                    rs.ops.map(lambda feat: feat.temperature),
                    rs.math.stddev(reduce=True),
                ),
            ),
        ),
    )),
    # Merge both branches: inject the window stddev into the last item.
    rs.ops.map(lambda pair: pair[0]._replace(temperature_stddev=pair[1])),
    rs.ops.to_pandas()
).run()

NameError: name 'namedtuple' is not defined

# Train model

In [33]:
# (rows, columns) of the engineered feature frame.
df.shape

(8399, 4)

In [36]:
# Preview the first five rows of the engineered features.
df.head(5)

Unnamed: 0,label,pspeed_ratio,temperature,temperature_stddev
0,0.343533,155.515841,34.99,0.340013
1,0.388517,176.039556,35.06,0.269735
2,0.29725,169.547296,34.83,0.191086
3,0.3903,183.008679,35.7,0.242327
4,0.65615,197.602728,36.92,0.487607


In [35]:
# Scatter plot of the temperature feature from the engineered frame.
fig = px.scatter(df, y='temperature')
fig.show()

In [30]:
# Ridge regression predicting house consumption (label) from the engineered features.
model = Ridge(alpha=0.3)

x = df[['pspeed_ratio', 'temperature', 'temperature_stddev']]
y = df['label']

# Hold out 20% of the rows for evaluation. random_state is fixed so that the
# split — and therefore the fitted model and the reported error — is the same
# on every Restart & Run All; without it each run draws a different split.
# NOTE(review): the rows come from overlapping 6-hour windows, so a shuffled
# split presumably leaks information between train and test — consider a
# time-ordered split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42,
)

# Last expression of the cell: fit returns the estimator, which is displayed.
model.fit(x_train, y_train)

Ridge(alpha=0.3)

In [31]:
# Evaluate on the held-out split: root of the mean squared error between the
# true labels and the model's predictions.
pred = model.predict(x_test)
mse = mean_squared_error(y_test, pred)
print(np.sqrt(mse))

0.9855535820277612


# Deployment

## Feature engineering

## Model Serving