In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each execution, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import os
import pathlib

import pandas as pd
import altair as alt

import plotly.express as px
alt.data_transformers.disable_max_rows()

from psp.data import C, trim_pv, filter_rows

In [None]:
# Setting the working directory by hand is tedious, so the Makefile exports the
# desired directory through the `CWD` environment variable.
CWD = os.getenv('CWD')
# Unset (or empty) means: stay in the directory the kernel was started from.
if CWD: os.chdir(CWD)

In [None]:
# Show the current working directory; the relative `data/...` paths used below resolve against it.
%pwd

In [None]:
# Main dataset, read from a path relative to the working directory.
# NOTE(review): presumably readings at 5-minute resolution, going by the file name — confirm.
df5 = pd.read_parquet('data/5min.parquet')

In [None]:
# Load already prepared sampled datasets.
# See `psp/scripts/simplify_data.py`.
# Every entry `5min_<suffix>` found in `data/5min` is bound to a notebook-level
# variable named `df5_<suffix>`; later cells refer to these variables by name.
dir_ = pathlib.Path('./data/5min')
# `iterdir` yields entries in arbitrary order: sort for a reproducible load/print order.
for f in sorted(dir_.iterdir()):
    # Skip hidden entries (e.g. `.DS_Store`, `.ipynb_checkpoints`): they are not datasets
    # and would make `read_parquet` fail.
    if f.name.startswith('.'):
        continue
    df = pd.read_parquet(f)
    name = 'df5_' + f.stem.replace('5min_', '')
    # Bind through `globals()`: writing into the dict returned by `locals()` is not a
    # supported way to create variables.
    globals()[name] = df
    print(f'{name}: {len(df)}')

In [None]:
# Metadata table; its id and latitude columns are joined onto the readings further down.
meta = pd.read_csv('./data/metadata.csv')

In [None]:
# Number of distinct `ss_id` values in `df5`.
len(df5['ss_id'].unique())

In [None]:
# Number of data points (rows) in `df5`.
print(len(df5))

In [None]:
# Histogram of the power values in `df5_10k`, binned over [0, max_] with `num_bins` bins.
max_ = 100
num_bins = 10
steps = max_ / num_bins  # Width of a single bin.
print(steps)

data = df5_10k.copy()
display(data.head())

power_axis = alt.X(C.POWER, bin=alt.Bin(extent=[0, max_], step=steps))
alt.Chart(data).mark_bar().encode(x=power_axis, y='count()')

In [None]:
# Small multiples: one power histogram per id in `df5_100_1M`, laid out 16 per row.
data = df5_100_1M.copy()
max_ = 1000
num_bins = 20

facet_hist = alt.Chart(data).mark_bar().encode(
    x=alt.X(C.POWER, bin=alt.Bin(extent=[0, max_], step=max_ // num_bins), title=''),
    y=alt.Y('count()', title=''),
    facet=alt.Facet(C.ID, columns=16),
)
# Every facet gets its own x and y scale instead of sharing one across the grid.
facet_hist.resolve_scale(x='independent', y='independent').properties(width=50, height=50)

In [None]:
# Summary statistics of the power readings for each system in `df5_1M`.
# The per-system maximum power is the one of particular interest.
# Selecting the power column before aggregating gives flat column names
# ('mean', 'std', ...) right away, indexed by system id.
data = df5_1M.groupby(C.ID)[C.POWER].agg(['mean', 'std', 'max', 'min', 'count'])

ss_stats = data
ss_stats.head()

In [None]:
# Distribution, across systems, of the number of data points per system.
data = ss_stats.reset_index()
alt.Chart(data).mark_bar().encode(
    x=alt.X('count', bin=alt.Bin(maxbins=100)),
    y='count()',
)

In [None]:
# Distribution, across systems, of the mean power per system.
data = ss_stats.reset_index()
alt.Chart(data).mark_bar().encode(
    x=alt.X('mean', bin=alt.Bin(maxbins=100)),
    y='count()',
)

In [None]:
# Distribution, across systems, of the maximum power per system.
data = ss_stats.reset_index()
alt.Chart(data).mark_bar().encode(
    x=alt.X('max', bin=alt.Bin(maxbins=100)),
    y='count()',
)

In [None]:
# Scatter each system's maximum power against its mean power and overlay a
# least-squares line through the origin (max ≈ coef * mean).
from sklearn.linear_model import LinearRegression

data = ss_stats
# Upper end of the displayed x range; points with a larger mean are clamped onto it.
max_mean = 150

chart = alt.Chart(data).mark_point().encode(
    x=alt.X('mean', scale=alt.Scale(domain=[0, max_mean], clamp=True)),
    y='max',
)

# No intercept: the line is forced through the origin.
model = LinearRegression(fit_intercept=False)
model.fit(data[['mean']], y=data['max'])

# The frame used for prediction must carry the same feature name ('mean') as the one
# used for fitting: scikit-learn validates feature names and warns or raises on a mismatch.
line_data = pd.DataFrame({'mean': [0, max_mean]})
line_data['max'] = model.predict(line_data[['mean']])

# Same field names as the scatter plot, so the layered chart shares axes and titles.
line = alt.Chart(line_data).mark_line(color='red').encode(x='mean', y='max')

display(chart + line)
# Slope of the fitted line.
print(model.coef_)

In [None]:
# # Let's use that linear model to compute a capacity for each ss_id.
# data = df5_10k
# data = data[[C.ID, 'power']].groupby(C.ID).mean()
# data['capacity'] = model.coef_[0] * data['power']
# data = data.drop(columns='power')

# capacities = data
# capacities.head()

In [None]:
# Power summed per system over consecutive windows of `group_days` days, rescaled into
# an `energy` column and drawn as one small line chart per system.
group_days = 14
window = pd.Grouper(freq=f'{group_days}D', key=C.DATE)

data = df5_100
data = data[[C.ID, C.DATE, C.POWER]].groupby([C.ID, window]).sum().reset_index()
# NOTE(review): the 5 / 60 factor presumably turns 5-minute power samples into energy
# (hours per sample); dividing by `group_days` then gives a per-day average — confirm.
data['energy'] = data[C.POWER] *  5 / 60 / group_days
display(data.head())

data[C.DATE] = pd.to_datetime(data[C.DATE])

# Facet labels are hidden (font size 0) to keep the grid compact.
facet_by_id = alt.Facet(
    C.ID,
    title='',
    header=alt.Header(title=None, labelFontSize=0),
    columns=12,
)
main = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X(C.DATE, title=''),
        y=alt.Y('energy', title=''),
        facet=facet_by_id,
    )
    .properties(height=30, width=100)
)
main

In [None]:
# Prepare daytime profiles from `df5_100`: filter the readings, average them over
# 30-minute windows, keep a window of days in the middle of one year and attach a
# latitude-based sort key to every row.
data = df5_100.copy()
# NOTE(review): `filter_rows` presumably keeps the rows where the mask is True — confirm.
# Masks: power above 0.2, and hour of day within 5..20.
data = filter_rows(data, data[C.POWER] > 0.2)
data = filter_rows(data, data[C.DATE].dt.hour > 4)
data = filter_rows(data, data[C.DATE].dt.hour < 21)

# Down-sample: average each system's readings over 30-minute windows.
data = (
    data
    .groupby([C.ID, pd.Grouper(freq='30min', key=C.DATE)])
    .mean()
    .reset_index()
)
data['day'] = data[C.DATE].dt.dayofyear

# Keep `num_days` days centred on the middle of `year`.
year = 2019
num_days = 50
offset = (365 - num_days) // 2
in_window = (
    (data[C.DATE].dt.year == year)
    & (data['day'] > offset)
    & (data['day'] < 365 - offset)
)
data = data[in_window]
display(data.head())

# Hack to sort by latitude: prefix the id with the latitude, so that ordering the
# resulting strings orders the systems by latitude.
data = data.join(meta[[C.ID, C.LAT]].set_index(C.ID), on=C.ID)
# Built column-wise instead of with a row-wise `apply`: much faster, and an integer id
# is not upcast to float together with the latitude (it would render as e.g. `1234.0`).
data['new_id'] = data[C.LAT].astype(str) + '-' + data[C.ID].astype(str)
display(data.head())

In [None]:
# Grid of tiny line charts: one row per `new_id` (sorted descending), one column per
# `day`, with the time of day on the x axis. Axes, headers and view borders are hidden
# and the spacing is negative so that the charts pack tightly.
row_facet = alt.Row(
    'new_id',
    header=alt.Header(title=None, labelFontSize=0),
    title='',
    spacing=-10,
    sort='descending',
)
column_facet = alt.Column(
    'day',
    header=alt.Header(title=None, labelFontSize=0),
    spacing=-10,
)
main = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X(f'hoursminutes({C.DATE})', title='', axis=None),
        y=alt.Y(C.EFF, title='', axis=None),
        row=row_facet,
        column=column_facet,
    )
    .properties(height=10, width=24)
    .configure_view(strokeWidth=0)
)
main