In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import os
import pathlib

import pandas as pd
import altair as alt
import numpy as np

import plotly.express as px

alt.data_transformers.disable_max_rows()

from psp.data import C, trim_pv, filter_rows, get_max_power_for_time_of_day


def _(df, *args, **kwargs):
    """Print the number of rows of `df`, then display its first rows.

    Any extra positional or keyword arguments are forwarded to `df.head`
    (e.g. `_(df, 20)` shows the first 20 rows).
    """
    n_rows = len(df)
    print(n_rows)
    preview = df.head(*args, **kwargs)
    display(preview)

In [None]:
# Setting the working directory by hand is tedious, so it is read from the `CWD`
# environment variable (defined in the Makefile). Nothing happens when it is unset or empty.
CWD = os.getenv("CWD")
if CWD:
    os.chdir(CWD)

In [None]:
%pwd

In [None]:
df5 = pd.read_parquet("data/5min.parquet")

In [None]:
# Load the already prepared sampled datasets (see `psp/scripts/simplify_data.py`).
# Each selected parquet file is exposed as a notebook-level variable named
# `df5_<stem>`, where the "5min_" part of the file stem is removed.
# Only files whose stem contains "all" are loaded.
dir_ = pathlib.Path("./data/5min")
for f in dir_.iterdir():
    if "all" in f.stem:
        df = pd.read_parquet(f)
        name = f"df5_{f.stem.replace('5min_', '')}"
        # At cell level `locals()` is the notebook namespace, so this creates the variable.
        locals()[name] = df
        print(f"{name}: {len(df)}")

In [None]:
# meta = pd.read_csv("./data/metada_sensitive.csv")
meta = pd.read_csv("data/metada_sensitive.csv")
meta.head()

In [None]:
alt.Chart(meta).mark_bar().encode(
    x=alt.X("tilt", bin=alt.Bin(maxbins=100)), y="count()"
)

In [None]:
# Number of `ss_id`
len(df5["ss_id"].unique())

In [None]:
# Number of data points
print(len(df5))

In [None]:
# Histogram of the `C.POWER` values of the 10k sample, binned over [0, max_].
max_ = 100
num_bins = 10
steps = max_ / num_bins  # width of one bin
data = df5_10k.copy()
print(steps)
display(data.head())
power_bins = alt.Bin(extent=[0, max_], step=steps)
alt.Chart(data).mark_bar().encode(
    x=alt.X(C.POWER, bin=power_bins),
    y="count()",
)

In [None]:
# One small `C.POWER` histogram per value of `C.ID` (16 facets per row),
# each facet having its own x and y scales.
num_bins = 20
max_ = 1000
data = df5_100_1M.copy()
x_power = alt.X(
    C.POWER, bin=alt.Bin(extent=[0, max_], step=max_ // num_bins), title=""
)
y_count = alt.Y("count()", title="")
per_system = (
    alt.Chart(data)
    .mark_bar()
    .encode(x=x_power, y=y_count, facet=alt.Facet(C.ID, columns=16))
)
per_system.resolve_scale(x="independent", y="independent").properties(
    width=50, height=50
)

In [None]:
# Summary statistics of `C.POWER` for each `C.ID` (in particular the max power).
stat_names = ["mean", "std", "max", "min", "count"]
data = df5_1M[[C.ID, C.POWER]].groupby(C.ID).agg(stat_names)
# Aggregating with a list gives two-level columns (column, statistic):
# keep only the statistic name.
data.columns = data.columns.get_level_values(1)

ss_stats = data
ss_stats.head()

In [None]:
# Distribution of the per-system `count` statistic.
data = ss_stats.reset_index()
alt.Chart(data).mark_bar().encode(
    x=alt.X("count", bin=alt.Bin(maxbins=100)),
    y="count()",
)

In [None]:
# Distribution of the per-system `mean` statistic.
data = ss_stats.reset_index()
alt.Chart(data).mark_bar().encode(
    x=alt.X("mean", bin=alt.Bin(maxbins=100)),
    y="count()",
)

In [None]:
# Distribution of the per-system `max` statistic.
data = ss_stats.reset_index()
alt.Chart(data).mark_bar().encode(
    x=alt.X("max", bin=alt.Bin(maxbins=100)),
    y="count()",
)

In [None]:
# Scatter of the per-system max power against the mean power, with a linear fit
# through the origin drawn on top.
data = ss_stats
max_mean = 150
chart = (
    alt.Chart(data)
    .mark_point()
    .encode(x=alt.X("mean", scale=alt.Scale(domain=[0, max_mean], clamp=True)), y="max")
)

from sklearn.linear_model import LinearRegression

# Fit `max = coef * mean` (no intercept) across systems.
model = LinearRegression(fit_intercept=False)
model.fit(data[["mean"]], y=data["max"])

# Evaluate the fit at both ends of the plotted x range.
# The frame given to `predict` must use the same column name as the one used in
# `fit` ("mean"): scikit-learn validates feature names and rejects a mismatch.
line_data = pd.DataFrame({"mean": [0, max_mean]})
line_data["max"] = model.predict(line_data[["mean"]])

line = alt.Chart(line_data).mark_line(color="red").encode(x="mean", y="max")

display(chart + line)
print(model.coef_)

In [None]:
# # Let's use that linear model to compute a capacity for each ss_id.
# data = df5_10k
# data = data[[C.ID, 'power']].groupby(C.ID).mean()
# data['capacity'] = model.coef_[0] * data['power']
# data = data.drop(columns='power')

# capacities = data
# capacities.head()

In [None]:
# Energy per system over consecutive windows of `group_days` days.
group_days = 14
period = pd.Grouper(freq=f"{group_days}D", key=C.DATE)
# Sum the power of every record falling in the same (system, window).
data = df5_100[[C.ID, C.DATE, C.POWER]].groupby([C.ID, period]).sum().reset_index()
# Summed power * 5 / 60, divided by the number of days in a window
# (presumably: 5-minute samples converted to hours, giving a daily average — TODO confirm units).
data["energy"] = data[C.POWER] * 5 / 60 / group_days
display(data.head())

data[C.DATE] = pd.to_datetime(data[C.DATE])

# One tiny line chart per system, with the facet labels hidden.
system_facet = alt.Facet(
    C.ID, title="", header=alt.Header(title=None, labelFontSize=0), columns=12
)
main = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X(C.DATE, title=""),
        y=alt.Y("energy", title=""),
        facet=system_facet,
    )
    .properties(height=30, width=100)
)

main

In [None]:
# Bounding box (longitude / latitude) used to select systems from the metadata.
LON_RANGE = [-4.537402, -3.940503]
LAT_RANGE = [55.722169, 56.000524]

# Keep the systems strictly inside the box: latitude first, then longitude.
m = meta.copy()
for col, (low, high) in ((C.LAT, LAT_RANGE), (C.LON, LON_RANGE)):
    inside = (m[col] < high) & (m[col] > low)
    m = filter_rows(m, inside, "filter on " + col)
ss_box = m[C.ID].unique().tolist()
print(len(ss_box))

In [None]:
data = df5_glasgow

# Row filters, applied in order. Each mask is computed on the frame left by the
# previous filter; `filter_rows` receives (frame, mask, label).
row_filters = [
    (lambda frame: frame[C.DATE].dt.year == 2019, "2019"),
    (lambda frame: frame[C.POWER] > 0.2, "> 0.2"),
    (lambda frame: frame[C.DATE].dt.hour > 4, "after 4am"),
    (lambda frame: frame[C.DATE].dt.hour < 21, "before 9pm"),
]
for build_mask, label in row_filters:
    data = filter_rows(data, build_mask(data), label)

# Keep a window of days around the middle of the year (by day-of-year).
keep_days = 14
offset = (365 - keep_days) // 2
before = offset
after = 365 - offset
data["day"] = data[C.DATE].dt.dayofyear
in_window = (data["day"] > before) & (data["day"] <= after)
data = filter_rows(data, in_window, "N days")

display(data.head())


def coord_then_id(coord_col):
    """Build, for each row, the string `<coordinate with 2 decimals>00<integer id>`.

    Hack to get a per-system key that sorts by the given coordinate column.
    """
    return data[[coord_col, C.ID]].apply(
        lambda row: f"{row[coord_col]:.2f}00{int(row[C.ID])}", axis=1
    )


# Key to sort by latitude (kept as a string) ...
data["lat_id"] = coord_then_id(C.LAT)
# ... or by longitude. Longitudes can be negative and would not sort properly as
# strings, so the concatenated digits are converted to a float.
data["lon_id"] = coord_then_id(C.LON).astype(float)


display(data.head())

In [None]:
# from psp.data import join_efficiency
# data2 = join_efficiency(data)

In [None]:
# Grid of tiny efficiency curves over the time of day:
# one row per system (ordered by `lat_id`, descending) and one column per day.
row_by_system = alt.Row(
    "lat_id",
    header=alt.Header(title=None, labelFontSize=0),
    title="PV System",
    spacing=-10,
    sort="descending",
)
column_by_day = alt.Column(
    "day",
    header=alt.Header(title=None, labelFontSize=0),
    spacing=-10,
)
main = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X(f"hoursminutes({C.DATE})", title="", axis=None),
        y=alt.Y(C.EFF, title="", axis=None),
        row=row_by_system,
        column=column_by_day,
    )
    .properties(height=30, width=100)
    .configure_view(strokeWidth=0)
)
main

In [None]:
# Geographical distribution of the PV systems, coloured by log(kwp).
# The constant "one" column gives every marker the same size.
data = meta.assign(**{"log(kwp)": np.log(meta["kwp"]), "one": 1.0})
geo_options = dict(
    lon=C.LON,
    lat=C.LAT,
    fitbounds="locations",
    size="one",
    size_max=3,
    color="log(kwp)",
)
px.scatter_geo(data, **geo_options)

In [None]:
# Maximum efficiency for each (month, time of day), smoothed and plotted per year.
source = df5_1M
month_key = pd.Grouper(key=C.DATE, freq="1M")
time_key = source[C.DATE].dt.time
data = source[[C.EFF, C.DATE]].groupby([month_key, time_key]).max()
data.index = data.index.set_names(("month", "time"))
display(data.head())

# Rolling mean over 12 consecutive rows, computed separately inside each month.
data = data.groupby(level="month").rolling(12).mean()
# groupby + rolling prepends the group key to the index: drop that extra level.
data.index = data.index.droplevel(0)

display(data.head())

data = data.reset_index()

# Turn the time of day into a datetime so that Altair can apply a time unit to it.
data["time"] = pd.to_datetime(data["time"], format="%H:%M:%S")

month_color = alt.Color(
    "month(month):N", title="Month", scale=alt.Scale(scheme="viridis")
)
(
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("hoursminutes(time)", title=""),
        y=alt.Y("max(efficiency)", title=""),
        color=month_color,
        facet=alt.Facet("year(month)", title="", columns=2),
    )
    .properties(
        width=400,
        height=200,
    )
)

In [None]:
# Maximum efficiency for each (system, month, hour of day), one faint line per system.
source = df5_glasgow
month_key = pd.Grouper(key=C.DATE, freq="1M")
hour_key = source[C.DATE].dt.hour
data = source[[C.ID, C.EFF, C.DATE]].groupby([C.ID, month_key, hour_key]).max()

display(data.head())
data.index = data.index.set_names((C.ID, "month", "hour"))
display(data.head())

display(data.head())

data = data.reset_index()

display(data.head())

# A single-colour range draws every system in gray while keeping one line per system.
gray_per_system = alt.Color(C.ID + ":N", scale=alt.Scale(range=["gray"]))
(
    alt.Chart(data)
    .mark_line(opacity=0.2)
    .encode(
        x=alt.X("hour", title=""),
        y=alt.Y("efficiency", title=""),
        row=alt.Row("year(month)", title=""),
        column=alt.Column("month(month)"),
        color=gray_per_system,
    )
    .properties(
        width=100,
        height=100,
    )
)

In [None]:
# Scatter matrix of the efficiency of 10 systems against each other.

# Among the systems present in `df5_glasgow`, keep the first 10 when sorted by
# "latitude_rounded".
all_ids = df5_glasgow[C.id].unique().tolist()
meta_glas = meta[meta[C.id].isin(all_ids)]
keep_ids = meta_glas.sort_values("latitude_rounded").iloc[:10][C.id].tolist()

data = df5_glasgow[df5_glasgow[C.id].isin(keep_ids)].copy()

# Sample *the timestamps* (not the rows) so that every kept timestamp keeps all its systems.
# The sampling is seeded for reproducibility and done without replacement, so that up to
# 1000 distinct timestamps are kept (fewer only if the data has fewer).
unique_timestamps = data[C.date].unique()
rng = np.random.default_rng(0)
timestamps = rng.choice(
    unique_timestamps, size=min(1000, len(unique_timestamps)), replace=False
)

# `.copy()` so that the column assignment below does not act on a slice of the previous frame.
data = data[data[C.date].isin(timestamps)].copy()

# String ids, so they can be used as column / field names in the chart.
data[C.id] = data[C.id].astype(str)

display(data.head())
ids = data[C.id].unique().tolist()
# One column per system, one row per timestamp.
data = data.pivot(index=C.date, columns=C.id, values=C.eff).reset_index()
display(data.tail())

alt.Chart(data).mark_circle(opacity=0.6, size=5).encode(
    alt.X(alt.repeat("column"), type="quantitative", title=""),
    alt.Y(alt.repeat("row"), type="quantitative", title=""),
    color=alt.Color(f"month({C.date}):N", scale=alt.Scale(scheme="viridis")),
).properties(width=100, height=100).repeat(row=ids, column=ids)

In [None]:
print(df5_glasgow.dtypes)
# Let's compare correlations and distances.

# Keep the first 1000 distinct timestamps exactly as `unique()` returns them.
# Do not call `.tolist()` here: on a datetime64[ns] NumPy array it yields plain
# integers, which `isin` does not match against a datetime column.
timestamps = df5_glasgow[C.date].unique()[:1000]
data = df5_glasgow[df5_glasgow[C.date].isin(timestamps)]


ids = data[C.id].unique().tolist()
print(ids)
# One column per system, one row per timestamp.
data = data.pivot(index=C.date, columns=C.id, values=C.power)


# Pairwise correlation of the power between systems.
corrs = data.corr()

display(corrs.head())
from psp.gis import approx_distance

In [None]:
# (Stale duplicate save removed.)
# This cell called `distance.to_csv("data/pv_distances.csv")`: the name `distance`
# is never defined, and `distances` is only computed in a later cell, so the cell
# failed on a fresh kernel. The distances are saved to "data/pv_distances.csv" by
# the "Save!" cell that follows their computation.

In [None]:
# Plot the correlation between two systems against the distance between them.
# NOTE(review): `distances` is only assigned in later cells of this notebook, where it
# is a long-format frame (columns "ss_id1", "ss_id2", "distance"). The `.stack()` +
# `concat` below look like they expect a square system-by-system matrix aligned with
# `corrs` — confirm which `distances` this cell is meant to use.
c = corrs.stack()
d = distances.stack()
display(c.head())
display(d.head())

# remove distance of 0

# Put both stacked series side by side; the positional column labels 0 and 1 are renamed.
data = pd.concat([c, d], axis=1).rename(columns={0: "correlation", 1: "distance"})
data = data[data["distance"] > 0]
# data.head()
data.tail()

alt.Chart(data.reset_index(drop=True)).mark_circle().encode(
    x=alt.X("distance"),
    y="correlation",
).properties(width=800)

In [None]:
# Calculate distances between PV sites.
# This is slow to compute so we save the file for later reloading!
import itertools

import tqdm
from psp.gis import approx_distance

# Get the ss_id for which we have data.
ids = df5_1M[C.id].unique().tolist()
print(f"Distances for {len(ids)} systems")

# Restrict the metadata to those systems. (The loop previously iterated over `m`,
# a leftover from the bounding-box cell, and ignored `ids`.)
systems = meta.loc[meta[C.id].isin(ids), [C.id, C.lat, C.lon]]
# Plain (id, lat, lon) tuples are much cheaper to iterate than repeated `.iloc` lookups.
sites = list(systems.itertuples(index=False, name=None))

# One row per unordered pair of systems (each pair appears once, first id before second).
n_pairs = len(sites) * (len(sites) - 1) // 2
rows = []
for (id1, lat1, lon1), (id2, lat2, lon2) in tqdm.tqdm(
    itertools.combinations(sites, 2), total=n_pairs
):
    # `approx_distance` output divided by 1000
    # (presumably metres converted to kilometres — TODO confirm).
    d = approx_distance((lat1, lon1), (lat2, lon2)) / 1000
    rows.append((int(id1), int(id2), d))

distances = pd.DataFrame.from_records(rows, columns=["ss_id1", "ss_id2", "distance"])
distances.head()

In [None]:
# Save!
# NOTE I saved distances as integer by mistake, needs to rerun and resave and delete this comment.
# `index=False`: the row index carries no information, and writing it would add a
# spurious unnamed column when the file is read back with `pd.read_csv`.
distances.to_csv("data/pv_distances.csv", index=False)

In [None]:
# Load distances
distances = pd.read_csv("data/pv_distances.csv")

In [None]:
meta.head()

In [None]:
# The 50 closest pairs of systems, with the "kwp" of each member of the pair.
d = distances.sort_values("distance")[:50]
couples = d.copy()
print(d["distance"].max())

# Look up "kwp" by system id and attach it once per side of the pair, under an
# explicit name. (Previously the second join relied on `rsuffix="2"`, which is
# only applied on a name clash: the column ended up named "kwp", not "kwp2".)
kwp_by_id = meta.set_index(C.id)["kwp"]
couples = couples.join(kwp_by_id.rename("kwp1"), on="ss_id1")
couples = couples.join(kwp_by_id.rename("kwp2"), on="ss_id2")
_(couples)

In [None]:
print(len(d))

# Every system that appears on either side of one of the selected pairs.
ids = set(d["ss_id1"].tolist()).union(d["ss_id2"].tolist())

# All the distinct timestamps of the 1M sample.
timestamps = df5_1M[C.date].unique()
print(len(timestamps))

# From the full dataset, keep the records of those systems at those timestamps.
df = df5_all
at_sampled_time = df[C.date].isin(timestamps)
is_neighbor = df[C.id].isin(ids)
neighbor_data = df[at_sampled_time & is_neighbor].copy()
print(len(neighbor_data))

In [None]:
neighbor_data.head()

In [None]:
# neighbor_data[C.date] = neighbor_data[C.date].b

In [None]:
power = neighbor_data.sort_values([C.id, C.date]).set_index([C.id, C.date])[[C.power]]
power.head()

In [None]:
power.index.get_level_values(0).unique().tolist()

In [None]:
idx = power.index.get_level_values(0).unique().tolist()[:2]

power.loc[idx].tail()

In [None]:
capacity = get_max_power_for_time_of_day(power, radius=7, min_records=10)

In [None]:
# capacity.isnull().mean()

In [None]:
# cap = cap.sort_index()
# min_cap = min_cap.sort_index()

In [None]:
_(capacity)
# print(len(capacity.index.get_level_values(0).unique()))

In [None]:
# Smooth the capacity with a rolling mean: being a theoretical model, it should be smooth.
sm = (
    # Move the index of `capacity` into regular columns so that `C.id` and `C.date`
    # can be referenced by name (assumes they are the index levels — TODO confirm).
    capacity.reset_index()
    .groupby([C.id])
    # Per system: centred, time-based 1-hour window over `C.date`, closed on both
    # ends, producing a value only when at least 4 records fall in the window.
    .rolling("1h", on=C.date, center=True, min_periods=4, closed="both")
    .mean()
    # Add `C.date` as an extra index level ...
    .set_index([C.date], append=True)
    # ... and drop index level 1 (presumably the original row position left by
    # groupby/rolling — TODO confirm), then sort the resulting index.
    .reset_index(level=1, drop=True)
    .sort_index()
)


_(sm)

In [None]:
# cap.mean()

In [None]:
# p = power.loc[ids].sort_index()
# NOTE(review): `smooth_cap` and `smooth_min_cap` are not assigned in any cell of this
# notebook before this point, so this cell raises NameError on a fresh kernel. It
# presumably predates the smoothed frame being called `sm` — confirm, then update or delete.
smooth_cap = smooth_cap.loc[ids].sort_index()
smooth_min_cap = smooth_min_cap.loc[ids].sort_index()

In [None]:
# power per cap
# ppc = p / c
power = power.sort_index()

In [None]:
# Smoothed curve of system 6667 for every day of May 2020, one small panel per day.
data = sm.copy().reset_index()
when = data[C.date].dt
data = data[(data[C.id] == 6667) & (when.year == 2020) & (when.month == 5)]

daily_facet = alt.Facet(f"yearmonthdate({C.date})", columns=6)
alt.Chart(data).mark_line().encode(
    x=f"hoursminutes({C.date})",
    y=C.power,
    facet=daily_facet,
).properties(width=100, height=100)

In [None]:
# ppc.tail(5)

In [None]:
# NOTE(review): `cap2` is not assigned in any cell of this notebook before this point,
# so this cell raises NameError on a fresh kernel — confirm what it should refer to, or delete.
cap2.mean()

In [None]:
# Merge the datasets: start from the measured power and add `capacity` (the output of
# `get_max_power_for_time_of_day`) and its smoothed version `sm` as extra columns.
mer = power.copy()
# NOTE(review): `capacity` and `sm` are assigned as whole objects to a single column;
# this assumes each holds exactly one column and is indexed like `power` — TODO confirm.
mer["max"] = capacity
mer["smooth_max"] = sm
_(mer)

In [None]:
# Find the time at which we get our max capacity, for each ss_id for each day,
# then compare the median of that time per system with the panel orientation.

# Move index level 1 of `sm` into a column and derive the calendar day from `C.date`.
x = sm.copy().reset_index(1)
x["date"] = x[C.date].dt.date

# Sorting in descending order puts the largest `C.power` first inside each
# (system, day); `drop_duplicates` then keeps only that first row.
x = x.reset_index()
x = x.sort_values([C.id, "date", C.power], ascending=False)
x = x.drop_duplicates([C.id, "date"])

# Time of day of that row, as a fractional hour.
date_col = x[C.date].dt
x["time"] = date_col.hour + date_col.minute / 60

# Median over the days, per system.
x = x.drop(columns=[C.power, "date"]).groupby([C.id]).median()

# Attach the metadata of each system.
x = x.reset_index().join(meta.set_index(C.id), on=C.id)

_(x)

alt.Chart(x).mark_point().encode(
    x="time",
    y="orientation",
)

In [None]:
x[["time", "orientation"]].corr()

In [None]:
_(couples.reset_index(drop=True), 20)

In [None]:
# Pick one pair of close systems and keep a few days of 2020 for them.
idx = 12
row = couples.iloc[idx]
ss_id1, ss_id2 = row["ss_id1"], row["ss_id2"]
print(ss_id1)
print(ss_id2)
print(f'distance: {row["distance"]}')


data = mer.reset_index().copy()
_(data)

# Rows of the two systems, restricted to days `n` .. `n + 9` of 2020.
n = 100
pair_rows = data[data[C.id].isin([ss_id1, ss_id2])].copy()
when = pair_rows[C.date].dt
data = pair_rows.loc[(when.year == 2020) & when.dayofyear.isin(list(range(n, n + 10)))]

# Attach the coordinates of each system.
coordinates = meta.set_index(C.id)[[C.lat, C.lon]]
data = data.join(coordinates, on=C.id)
print("add")

print("data points for ss_id1")
_(data.groupby(C.id).count())

_(data)

# Add the elevation info (disabled: the branch below never runs).
if False:
    from pvlib.solarposition import get_solarposition

    def get_elevation(ts, lat, lon):
        """Return the "elevation" value given by `get_solarposition` for one timestamp and location."""
        return get_solarposition(ts, lat, lon)["elevation"].values[0]

    print(get_elevation(datetime.datetime.now(), 34, -34))
    print(len(data))
    display(data.head())
    data["elevation"] = data[[C.date, C.lat, C.lon]].apply(
        lambda row: get_elevation(row[C.date], row[C.lat], row[C.lon]), axis=1
    )

with_el = data
_(with_el)

In [None]:
# _(with_el)
# if False:
#     with_el['elevation'] = with_el['min_cap']
#     # Normalize elevation by the max for the max cap for the day
#     max_cap = with_el[[C.id, C.date, 'cap', 'elevation']].copy()
#     max_cap['date'] = max_cap[C.date].dt.date
#     #y = y.set_index([C.id, C.date]).sort_index()
#     max_cap = max_cap.groupby([C.id, 'date']).max()#.drop(columns=C.date)
#     max_cap['max_cap/max_el'] = max_cap['cap'] / max_cap['elevation']
#     #max_cap = max_cap.rename(columns={'cap': 'max_cap'}).drop(columns='timestamp')
#     max_cap = max_cap.drop(columns=['cap', 'elevation', C.date])
#     _(max_cap)

#     y = data.copy()
#     y['date'] = y[C.date].dt.date
#     #y = y.set_index([C.id, 'date'])
#     _(y)
#     y = y.join(max_cap, on=[C.id, 'date'])
#     y = y.drop(columns=['date'])
#     _(y)
# else:
#     # Get the max of `cap` and `min_cap for each day
#     t = with_el.copy()
#     t = t[[C.id, C.date, 'cap', 'min_cap']]
#     t['date'] = t[C.date].dt.date
#     t = t.drop(columns=[C.date])
#     t = t.groupby([C.id, 'date']).agg(['min', 'max'])
#     _(t)
#     # FIXME thisis skipped still we do nothing with `t`

#     y = with_el.copy()
y = with_el

In [None]:
# Work on a copy of `y` where the power column is simply called "power".
d = y.rename(columns={C.power: "power"})
_(d)

# Power relative to the smoothed max.
d["power/max"] = d["power"].div(d["smooth_max"])

d.head()

In [None]:
# hacky merge of power/cap and power/elevation
z = d.copy()
threshold = 1
# low_power_cap = (z['power/elevation'] < threshold).astype(float)
# z['target'] = low_power_cap * z['power/elevation'] + (1 - low_power_cap) * z['power/cap']
# z['target'] =

_(z)

In [None]:
# Reshape to long format: one row per (system, timestamp, metric).
data = z.copy()

# The coordinates are not metrics: drop them before stacking the other columns.
wide = data.set_index([C.id, C.date]).drop(columns=[C.lat, C.lon])
long_form = wide.stack().to_frame().reset_index()
# After `reset_index` the stacked column names are in "level_2" and the values in column 0.
stacked = long_form.rename(columns={"level_2": "metric", 0: "value"}).copy()
_(stacked, 10)

In [None]:
# One panel per (day, metric): the value over the time of day, one line per system.
# The y scale is independent for each panel.
metric_lines = (
    alt.Chart(stacked)
    .mark_line()
    .encode(
        x=f"hoursminutes({C.date})",
        y="value",
        row=alt.Row(f"yearmonthdate({C.date})", spacing=1),
        column=alt.Column("metric", spacing=1),
        color=C.id + ":N",
    )
)
chart = metric_lines.properties(width=200, height=75).resolve_scale(y="independent")
display(chart)

In [None]:
data = neighbor_data.copy()
display(data.head())


def plot(ss_id1, ss_id2, dist, max_points=10000):
    """Scatter the power of one PV system against another, one panel per month.

    Reads the notebook-level `data` frame (columns `C.id`, `C.power`, `C.date`).

    Args:
        ss_id1: id of the system shown on the x axis (converted with `int`).
        ss_id2: id of the system shown on the y axis (converted with `int`).
        dist: distance between the two systems; multiplied by 1000 and shown with
            an "m" suffix in the title.
        max_points: maximum number of timestamps to plot. When more are available,
            a random sample of that size is drawn.

    Returns:
        The Altair chart.
    """
    id1, id2 = int(ss_id1), int(ss_id2)

    d = data.loc[data[C.id].isin([id1, id2]), [C.id, C.power, C.date]].copy()
    display(d.head())

    # String ids so that the pivoted columns can be used as chart field names.
    d[C.id] = d[C.id].astype(str)
    # One row per timestamp, one column per system.
    d = pd.pivot(d, index=C.date, columns=C.id, values=C.power).reset_index()
    display(d.head())

    # `DataFrame.sample` raises when asked for more rows than there are,
    # so only sample when there is actually something to cut.
    if len(d) > max_points:
        d = d.sample(max_points)

    chart = (
        alt.Chart(d)
        .mark_circle(size=12, opacity=0.6)
        .encode(
            x=alt.X(str(id1), title=""),
            y=alt.Y(str(id2), title=""),
            # Colour by hour of the day, one facet per month.
            color=alt.Color(f"hours({C.date}):O", scale=alt.Scale(scheme="viridis")),
            facet=alt.Facet(f"month({C.date})", columns=4, spacing=1),
        )
    ).properties(
        width=150,
        height=150,
        title=f"Power of pv={ss_id1} vs pv={ss_id2} (distance={dist * 1000:.0f}m)",
    )
    return chart


plot(ss_id1, ss_id2, 0)

In [None]:
# Power distribution (over a range of days) for each hour of the day,
# with the two selected PV systems overlaid.
n = 200

t = power.loc[[ss_id1, ss_id2]].copy()
# Expose both index levels as regular columns.
t[C.date] = t.index.get_level_values(1)
t[C.id] = t.index.get_level_values(0)

# Keep only days `n - 10` .. `n + 19` of 2020.
when = t[C.date].dt
in_range = (when.year == 2020) & when.dayofyear.isin(list(range(n - 10, n + 20)))
t = t[in_range].reset_index(drop=True)
_(t)

overlaid_histograms = (
    alt.Chart(t)
    .mark_bar(opacity=0.3, binSpacing=0)
    .encode(
        x=alt.X(C.power, bin=alt.Bin(maxbins=50)),
        # `stack=None`: the bars of the two systems overlap instead of stacking.
        y=alt.Y("count()", stack=None, title=""),
        row=alt.Row(f"hours({C.date})", spacing=1),
        color=C.id + ":N",
    )
)
chart = overlaid_histograms.properties(width=150, height=50).resolve_scale(
    y="independent",
)
chart

In [None]:
# For each record of the two selected PV systems, compute "z_": the fraction of
# "similar" records (same system, close time of day, close day of the year) whose
# power is strictly lower than the record's own power.
from tqdm.auto import tqdm

t = power.copy()
t = t.loc[[ss_id1, ss_id2]]
# Expose both index levels as regular columns.
t[C.date] = t.index.get_level_values(1)
t[C.id] = t.index.get_level_values(0)
# Minutes since midnight.
t["time_of_day"] = t[C.date].dt.hour * 60 + t[C.date].dt.minute

# Keep only some days: 2020, days 186 to 223 of the year. This is the 10 days plotted
# afterwards (200..209) plus a 14-day margin on each side for the day window below.
t = t[t[C.date].dt.year == 2020]
t = t.loc[t[C.date].dt.dayofyear.isin(list(range(200 - 14, 200 + 10 + 14)))]

t = t.reset_index(drop=True)
_(t)

# Size of the neighbourhood used to gather similar records.
wide_time_radius = 30  # minutes, exclusive
wide_day_radius = 14  # days, inclusive

# Loop-invariant: computed once instead of on every iteration.
day_of_year = t[C.date].dt.day_of_year

z_values = []
for idx, row in tqdm(t.iterrows(), total=len(t)):
    ts = row[C.date]
    time_of_day = ts.hour * 60 + ts.minute

    # Gather the other values of this system around the same time of day, in the days around.
    similar = t[
        (t[C.id] == row[C.id])
        & (abs(t["time_of_day"] - time_of_day) < wide_time_radius)
        & (abs(day_of_year - ts.day_of_year) <= wide_day_radius)
    ]

    # TODO Do some interpolate instead of just doing the mean.
    # See: https://stackoverflow.com/a/26490248/1067132
    z_values.append((similar[C.power] < row[C.power]).mean())

# `t` has a fresh 0..n-1 index and `z_values` follows the row order, so they line up.
# (A stray `t.loc[idx, "old"]` lookup of a non-existent column used to abort the loop.)
t["z_"] = z_values
_(t)

In [None]:
# Plot the "z_" metric over the time of day, one row per day, one line per system.
tt = t.copy()
tt = tt.set_index([C.id, C.date])
tt = tt.drop(columns=["time_of_day"])
# Long format: one row per (system, timestamp, metric).
tt = (
    tt.stack()
    .to_frame()
    .reset_index(2)
    .rename(columns={"level_2": "metric", 0: "value"})
)
tt = tt.reset_index()


_(tt)

# Keep days 200..209 of the year (the rows around them only served as a margin).
tt = tt.loc[tt[C.date].dt.dayofyear.isin(list(range(200, 200 + 10)))]

tt = tt[tt["metric"] == "z_"]

alt.Chart(tt).mark_line().encode(
    y="value",
    x=f"hoursminutes({C.date})",
    # The `row` channel takes an `alt.Row` (an `alt.Column` was passed here before).
    row=alt.Row(f"yearmonthdate({C.date})", spacing=1),
    column=alt.Column("metric", spacing=1),
    color=C.id + ":N",
).properties(width=400, height=150)

In [None]:
# Scatter matrix of the power of the first 10 systems against each other.
c = data[[C.id, C.power, C.date]].copy()
# String ids so that the pivoted columns can be used as chart field names.
c[C.id] = c[C.id].astype(str)
# One row per timestamp (the index), one column per system.
c = pd.pivot(c, index=C.date, columns=C.id, values=C.power)

display(c.head())

cc = c.iloc[:, :10].copy()
display(cc.head())
ids = list(cc.columns)
print(ids)

# Altair only sees the columns of a DataFrame, not its index: the timestamp has to be
# turned into a column for the `month(...)` colour encoding to find it.
alt.Chart(cc.reset_index()).mark_circle(opacity=0.6, size=5).encode(
    alt.X(alt.repeat("column"), type="quantitative"),
    alt.Y(alt.repeat("row"), type="quantitative"),
    color=alt.Color(f"month({C.date}):N", scale=alt.Scale(scheme="viridis")),
).properties(width=20, height=20).repeat(row=ids, column=ids)

In [None]:
display(meta.set_index("ss_id").loc[26934])
display(meta.set_index("ss_id").loc[4035])

In [None]:
# Pairwise correlation of the power between systems, as one value per pair of ids.
# Note: this rebinds `c`, so the cell cannot be re-run without re-running the cell
# that builds the pivoted `c`.
c = c.corr().stack()
display(c.head())
display(d.head())
# (A `display(dd.head())` used to follow, but `dd` is never assigned in this
# notebook, so it raised NameError.)