# Tracking the Sun 2019

OEDI Berkeley Lab’s *Tracking the Sun* report series is dedicated to summarizing installed prices and other trends among grid-connected, distributed solar photovoltaic (PV) systems in the United States. For more information, please refer to https://emp.lbl.gov/tracking-the-sun/

## 0. Build Database Connection

PyAthena is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.
https://github.com/laughingman7743/PyAthena

In [1]:
from pyathena.connection import Connection
from pyathena.pandas_cursor import PandasCursor

In [7]:
# Athena settings shared by the cells below.
AWS_REGION_NAME = "us-west-2"  # passed to Connection(region_name=...)
DATABASE_NAME = "oedi"  # database qualifier used in the SQL statements
TABLE_NAME = "oedi_tracking_the_sun_2019"  # table queried throughout the notebook
S3_STAGING_DIR = "s3://oedi-staging/tracking-the-sun"  # passed to Connection(s3_staging_dir=...)

In [3]:
# Default PyAthena cursor; used for the metadata statements
# (DESCRIBE / SHOW PARTITIONS) whose rows are read with fetchall().
cursor = Connection(
    region_name=AWS_REGION_NAME,
    s3_staging_dir=S3_STAGING_DIR,
).cursor()

In [4]:
# Cursor whose results can be turned into a DataFrame with .as_pandas();
# used for the analytical queries.
pandas_cursor = Connection(
    region_name=AWS_REGION_NAME,
    s3_staging_dir=S3_STAGING_DIR,
).cursor(PandasCursor)

## 1. Retrieve Table Metadata

### 1.1 Columns
Retrieve the schema

In [5]:
import pandas as pd

In [8]:
# Retrieve column information.
# Every fetched row is a tuple whose first element is one tab-separated line of
# the DESCRIBE output; split it into fields and trim the padding around each.
result = cursor.execute(f"DESCRIBE {DATABASE_NAME}.{TABLE_NAME}")
columns = [
    list(map(str.strip, line.split("\t")))
    for line, *_ in result.fetchall()
]
pd.DataFrame(columns, columns=["NAME", "TYPE", "FROM"])

Unnamed: 0,NAME,TYPE,FROM
0,data_provider,string,
1,system_id_from_first_data_provider,string,
2,system_id_from_second_data_provider_if_applicable,string,
3,system_id_tracking_the_sun,string,
4,installation_date,date,
...,...,...,...
60,,,
61,# Partition Information,,
62,# col_name,data_type,comment
63,,,


### 1.2 Partitions
Retrieve the partitions; the partition key is *state*.

In [9]:
# Retrieve partition information
# Each fetched row is printed as-is (a one-element tuple such as ('state=PA',)).
result = cursor.execute(f"SHOW PARTITIONS {DATABASE_NAME}.{TABLE_NAME}")
for row in result.fetchall():
    print(row)

('state=PA',)
('state=UT',)
('state=VT',)
('state=CA',)
('state=KS',)
('state=AZ',)
('state=NH',)
('state=RI',)
('state=DE',)
('state=WA',)
('state=MA',)
('state=NM',)
('state=MO',)
('state=OH',)
('state=MD',)
('state=IL',)
('state=WI',)
('state=DC',)
('state=TX',)
('state=NJ',)
('state=NY',)
('state=MN',)
('state=CT',)
('state=FL',)
('state=CO',)
('state=OR',)
('state=ME',)
('state=AR',)


## 2. PV System Installation Trend
Visualize the number of PV system installations per state and year as a grid plot (heatmap).

In [10]:
# bokeh
from bokeh.io import output_notebook
from bokeh.models import LinearColorMapper, PrintfTickFormatter, ColorBar, LogColorMapper, LogTicker
from bokeh.plotting import figure, show
output_notebook()

In [11]:
# Count the rows per (state, installation year) and load the result as a
# DataFrame. The year is cast to text in SQL, so the `year` column holds
# strings rather than numbers.
pv_state_year = pandas_cursor.execute(
    f"""
    SELECT state, CAST(YEAR(installation_date) AS VARCHAR(4)) AS year, COUNT(*) as count
    FROM {DATABASE_NAME}.{TABLE_NAME}
    GROUP BY CAST(YEAR(installation_date) AS VARCHAR(4)), state;
    """
).as_pandas()

In [12]:
# Axis factors: years ascending along x; states in reverse alphabetical order
# (categorical y factors are laid out bottom-to-top, so the alphabetically
# first state ends up at the top of the chart).
years = sorted(pv_state_year["year"].unique())
states = sorted(pv_state_year["state"].unique(), reverse=True)

ifig = figure(
    title="Heatmap of State PV Systems",
    x_range=years,
    y_range=states,
    # `height` works on old and new Bokeh releases; the former `plot_height`
    # keyword was removed in Bokeh 3.0.
    height=350,
    sizing_mode="scale_width",
    x_axis_location="above",
    toolbar_location="below",
    tooltips=[("state", "@state"), ("year", "@year"), ("count", "@count")],
)

# Counts are mapped to colours on a log scale spanning the observed range.
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LogColorMapper(
    palette=colors,
    low=pv_state_year["count"].min(),
    high=pv_state_year["count"].max(),
)

# One unit-sized cell per (year, state) row of the query result.
ifig.rect(
    x="year",
    y="state",
    width=1,
    height=1,
    source=pv_state_year,
    fill_color={"field": "count", "transform": mapper},
    line_color=None,
)

# Strip grid lines, axis lines and ticks so only the coloured cells remain.
ifig.grid.grid_line_color = None
ifig.axis.axis_line_color = None
ifig.axis.major_tick_line_color = None
ifig.axis.major_label_text_font_size = "6pt"
ifig.axis.major_label_standoff = 0

# Legend for the colour scale, with log-spaced ticks printed as integers.
color_bar = ColorBar(
    color_mapper=mapper,
    major_label_text_font_size="6pt",
    ticker=LogTicker(),
    formatter=PrintfTickFormatter(format="%d"),
    label_standoff=6,
    border_line_color=None,
    location=(0, 0),
)
ifig.add_layout(color_bar, "right")

show(ifig)

## 3. PV System Unit Price Trend
Visualize the average unit installed price of PV systems over the years, state by state.

In [13]:
import numpy as np
from ipywidgets import interact
from bokeh.core.properties import value
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap

In [14]:
# Load one row per system with its size, total price and the derived unit price
# (total_installed_price / system_size, rounded to 2 decimals).
# Rows whose price or size equals -9999 are excluded (presumably the dataset's
# missing-value marker -- confirm against the data dictionary), as are rows with
# system_size = 0, which would otherwise divide by zero.
pv_price = pandas_cursor.execute(
    f"""
    SELECT 
        state,
        CAST(YEAR(installation_date) AS VARCHAR(4)) AS year,
        system_size,
        total_installed_price,
        ROUND((total_installed_price / system_size), 2) AS unit_installed_price,
        customer_segment
    FROM {DATABASE_NAME}.{TABLE_NAME}
    WHERE total_installed_price != -9999
    AND system_size != -9999 
    AND system_size != 0
    """
).as_pandas()

In [15]:
# Preview the first rows of the price query result.
pv_price.head()

Unnamed: 0,state,year,system_size,total_installed_price,unit_installed_price,customer_segment
0,AR,2010,2.016,14558.0,7221.23,RES
1,AR,2010,3.36,26096.0,7766.67,RES
2,AR,2010,13.44,91139.0,6781.18,RES
3,AR,2010,5.52,40043.0,7254.17,RES
4,AR,2010,2.53,21497.0,8496.84,RES


In [16]:
# Sorted state codes; used as the default of show_pv_price_trend's `state`
# argument.
states = sorted(pv_price["state"].unique())
# Distinct customer segments present in the query result.
customers = pv_price["customer_segment"].unique()

In [17]:
@interact
def show_pv_price_trend(state=states):
    """Plot the yearly mean unit installed price for one state.

    The function is wrapped by ``ipywidgets.interact``; giving the ``states``
    list as the default turns the argument into a dropdown of state codes.

    For the selected state, ``unit_installed_price`` is averaged per
    installation year and customer segment. The result is drawn as grouped
    bars (R = residential, N = non-residential) for the years 1998-2018, with
    one trend line per segment on top.

    Args:
        state: State code to plot; compared against ``pv_price["state"]``.

    Returns:
        None. The figure is displayed with ``bokeh.plotting.show``.
    """
    segments = ["RES", "NON-RES"]  # customer_segment values that are plotted
    categories = ["R", "N"]  # short bar labels, in the same order as `segments`
    colors = ["#718dbf", "#e84d60"]  # one colour per category
    year_index = pd.Index([str(year) for year in range(1998, 2019)], name="year")

    # filter
    state_pv_price = pv_price[pv_price["state"] == state]

    # Average only the price column: calling mean() on the whole frame would
    # also try to average the text `state` column, which raises on
    # pandas >= 2.0. reindex() then pins the table to the fixed year range and
    # to the two plotted segments, filling whatever is missing with NaN.
    customer_pv_price = (
        state_pv_price
        .groupby(["customer_segment", "year"])["unit_installed_price"]
        .mean()
        .round(2)
        .unstack(0)
        .reindex(index=year_index, columns=segments)
    )

    # plots
    years = customer_pv_price.index.values
    res_prices = customer_pv_price["RES"]
    non_res_prices = customer_pv_price["NON-RES"]

    # Nested (year, category) factors, and the prices interleaved in the same
    # order: R, N, R, N, ...
    x = [(year, category) for year in years for category in categories]
    prices = [price for pair in zip(res_prices, non_res_prices) for price in pair]

    fig = figure(
        title="PV Unit Price Trend",
        x_range=FactorRange(*x),
        # `height` works on old and new Bokeh releases; the former
        # `plot_height` keyword was removed in Bokeh 3.0.
        height=300,
        sizing_mode="scale_width",
        tools="hover",
        tooltips="@prices",
        toolbar_location="below",
    )

    source = ColumnDataSource(data=dict(x=x, prices=prices))
    # start=1, end=2 makes the mapper look only at the category part of each
    # (year, category) factor when choosing the colour.
    category_color = factor_cmap("x", palette=colors, factors=categories, start=1, end=2)
    fig.vbar(
        source=source,
        x="x",
        top="prices",
        width=0.9,
        fill_color=category_color,
        line_color=category_color,
        alpha=0.7,
    )

    # Trend lines are positioned by year only (the outer level of the nested
    # factors).
    fig.line(x=years, y=res_prices, line_width=2, line_color="#718dbf")
    fig.line(x=years, y=non_res_prices, line_width=2, line_color="#e84d60")

    fig.xgrid.grid_line_color = None
    show(fig)

interactive(children=(Dropdown(description='state', options=('AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'IL', '…

R: Residential <br>
N: Non-residential