This notebook is for prototyping data preparation for insertion into the database.

# Data for installer table.

Need:
- installer name
- installer primary module manufacurer (e.g. mode of manufacturer name for all installers)

In [49]:
import pandas as pd
import numpy as np

In [240]:
def load_lbnl_data(replace_nans=True):
    df1 = pd.read_csv('data/TTS_LBNL_public_file_10-Dec-2019_p1.csv', encoding='latin-1', low_memory=False)
    df2 = pd.read_csv('data/TTS_LBNL_public_file_10-Dec-2019_p2.csv', encoding='latin-1', low_memory=False)
    lbnl_df = pd.concat([df1, df2], axis=0)
    if replace_nans:
        lbnl_df.replace(-9999, np.nan, inplace=True)
        lbnl_df.replace('-9999', np.nan, inplace=True)
    
    return lbnl_df

In [241]:
lbnl_df = load_lbnl_data(replace_nans=False)
lbnl_df_nonan = load_lbnl_data()

In [242]:
lbnl_df.head()

Unnamed: 0,Data Provider,System ID (from first Data Provider),"System ID (from second Data Provider, if applicable)",System ID (Tracking the Sun),Installation Date,System Size,Total Installed Price,Appraised Value Flag,Sales Tax Cost,Rebate or Grant,...,Inverter Manufacturer #3,Inverter Model #1,Inverter Model #2,Inverter Model #3,Microinverter #1,Microinverter #2,Microinverter #3,System Inverter Capacity,DC Optimizer,Inverter Loading Ratio
0,Arkansas State Energy Office,-9999,-9999,AR_y4H4nGRh77,4/29/2010,2.016,14558.0,False,510.762764,0.0,...,-9999,-9999,-9999,-9999,1,-9999,-9999,1.71,0,1.178947
1,Arkansas State Energy Office,-9999,-9999,AR_J87exIa4x7,4/26/2010,3.36,26096.0,False,851.271273,0.0,...,-9999,-9999,-9999,-9999,1,-9999,-9999,2.85,0,1.178947
2,Arkansas State Energy Office,-9999,-9999,AR_oC05quuYoK,4/20/2010,13.44,91139.0,False,3405.085091,0.0,...,-9999,-9999,-9999,-9999,1,-9999,-9999,11.4,0,1.178947
3,Arkansas State Energy Office,-9999,-9999,AR_rBRCGzrT6C,4/21/2010,5.52,40043.0,False,1398.517091,0.0,...,-9999,-9999,-9999,-9999,1,-9999,-9999,4.56,0,1.210526
4,Arkansas State Energy Office,-9999,-9999,AR_bXvxLeboru,4/22/2010,2.53,21497.0,False,640.987,0.0,...,-9999,-9999,-9999,-9999,1,-9999,-9999,2.09,0,1.210526


In [243]:
lbnl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1543831 entries, 0 to 843830
Data columns (total 60 columns):
 #   Column                                                Non-Null Count    Dtype  
---  ------                                                --------------    -----  
 0   Data Provider                                         1543831 non-null  object 
 1   System ID (from first Data Provider)                  1543831 non-null  object 
 2   System ID (from second Data Provider, if applicable)  1543831 non-null  object 
 3   System ID (Tracking the Sun)                          1543831 non-null  object 
 4   Installation Date                                     1543831 non-null  object 
 5   System Size                                           1543831 non-null  float64
 6   Total Installed Price                                 1543831 non-null  float64
 7   Appraised Value Flag                                  1543831 non-null  bool   
 8   Sales Tax Cost                   

In [244]:
# get mode of module manufacturer #1 for each install company
# doesn't seem to work when -9999 values are replaced with NaNs
manufacturer_modes = lbnl_df[['Installer Name', 'Module Manufacturer #1']].groupby('Installer Name').agg(lambda x: x.value_counts().index[0])

In [245]:
manufacturer_modes.head()

Unnamed: 0_level_0,Module Manufacturer #1
Installer Name,Unnamed: 1_level_1
-9999,-9999
01 electric,Suntech Power
011design,Hanwha SolarOne (Qidong)
0821 abc,MAGE Solar
0light electric system,-9999


In [246]:
lbnl_zip_data = lbnl_df[['Battery System', 'Feed-in Tariff (Annual Payment)', 'Zip Code']].copy()

Relpace missing values with 0 so it doesn't screw up the average calculation.

In [251]:
lbnl_zip_data.replace(-9999, 0, inplace=True)
lbnl_zip_groups = lbnl_zip_data.groupby('Zip Code').mean()

In [252]:
lbnl_zip_groups.head()

Unnamed: 0_level_0,Battery System,Feed-in Tariff (Annual Payment)
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1
85351,0.0,0.0
-9999,0.001072,0.0
501,0.0,0.0
1001,0.0,0.0
1002,0.0,0.0


In [143]:
lbnl_zip_groups.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36744 entries,  85351 to 99403
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Battery System                   36744 non-null  float64
 1   Feed-in Tariff (Annual Payment)  36744 non-null  float64
dtypes: float64(2)
memory usage: 861.2+ KB


Drop missing zip codes.

In [253]:
lbnl_zip_groups = lbnl_zip_groups[~(lbnl_zip_groups.index == '-9999')]

In [254]:
lbnl_zip_groups.reset_index(inplace=True)

In [255]:
lbnl_zip_groups.head()

Unnamed: 0,Zip Code,Battery System,Feed-in Tariff (Annual Payment)
0,85351,0.0,0.0
1,501,0.0,0.0
2,1001,0.0,0.0
3,1002,0.0,0.0
4,1003,0.0,0.0


# Data for the Utility table.

Need:
- zipcode
- utility name
- ownership
- service type

Join EIA-861 report data with EIA IOU rates by zipcode

In [11]:
eia861_df = pd.read_excel('data/Sales_Ult_Cust_2018.xlsx', header=[0, 1, 2])

In [186]:
def load_eia_iou_data():
    iou_df = pd.read_csv('data/iouzipcodes2017.csv')
    noniou_df = pd.read_csv('data/noniouzipcodes2017.csv')
    eia_zipcode_df = pd.concat([iou_df, noniou_df], axis=0)
    
    # zip codes are ints without zero padding
    eia_zipcode_df['zip'] = eia_zipcode_df['zip'].astype('str')
    eia_zipcode_df['zip'] = eia_zipcode_df['zip'].apply(lambda x: x.zfill(5))
    
    return eia_zipcode_df

In [187]:
eia_zip_df = load_eia_iou_data()

In [188]:
eia_zip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86672 entries, 0 to 34073
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   zip           86672 non-null  object 
 1   eiaid         86672 non-null  int64  
 2   utility_name  86672 non-null  object 
 3   state         86672 non-null  object 
 4   service_type  86672 non-null  object 
 5   ownership     86672 non-null  object 
 6   comm_rate     86672 non-null  float64
 7   ind_rate      86672 non-null  float64
 8   res_rate      86672 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 6.6+ MB


In [189]:
# util number here is eiaia in the IOU data
utility_number = eia861_df['Utility Characteristics', 'Unnamed: 1_level_1', 'Utility Number']
utility_name = eia861_df['Utility Characteristics', 'Unnamed: 2_level_1', 'Utility Name']
service_type = eia861_df['Utility Characteristics', 'Unnamed: 4_level_1', 'Service Type']
ownership = eia861_df['Utility Characteristics', 'Unnamed: 7_level_1', 'Ownership']

eia_utility_data = pd.concat([utility_number, utility_name, service_type, ownership], axis=1)
eia_utility_data.columns = eia_utility_data.columns.droplevel(0).droplevel(0)
eia_utility_data.head()

Unnamed: 0,Utility Number,Utility Name,Service Type,Ownership
0,55.0,City of Aberdeen - (MS),Bundled,Municipal
1,59.0,City of Abbeville - (LA),Bundled,Municipal
2,84.0,A & N Electric Coop,Bundled,Cooperative
3,84.0,A & N Electric Coop,Bundled,Cooperative
4,97.0,Adams Electric Coop,Bundled,Cooperative


In [190]:
res_data = eia861_df['RESIDENTIAL'].copy()

In [191]:
res_data.head()

Unnamed: 0_level_0,Revenues,Sales,Customers
Unnamed: 0_level_1,Thousand Dollars,Megawatthours,Count
0,4185.0,37455,2566
1,5572.6,56479,4655
2,279.9,2312,271
3,45278.0,386800,31382
4,17507.0,119449,8664


In [192]:
res_data[res_data['Revenues', 'Thousand Dollars'] == '.']

Unnamed: 0_level_0,Revenues,Sales,Customers
Unnamed: 0_level_1,Thousand Dollars,Megawatthours,Count
25,.,.,.
114,.,.,.
115,.,.,.
227,.,.,.
228,.,.,.
...,...,...,...
3207,.,.,.
3208,.,.,.
3209,.,.,.
3210,.,.,.


Missing data seems to be a period.

In [193]:
res_data.replace('.', np.nan, inplace=True)

In [194]:
for c in res_data.columns:
    print(c)
    res_data[c] = res_data[c].astype('float')

('Revenues', 'Thousand Dollars')
('Sales', 'Megawatthours')
('Customers', 'Count')


In [195]:
res_data['average_yearly_bill'] = res_data['Revenues', 'Thousand Dollars'] * 1000 / res_data['Customers', 'Count']

In [196]:
res_data.head()

Unnamed: 0_level_0,Revenues,Sales,Customers,average_yearly_bill
Unnamed: 0_level_1,Thousand Dollars,Megawatthours,Count,Unnamed: 4_level_1
0,4185.0,37455.0,2566.0,1630.943102
1,5572.6,56479.0,4655.0,1197.121375
2,279.9,2312.0,271.0,1032.841328
3,45278.0,386800.0,31382.0,1442.801606
4,17507.0,119449.0,8664.0,2020.660203


In [197]:
res_data['average_yearly_kwh'] = (res_data['Sales', 'Megawatthours'] * 1000) / res_data['Customers', 'Count']

In [198]:
res_data.head()

Unnamed: 0_level_0,Revenues,Sales,Customers,average_yearly_bill,average_yearly_kwh
Unnamed: 0_level_1,Thousand Dollars,Megawatthours,Count,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4185.0,37455.0,2566.0,1630.943102,14596.64848
1,5572.6,56479.0,4655.0,1197.121375,12132.975295
2,279.9,2312.0,271.0,1032.841328,8531.365314
3,45278.0,386800.0,31382.0,1442.801606,12325.536932
4,17507.0,119449.0,8664.0,2020.660203,13786.819021


Get average bill and kWh used by zip code.

In [199]:
res_columns = ['average_yearly_bill', 'average_yearly_kwh']

In [200]:
res_data.columns = res_data.columns.droplevel(1)

In [201]:
res_data[res_columns].head()

Unnamed: 0,average_yearly_bill,average_yearly_kwh
0,1630.943102,14596.64848
1,1197.121375,12132.975295
2,1032.841328,8531.365314
3,1442.801606,12325.536932
4,2020.660203,13786.819021


In [202]:
eia_861_data = pd.concat([res_data[res_columns], eia_utility_data], axis=1)
eia_861_data.head()

Unnamed: 0,average_yearly_bill,average_yearly_kwh,Utility Number,Utility Name,Service Type,Ownership
0,1630.943102,14596.64848,55.0,City of Aberdeen - (MS),Bundled,Municipal
1,1197.121375,12132.975295,59.0,City of Abbeville - (LA),Bundled,Municipal
2,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative
3,1442.801606,12325.536932,84.0,A & N Electric Coop,Bundled,Cooperative
4,2020.660203,13786.819021,97.0,Adams Electric Coop,Bundled,Cooperative


In [203]:
eia_861_data_zipcode = eia_861_data.merge(eia_zip_df, left_on='Utility Number', right_on='eiaid')

In [204]:
eia_861_data_zipcode.head()

Unnamed: 0,average_yearly_bill,average_yearly_kwh,Utility Number,Utility Name,Service Type,Ownership,zip,eiaid,utility_name,state,service_type,ownership,comm_rate,ind_rate,res_rate
0,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21824,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949
1,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21851,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949
2,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21866,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949
3,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21864,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949
4,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21863,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949


Double-check res_rate

In [205]:
eia_861_data_zipcode['res_rate_recalc'] = eia_861_data_zipcode['average_yearly_bill'] / eia_861_data_zipcode['average_yearly_kwh']

In [206]:
eia_861_data_zipcode.head()

Unnamed: 0,average_yearly_bill,average_yearly_kwh,Utility Number,Utility Name,Service Type,Ownership,zip,eiaid,utility_name,state,service_type,ownership,comm_rate,ind_rate,res_rate,res_rate_recalc
0,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21824,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949,0.121064
1,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21851,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949,0.121064
2,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21866,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949,0.121064
3,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21864,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949,0.121064
4,1032.841328,8531.365314,84.0,A & N Electric Coop,Bundled,Cooperative,21863,84,A & N Electric Coop,MD,Bundled,Cooperative,0.115607,0.0,0.114949,0.121064


In [207]:
eia_861_data_zipcode.drop_duplicates(inplace=True)

In [208]:
eia_861_data_zipcode.tail()

Unnamed: 0,average_yearly_bill,average_yearly_kwh,Utility Number,Utility Name,Service Type,Ownership,zip,eiaid,utility_name,state,service_type,ownership,comm_rate,ind_rate,res_rate,res_rate_recalc
159481,911.89077,6462.641668,57483.0,Liberty Utilities,Bundled,Investor Owned,96118,57483,Liberty Utilities,CA,Bundled,Investor Owned,0.137446,0.0,0.137887,0.141102
159482,911.89077,6462.641668,57483.0,Liberty Utilities,Bundled,Investor Owned,96111,57483,Liberty Utilities,CA,Bundled,Investor Owned,0.137446,0.0,0.137887,0.141102
159483,911.89077,6462.641668,57483.0,Liberty Utilities,Bundled,Investor Owned,96125,57483,Liberty Utilities,CA,Bundled,Investor Owned,0.137446,0.0,0.137887,0.141102
159484,911.89077,6462.641668,57483.0,Liberty Utilities,Bundled,Investor Owned,96124,57483,Liberty Utilities,CA,Bundled,Investor Owned,0.137446,0.0,0.137887,0.141102
159485,911.89077,6462.641668,57483.0,Liberty Utilities,Bundled,Investor Owned,96126,57483,Liberty Utilities,CA,Bundled,Investor Owned,0.137446,0.0,0.137887,0.141102


In [209]:
eia_861_data_zipcode.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152322 entries, 0 to 159485
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   average_yearly_bill  143449 non-null  float64
 1   average_yearly_kwh   143449 non-null  float64
 2   Utility Number       152322 non-null  float64
 3   Utility Name         152322 non-null  object 
 4   Service Type         152322 non-null  object 
 5   Ownership            152322 non-null  object 
 6   zip                  152322 non-null  object 
 7   eiaid                152322 non-null  int64  
 8   utility_name         152322 non-null  object 
 9   state                152322 non-null  object 
 10  service_type         152322 non-null  object 
 11  ownership            152322 non-null  object 
 12  comm_rate            152322 non-null  float64
 13  ind_rate             152322 non-null  float64
 14  res_rate             152322 non-null  float64
 15  res_rate_recalc  

# Join project solar, ACS, EIA, and LBNL data to get main table

Try and save all of required data from bigquery.

In [94]:
# Set up GCP and AWS bucket APIs
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()

In [95]:
# ACS US census data
ACS_DB = '`bigquery-public-data`.census_bureau_acs'
ACS_TABLE = 'zip_codes_2017_5yr'

# project sunroof
PSR_DB = '`bigquery-public-data`.sunroof_solar'
PSR_TABLE = 'solar_potential_by_postal_code'

In [100]:
# columns to keep from ACS data
ACS_COLS = ['geo_id',  # zipcode
            'median_age',
            'housing_units',
            'median_income',
            'owner_occupied_housing_units',
            'occupied_housing_units',
            # housing units which will be used to calculate total single-family homes
           'dwellings_1_units_detached',
           'dwellings_1_units_attached',
           'dwellings_2_units',
           'dwellings_3_to_4_units',
           'bachelors_degree_2',
           'different_house_year_ago_different_city',
           'different_house_year_ago_same_city']

In [101]:
query = """SELECT {} FROM {}.{} LIMIT 20;""".format(', '.join(ACS_COLS), ACS_DB, ACS_TABLE)
acs_df = pd.read_gbq(query)
acs_df

Downloading: 100%|██████████| 20/20 [00:00<00:00, 120.87rows/s]


Unnamed: 0,geo_id,median_age,housing_units,median_income,owner_occupied_housing_units,occupied_housing_units,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,bachelors_degree_2,different_house_year_ago_different_city,different_house_year_ago_same_city
0,55111,63.4,13.0,,0.0,13.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
1,61112,89.5,100.0,,0.0,100.0,0.0,0.0,0.0,0.0,7.0,68.0,0.0
2,33856,72.0,501.0,24617.0,0.0,273.0,0.0,8.0,63.0,143.0,21.0,39.0,0.0
3,32079,79.5,291.0,32813.0,25.0,220.0,80.0,6.0,6.0,79.0,131.0,45.0,12.0
4,85633,28.8,33.0,,7.0,16.0,26.0,2.0,0.0,0.0,11.0,27.0,0.0
5,59465,62.5,155.0,33750.0,8.0,15.0,144.0,0.0,4.0,0.0,4.0,0.0,0.0
6,40041,81.6,293.0,21212.0,51.0,248.0,19.0,0.0,0.0,0.0,35.0,50.0,58.0
7,49074,83.1,91.0,19479.0,0.0,59.0,0.0,0.0,0.0,0.0,17.0,6.0,12.0
8,30289,67.0,23.0,22250.0,9.0,23.0,23.0,0.0,0.0,0.0,1.0,4.0,0.0
9,63745,56.9,14.0,35000.0,6.0,12.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0


In [106]:
query = f"""SELECT   geo_id,
                    median_age,
                    housing_units,
                    median_income,
                    owner_occupied_housing_units,
                    occupied_housing_units,
                    dwellings_1_units_detached + dwellings_1_units_attached + dwellings_2_units + dwellings_3_to_4_units AS family_homes,
                    bachelors_degree_2,
                    different_house_year_ago_different_city + different_house_year_ago_same_city AS moved_recently
                    FROM {ACS_DB}.{ACS_TABLE}
                    LIMIT 10;"""

test_df = pd.read_gbq(query)
test_df

Downloading: 100%|██████████| 10/10 [00:00<00:00, 55.20rows/s]


Unnamed: 0,geo_id,median_age,housing_units,median_income,owner_occupied_housing_units,occupied_housing_units,family_homes,bachelors_degree_2,moved_recently
0,55111,63.4,13.0,,0.0,13.0,0.0,0.0,4.0
1,61112,89.5,100.0,,0.0,100.0,0.0,7.0,68.0
2,33856,72.0,501.0,24617.0,0.0,273.0,214.0,21.0,39.0
3,32079,79.5,291.0,32813.0,25.0,220.0,171.0,131.0,57.0
4,85633,28.8,33.0,,7.0,16.0,28.0,11.0,27.0
5,59465,62.5,155.0,33750.0,8.0,15.0,148.0,4.0,0.0
6,40041,81.6,293.0,21212.0,51.0,248.0,19.0,35.0,108.0
7,49074,83.1,91.0,19479.0,0.0,59.0,0.0,17.0,18.0
8,30289,67.0,23.0,22250.0,9.0,23.0,23.0,1.0,4.0
9,63745,56.9,14.0,35000.0,6.0,12.0,14.0,1.0,0.0


In [129]:
acs_data_query = f"""SELECT   geo_id,
                    median_age,
                    housing_units,
                    median_income,
                    owner_occupied_housing_units,
                    occupied_housing_units,
                    dwellings_1_units_detached + dwellings_1_units_attached + dwellings_2_units + dwellings_3_to_4_units AS family_homes,
                    bachelors_degree_2,
                    different_house_year_ago_different_city + different_house_year_ago_same_city AS moved_recently
                    FROM {ACS_DB}.{ACS_TABLE}"""

acs_data = pd.read_gbq(acs_data_query)


Downloading:   0%|          | 0/33120 [00:00<?, ?rows/s][A
Downloading: 100%|██████████| 33120/33120 [00:02<00:00, 11087.15rows/s][A


In [130]:
acs_data.to_csv('data/acs_data.csv', index=False)

In [131]:
acs_data.shape

(33120, 9)

In [132]:
acs_data.head()

Unnamed: 0,geo_id,median_age,housing_units,median_income,owner_occupied_housing_units,occupied_housing_units,family_homes,bachelors_degree_2,moved_recently
0,55111,63.4,13.0,,0.0,13.0,0.0,0.0,4.0
1,61112,89.5,100.0,,0.0,100.0,0.0,7.0,68.0
2,33856,72.0,501.0,24617.0,0.0,273.0,214.0,21.0,39.0
3,32079,79.5,291.0,32813.0,25.0,220.0,171.0,131.0,57.0
4,85633,28.8,33.0,,7.0,16.0,28.0,11.0,27.0


Project sunroof data

In [112]:
psr_cols = ['region_name',
            'percent_covered',
            'percent_qualified',
           'number_of_panels_total',
           'kw_median',
           'count_qualified',
           'existing_installs_count']

In [113]:
psr_query = f"""SELECT region_name,
                    percent_covered,
                    percent_qualified,
                    number_of_panels_total,
                    kw_median,
                    (count_qualified - existing_installs_count) AS potential_installs
                    FROM {PSR_DB}.{PSR_TABLE}
                    LIMIT 10;
                    """

test_df = pd.read_gbq(psr_query)
test_df

Downloading: 100%|██████████| 10/10 [00:00<00:00, 70.47rows/s]


Unnamed: 0,region_name,percent_covered,percent_qualified,number_of_panels_total,kw_median,potential_installs
0,20303,33.333333,0.0,,,0
1,21532,0.020812,0.0,,,0
2,24069,0.090253,0.0,,,0
3,24724,62.5,0.0,,,0
4,28357,0.086133,0.0,,,0
5,29058,0.289855,0.0,,,0
6,33849,0.529101,0.0,,,0
7,37015,0.101092,0.0,,,0
8,43002,100.0,0.0,,,0
9,48674,100.0,0.0,,,0


In [114]:
psr_query = f"""SELECT region_name,
                    percent_covered,
                    percent_qualified,
                    number_of_panels_total,
                    kw_median,
                    (count_qualified - existing_installs_count) AS potential_installs
                    FROM {PSR_DB}.{PSR_TABLE};
                    """

psr_df = pd.read_gbq(psr_query)

Downloading: 100%|██████████| 11516/11516 [00:00<00:00, 12087.98rows/s]


In [115]:
psr_df.to_csv('data/psr_data.csv')

In [116]:
psr_df.head()

Unnamed: 0,region_name,percent_covered,percent_qualified,number_of_panels_total,kw_median,potential_installs
0,20303,33.333333,0.0,,,0
1,21532,0.020812,0.0,,,0
2,24069,0.090253,0.0,,,0
3,24724,62.5,0.0,,,0
4,28357,0.086133,0.0,,,0


# Join data for main data table

In [256]:
psr_acs = psr_df.merge(acs_data, left_on='region_name', right_on='geo_id', how='outer')

In [257]:
psr_acs.head()

Unnamed: 0,region_name,percent_covered,percent_qualified,number_of_panels_total,kw_median,potential_installs,geo_id,median_age,housing_units,median_income,owner_occupied_housing_units,occupied_housing_units,family_homes,bachelors_degree_2,moved_recently
0,20303,33.333333,0.0,,,0.0,,,,,,,,,
1,21532,0.020812,0.0,,,0.0,21532.0,31.8,6756.0,43622.0,3479.0,5715.0,5675.0,1003.0,3130.0
2,24069,0.090253,0.0,,,0.0,24069.0,50.1,866.0,41250.0,640.0,761.0,608.0,104.0,253.0
3,24724,62.5,0.0,,,0.0,24724.0,43.5,72.0,53466.0,30.0,39.0,70.0,27.0,6.0
4,28357,0.086133,0.0,,,0.0,28357.0,43.7,1151.0,30869.0,748.0,983.0,624.0,170.0,45.0


In [258]:
psr_acs_lbnl = psr_acs.merge(lbnl_zip_groups, left_on='region_name', right_on='Zip Code', how='outer')

In [259]:
psr_acs_lbnl_eia = psr_acs_lbnl.merge(eia_861_data_zipcode, left_on='region_name', right_on='zip', how='outer')

In [260]:
psr_acs_lbnl_eia.head()

Unnamed: 0,region_name,percent_covered,percent_qualified,number_of_panels_total,kw_median,potential_installs,geo_id,median_age,housing_units,median_income,...,zip,eiaid,utility_name,state,service_type,ownership,comm_rate,ind_rate,res_rate,res_rate_recalc
0,20303,33.333333,0.0,,,0.0,,,,,...,20303,15270.0,Potomac Electric Power Co,DC,Bundled,Investor Owned,0.120417,0.0,0.12214,0.121101
1,20303,33.333333,0.0,,,0.0,,,,,...,20303,15270.0,Potomac Electric Power Co,DC,Delivery,Investor Owned,0.045454,0.014391,0.043327,0.121101
2,20303,33.333333,0.0,,,0.0,,,,,...,20303,15270.0,Potomac Electric Power Co,DC,Bundled,Investor Owned,0.120417,0.0,0.12214,0.14655
3,20303,33.333333,0.0,,,0.0,,,,,...,20303,15270.0,Potomac Electric Power Co,DC,Delivery,Investor Owned,0.045454,0.014391,0.043327,0.14655
4,20303,33.333333,0.0,,,0.0,,,,,...,20303,15270.0,Potomac Electric Power Co,DC,Bundled,Investor Owned,0.120417,0.0,0.12214,0.042477


In [261]:
psr_acs_lbnl_eia.columns

Index(['region_name', 'percent_covered', 'percent_qualified',
       'number_of_panels_total', 'kw_median', 'potential_installs', 'geo_id',
       'median_age', 'housing_units', 'median_income',
       'owner_occupied_housing_units', 'occupied_housing_units',
       'family_homes', 'bachelors_degree_2', 'moved_recently', 'Zip Code',
       'Battery System', 'Feed-in Tariff (Annual Payment)',
       'average_yearly_bill', 'average_yearly_kwh', 'Utility Number',
       'Utility Name', 'Service Type', 'Ownership', 'zip', 'eiaid',
       'utility_name', 'state', 'service_type', 'ownership', 'comm_rate',
       'ind_rate', 'res_rate', 'res_rate_recalc'],
      dtype='object')

In [262]:
psr_acs_lbnl_eia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206079 entries, 0 to 206078
Data columns (total 34 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   region_name                      42105 non-null   object 
 1   percent_covered                  42105 non-null   float64
 2   percent_qualified                42105 non-null   float64
 3   number_of_panels_total           42020 non-null   float64
 4   kw_median                        42020 non-null   float64
 5   potential_installs               42105 non-null   float64
 6   geo_id                           62530 non-null   object 
 7   median_age                       61729 non-null   float64
 8   housing_units                    62530 non-null   float64
 9   median_income                    59662 non-null   float64
 10  owner_occupied_housing_units     62530 non-null   float64
 11  occupied_housing_units           62530 non-null   float64
 12  fa

Looks like we have a lot of missing data.  Combine the zip code columns to have one zip column with no missing data.

In [265]:
def fill_zips(x):
    if not pd.isna(x['zip']):
        return x['zip']
    elif not pd.isna(x['Zip Code']):
        return x['Zip Code']
    elif not pd.isna(x['geo_id']):
        return x['geo_id']
    elif not pd.isna(x['region_name']):
        return x['region_name']
    else:
        return np.nan

In [266]:
psr_acs_lbnl_eia['full_zip'] = psr_acs_lbnl_eia.apply(fill_zips, axis=1)

In [267]:
# columns we'll use in the same order as the DB table
cols_to_use = ['full_zip',
               'percent_qualified',
              'number_of_panels_total',
              'kw_median',
              'potential_installs',
              'median_income',
              'median_age',
              'occupied_housing_units',
              'owner_occupied_housing_units',
              'family_homes',
              'bachelors_degree_2',
              'moved_recently',
              'average_yearly_bill',
              'average_yearly_kwh',
               # note: installer ID has to be gotten from the installer table
              'Battery System',
              'Feed-in Tariff (Annual Payment)']

In [268]:
df_to_write = psr_acs_lbnl_eia[cols_to_use]
df_to_write.head()

Unnamed: 0,full_zip,percent_qualified,number_of_panels_total,kw_median,potential_installs,median_income,median_age,occupied_housing_units,owner_occupied_housing_units,family_homes,bachelors_degree_2,moved_recently,average_yearly_bill,average_yearly_kwh,Battery System,Feed-in Tariff (Annual Payment)
0,20303,0.0,,,0.0,,,,,,,,1039.0712,8580.181187,,
1,20303,0.0,,,0.0,,,,,,,,1039.0712,8580.181187,,
2,20303,0.0,,,0.0,,,,,,,,1645.075807,11225.393497,,
3,20303,0.0,,,0.0,,,,,,,,1645.075807,11225.393497,,
4,20303,0.0,,,0.0,,,,,,,,582.926722,13723.320941,,


In [269]:
df_to_write.describe()

Unnamed: 0,percent_qualified,number_of_panels_total,kw_median,potential_installs,median_income,median_age,occupied_housing_units,owner_occupied_housing_units,family_homes,bachelors_degree_2,moved_recently,average_yearly_bill,average_yearly_kwh,Battery System,Feed-in Tariff (Annual Payment)
count,42105.0,42020.0,42020.0,42105.0,59662.0,61729.0,62530.0,62530.0,62530.0,62399.0,62399.0,143459.0,143459.0,54245.0,54245.0
mean,80.851591,364592.3,18.5797,4582.85586,63760.27334,41.128436,5673.529618,3586.362002,4800.618663,2108.36406,2092.204571,1253.007504,11091.514785,0.008129,121.9096
std,13.582837,373772.6,81.714602,4419.226092,28655.14151,8.09553,6109.409421,3733.011956,4996.053298,2717.175284,2668.287756,442.400195,3662.634068,0.078891,22653.6
min,0.0,8.0,2.0,0.0,2499.0,3.9,0.0,0.0,0.0,0.0,0.0,150.0,2597.014925,0.0,0.0
25%,73.303167,52971.5,7.5,623.0,44712.0,36.1,642.0,480.0,650.0,137.0,160.0,942.086883,8459.33039,0.0,0.0
50%,82.763701,251438.0,9.75,3527.0,57185.0,40.9,3322.0,2267.0,2969.0,976.0,956.0,1251.744889,10856.809831,0.0,0.0
75%,90.66704,566164.0,12.75,7271.0,76234.25,45.3,9315.0,5772.0,7848.0,3240.5,3203.0,1550.295733,13705.550305,0.0,0.0
max,100.0,3279590.0,4787.75,31713.0,250001.0,89.5,42546.0,29237.0,36107.0,25738.0,27633.0,3546.875,31896.48623,1.0,5272400.0


In [270]:
df_to_write.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206079 entries, 0 to 206078
Data columns (total 16 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   full_zip                         206079 non-null  object 
 1   percent_qualified                42105 non-null   float64
 2   number_of_panels_total           42020 non-null   float64
 3   kw_median                        42020 non-null   float64
 4   potential_installs               42105 non-null   float64
 5   median_income                    59662 non-null   float64
 6   median_age                       61729 non-null   float64
 7   occupied_housing_units           62530 non-null   float64
 8   owner_occupied_housing_units     62530 non-null   float64
 9   family_homes                     62530 non-null   float64
 10  bachelors_degree_2               62399 non-null   float64
 11  moved_recently                   62399 non-null   float64
 12  av

That's a lot of missing data.

In [272]:
df_to_write.to_csv('data/solar_metrics_data.csv', index=False)