In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import requests
import json
from pprint import pprint
from config import gkey

# Load the raw solar dataset (semicolon-delimited CSV) and preview the first rows.
# "ANSI" is only accepted as a codec name on Windows, where it aliases the
# locale-dependent "mbcs" codec; on other platforms pd.read_csv raises
# LookupError. "cp1252" is the explicit, portable name of the Western-European
# Windows code page.
# NOTE(review): assumes the file was saved under a Western (cp1252) Windows locale - confirm.
csv_path = "../data/Stanford_Data.csv"
solar_df = pd.read_csv(csv_path, sep=";", encoding="cp1252")
solar_df.head()

Unnamed: 0.1,Unnamed: 0,tile_count,solar_system_count,total_panel_area,fips,average_household_income,county,education_bachelor,education_college,education_doctoral,...,incentive_count_nonresidential,incentive_residential_state_level,incentive_nonresidential_state_level,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
0,0,0,0,0.0,27145011200,70352.78987,Stearns County,569,1690,13,...,39,11,13,34,0,0,25,12,0,9.46
1,1,25,21,1133.436461,27145011301,61727.0852,Stearns County,674,1434,108,...,39,11,13,34,0,0,25,12,0,9.46
2,2,3,3,64.505776,27145011302,71496.88658,Stearns County,854,1459,31,...,39,11,13,34,0,0,25,12,0,9.46
3,3,0,0,0.0,27145011304,86840.15275,Stearns County,640,1116,68,...,39,11,13,34,0,0,25,12,0,9.46
4,4,5,5,164.583303,27145011400,89135.3156,Stearns County,654,1314,15,...,39,11,13,34,0,0,25,12,0,9.46


In [2]:
# Non-null count per column, smallest first, so the columns with the most NA values appear at the top.
solar_df.count().sort_values()

voting_2012_dem_percentage                 61983
voting_2012_gop_percentage                 61983
cooling_design_temperature                 66735
elevation                                  66735
heating_design_temperature                 66735
earth_temperature_amplitude                66735
frost_days                                 66735
lon                                        66735
air_temperature                            66735
daily_solar_radiation                      66735
atmospheric_pressure                       66735
wind_speed                                 66735
earth_temperature                          66735
heating_degree_days                        66735
cooling_degree_days                        66735
relative_humidity                          66735
lat                                        66735
housing_unit_median_gross_rent             70561
housing_unit_median_value                  70643
mortgage_with_rate                         71401
dropout_16_19_inscho

In [3]:
# Remove the 2012 dem/gop voting-percentage columns: they have the lowest
# non-null count of all columns (61983 in the listing above), i.e. the most NA values.
# drop(..., errors="ignore") is used instead of `del` so the cell can be re-run:
# a second `del` would raise KeyError because the columns are already gone.
solar_df = solar_df.drop(
    columns=["voting_2012_dem_percentage", "voting_2012_gop_percentage"],
    errors="ignore",
)

In [4]:
# Keep only fully populated rows: any row with at least one NA value in the remaining columns is removed.
solar_df = solar_df.dropna(axis=0, how="any")

In [5]:
# Re-check the per-column non-null counts: after the NA rows are dropped every column should report the same number.
solar_df.count().sort_values()

Unnamed: 0                                 63847
age_more_than_85_rate                      63847
age_75_84_rate                             63847
age_35_44_rate                             63847
age_45_54_rate                             63847
age_65_74_rate                             63847
age_55_64_rate                             63847
age_10_14_rate                             63847
age_15_17_rate                             63847
age_5_9_rate                               63847
household_type_family_rate                 63847
dropout_16_19_inschool_rate                63847
occupation_construction_rate               63847
occupation_public_rate                     63847
occupation_information_rate                63847
occupation_finance_rate                    63847
occupation_education_rate                  63847
occupation_administrative_rate             63847
age_25_34_rate                             63847
occupation_manufacturing_rate              63847
age_18_24_rate      

In [6]:
# Write the cleaned frame to CSV: column names go out as the header row (the to_csv default),
# and the row index is not written.
solar_df.to_csv("../data/Cleaned_Data.csv", index=False)



In [7]:
# Re-load the cleaned data written by the export step and preview the first rows.
# DataFrame.to_csv writes UTF-8 unless an encoding is passed, so the file has to be
# decoded as UTF-8 here. Decoding it as "ANSI" would garble any non-ASCII text
# in the data, and "ANSI" is not a valid codec name outside Windows.
csv_path_clean = "../data/Cleaned_Data.csv"
solar_clean_df = pd.read_csv(csv_path_clean, sep=",", encoding="utf-8")
solar_clean_df.head()

Unnamed: 0.1,Unnamed: 0,tile_count,solar_system_count,total_panel_area,fips,average_household_income,county,education_bachelor,education_college,education_doctoral,...,incentive_count_nonresidential,incentive_residential_state_level,incentive_nonresidential_state_level,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
0,0,0,0,0.0,27145011200,70352.78987,Stearns County,569,1690,13,...,39,11,13,34,0,0,25,12,0,9.46
1,2,3,3,64.505776,27145011302,71496.88658,Stearns County,854,1459,31,...,39,11,13,34,0,0,25,12,0,9.46
2,3,0,0,0.0,27145011304,86840.15275,Stearns County,640,1116,68,...,39,11,13,34,0,0,25,12,0,9.46
3,4,5,5,164.583303,27145011400,89135.3156,Stearns County,654,1314,15,...,39,11,13,34,0,0,25,12,0,9.46
4,5,0,0,0.0,27145011500,62225.90361,Stearns County,522,1395,24,...,39,11,13,34,0,0,25,12,0,9.46


In [9]:
# Sanity check on the re-loaded frame: show (rows, columns) to confirm the cleaned file
# is smaller than the raw data now that rows containing NA values have been dropped.
solar_clean_df.shape

(63847, 167)

In [33]:
# Collapse the cleaned rows into one row per (county, state) pair.
# State is part of the key because county names repeat across states
# (e.g. "Adair County", "Adams County").
grouped_location = solar_clean_df.groupby(["county", "state"])

# Per group: total population, plus the plain (unweighted) mean of the row-level lat and lon.
county_df = pd.DataFrame(
    {
        "population": grouped_location["population"].sum(),
        "lat": grouped_location["lat"].mean(),
        "lon": grouped_location["lon"].mean(),
    }
).reset_index()  # move county/state out of the index so they become ordinary columns
county_df

# Follow-up noted by the original author (not done in this cell):
# determine lat/lon for each county and state from geocode







Unnamed: 0,county,state,population,lat,lon
0,Abbeville County,sc,16713,34.301500,-82.426750
1,Acadia Parish,la,44719,30.249556,-92.393000
2,Accomack County,va,33115,37.784250,-75.649750
3,Ada County,id,417501,43.607966,-116.273847
4,Adair County,ia,4561,41.280000,-94.485500
5,Adair County,ky,15219,37.097800,-85.263600
6,Adair County,mo,23169,40.200833,-92.591833
7,Adair County,ok,17149,35.931250,-94.671500
8,Adams County,co,463954,39.872033,-104.928261
9,Adams County,ia,2187,40.969000,-94.804000
