# Prepare solar data 

* Read in libraries
* Read in data
* Narrow down to analysis data
* Produce a dataset summarizing key variables by year-month and customer type
* Produce a dataset summarizing key variables by zip code
* Identify the top 10 solar installers in NY


In [1]:
# Import Libraries
import pandas as pd 
import matplotlib.pyplot as plt
import os
from datetime import datetime
%config IPCompleter.greedy=True

In [4]:
#02 READ IN DATA

# Location of the NY solar data (CSV download from data.ny.gov).
path = "https://data.ny.gov/api/views/3x8r-34rs/rows.csv?accessType=DOWNLOAD"
# Alternative local copy of the raw file (disabled):
#path = "../Resources/Solar_Installation_Data Raw.csv"

# Load the CSV into a DataFrame and preview the first five rows.
solar = pd.read_csv(filepath_or_buffer=path)
solar.head(n=5)



Unnamed: 0,Reporting Period,Project Number,City,County,State,Zip Code,Sector,Program Type,Solicitation,Electric Utility,...,Total PV Module Quantity,Project Cost,$Incentive,Total Nameplate kW DC,Expected KWh Annual Production,Remote Net Metering,Affordable Solar,Community Distributed Generation,Green Jobs Green New York Participant,Location 1
0,5/31/2018,4260-88892,Three Mile Bay,Jefferson,NY,13693,Residential,Residential/Small Commercial,PON 2112,National Grid,...,32.0,29680.0,4240.0,8.8,10330,No,No,No,,"Three Mile\nBay, NY 13693\n"
1,5/31/2018,5098-68726,North Babylon,Suffolk,NY,11703,Residential,Residential/Small Commercial,PON 2112LI,PSEG Long Island,...,25.0,20800.0,886.0,6.15,7219,No,No,No,,"North Babylon, NY 11703\n(40.733915, -73.323802)"
2,5/31/2018,89350,Queens,Queens,NY,11434,Residential,Residential/Small Commercial,PON 2112,Consolidated Edison,...,21.0,30791.25,2898.0,7.25,8504,,No,No,,"Queens, NY 11434\n(40.676003, -73.775656)"
3,5/31/2018,93198,Bronx,Bronx,NY,10456,Residential,Residential/Small Commercial,PON 2112,Consolidated Edison,...,14.0,13906.0,1848.0,4.62,5423,,No,No,,"Bronx, NY 10456\n(40.830652, -73.908566)"
4,5/31/2018,94091,Brooklyn,Kings,NY,11220,Residential,Residential/Small Commercial,PON 2112,Consolidated Edison,...,84.0,64300.0,2117.0,5.29,6212,,No,No,Residential,"Brooklyn, NY 11220\n(40.640771, -74.016133)"


In [5]:
#03 DATA CLEANING

#NARROW TO RELEVANT VARIABLES
# .copy() gives an independent DataFrame, so the column assignments below do
# not write into a slice of the raw data (avoids SettingWithCopyWarning).
solar = solar[["Project Number", "City", "Zip Code",
               "Sector", "Electric Utility",
               "Purchase Type", "Project Status",
               "Date Application Received", "Date Completed",
               "Project Cost", "$Incentive", "Total Nameplate kW DC",
               "Expected KWh Annual Production", "Contractor",
               "Primary PV Module Manufacturer", "Location 1"]].copy()

#Identify Date variables
# Only 'Date Completed' is parsed to datetime; 'Date Application Received'
# is left as the original text.
solar['Date Completed'] = pd.to_datetime(solar['Date Completed'])

#Create additional variables
# Cost per watt: project cost divided by nameplate capacity (kW), then by
# 1000 to convert kW to W. The second variable nets out the incentive first.
solar['$_per_watt'] = solar['Project Cost'] / solar['Total Nameplate kW DC'] / 1000
solar['$_per_watt_wincentives'] = (
    (solar['Project Cost'] - solar['$Incentive'])
    / solar['Total Nameplate kW DC'] / 1000
)

#Narrow down to projects that are completed
# Copy again so later cells can add columns without chained-assignment warnings.
solar = solar.loc[solar["Project Status"] == "Complete"].copy()

#Missing values
# NOTE: the previous `solar.dropna(how = 'any')` call discarded its result, so
# no rows were ever removed. Rows with missing values are intentionally kept
# here to preserve the existing dataset; to really drop them, assign the
# result back: solar = solar.dropna(how='any')

#Convert zip code to string for merge
# NOTE: Series.to_string() renders the whole column as ONE string; if string
# zip codes are needed, use astype(str) instead:
#solar['Zip Code'] = solar['Zip Code'].astype(str)
solar.head()

Unnamed: 0,Project Number,City,Zip Code,Sector,Electric Utility,Purchase Type,Project Status,Date Application Received,Date Completed,Project Cost,$Incentive,Total Nameplate kW DC,Expected KWh Annual Production,Contractor,Primary PV Module Manufacturer,Location 1,$_per_watt,$_per_watt_wincentives
0,4260-88892,Three Mile Bay,13693,Residential,National Grid,Purchase,Complete,2/1/2016,2016-09-21,29680.0,4240.0,8.8,10330,"Fourth Coast, Inc",SolarWorld,"Three Mile\nBay, NY 13693\n",3.372727,2.890909
2,89350,Queens,11434,Residential,Consolidated Edison,Lease,Complete,5/30/2017,2017-07-25,30791.25,2898.0,7.25,8504,SunPower Capital LLC,SunPower,"Queens, NY 11434\n(40.676003, -73.775656)",4.247069,3.847345
3,93198,Bronx,10456,Residential,Consolidated Edison,Lease,Complete,7/19/2017,2017-09-20,13906.0,1848.0,4.62,5423,SunRun Inc.,LG Electronics,"Bronx, NY 10456\n(40.830652, -73.908566)",3.009957,2.609957
7,2011-157041-SLPR,Old Field,11733,Residential,PSEG Long Island,Purchase,Complete,10/9/2005,2005-10-09,,14000.0,3.5,4108,,,"Old Field, NY 11733\n(40.93232, -73.105)",,
8,2011-158360-SLPR,Medford,11763,Residential,PSEG Long Island,Purchase,Complete,4/23/2009,2009-04-23,136952.0,35000.0,10.25,12032,,,"Medford, NY 11763\n(40.822436, -72.982416)",13.361171,9.946537


In [6]:
#04  Save File
# Build the destination path from its components (parent directory,
# Resources folder, file name) and write the cleaned data there as CSV.
out_path = os.path.join(*('..', 'Resources', 'Solar installation data.csv'))
solar.to_csv(path_or_buf=out_path)

# Preview the first five rows of the data that was written
solar.head(n=5)


Unnamed: 0,Project Number,City,Zip Code,Sector,Electric Utility,Purchase Type,Project Status,Date Application Received,Date Completed,Project Cost,$Incentive,Total Nameplate kW DC,Expected KWh Annual Production,Contractor,Primary PV Module Manufacturer,Location 1,$_per_watt,$_per_watt_wincentives
0,4260-88892,Three Mile Bay,13693,Residential,National Grid,Purchase,Complete,2/1/2016,2016-09-21,29680.0,4240.0,8.8,10330,"Fourth Coast, Inc",SolarWorld,"Three Mile\nBay, NY 13693\n",3.372727,2.890909
2,89350,Queens,11434,Residential,Consolidated Edison,Lease,Complete,5/30/2017,2017-07-25,30791.25,2898.0,7.25,8504,SunPower Capital LLC,SunPower,"Queens, NY 11434\n(40.676003, -73.775656)",4.247069,3.847345
3,93198,Bronx,10456,Residential,Consolidated Edison,Lease,Complete,7/19/2017,2017-09-20,13906.0,1848.0,4.62,5423,SunRun Inc.,LG Electronics,"Bronx, NY 10456\n(40.830652, -73.908566)",3.009957,2.609957
7,2011-157041-SLPR,Old Field,11733,Residential,PSEG Long Island,Purchase,Complete,10/9/2005,2005-10-09,,14000.0,3.5,4108,,,"Old Field, NY 11733\n(40.93232, -73.105)",,
8,2011-158360-SLPR,Medford,11763,Residential,PSEG Long Island,Purchase,Complete,4/23/2009,2009-04-23,136952.0,35000.0,10.25,12032,,,"Medford, NY 11763\n(40.822436, -72.982416)",13.361171,9.946537


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the cleaned data.
solar.describe()

Unnamed: 0,Zip Code,Project Cost,$Incentive,Total Nameplate kW DC,Expected KWh Annual Production,$_per_watt,$_per_watt_wincentives
count,82583.0,82396.0,82545.0,82583.0,82583.0,82396.0,82358.0
mean,11926.007423,52035.39,10497.33,13.072131,15134.27,4.793315,3.91259
std,1084.038024,191501.5,48976.45,80.245291,90144.81,2.34799,2.095576
min,501.0,150.0,0.0,0.14,164.0,0.007102,-3.137346
25%,11234.0,23008.47,2080.0,5.12,6010.0,3.75,3.095738
50%,11762.0,32346.38,3780.0,7.14,8381.0,4.56519,3.8
75%,12491.0,45391.1,8033.0,9.88,11598.0,5.6,4.679
max,14905.0,9962128.0,2980270.0,5739.52,6737278.0,282.758621,279.282759
