In [7]:
import pandas as pd
import numpy as np

import datetime
import time

# COMBINING DAILY SOLAR DATA WITH CAPACITY INFO
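1. Get each home's solar capacity from the metadata: total, south-facing (S), east-facing (E), and west-facing (W)
2. Iterate through the manipulated solar data and append the capacity info to each row
3. Every time you reach a new dataid, get the next row from the metadata
4. Repeat until every row has been processed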

In [8]:
# Load the per-home metadata and the 15-minute solar readings.
keeper_columns = [
    'dataid',
    'pv',
    'pv_panel_direction',
    'total_amount_of_pv',
    'amount_of_south_facing_pv',
    'amount_of_west_facing_pv',
    'amount_of_east_facing_pv',
]

# Discard the first record of the file (presumably a non-data description
# row -- TODO confirm) and keep only the PV-related columns.
metadata = (
    pd.read_csv('metadata.csv')
    .pipe(lambda frame: frame.drop(index=frame.index[0]))
    [keeper_columns]
)
# dataid and total capacity must be numeric.
metadata = metadata.assign(
    dataid=pd.to_numeric(metadata['dataid']),
    total_amount_of_pv=pd.to_numeric(metadata['total_amount_of_pv']),
)

solardata = pd.read_csv('manipulated_15minute_data_austin.csv')
solardata['dataid'] = pd.to_numeric(solardata['dataid'])

metadata

Unnamed: 0,dataid,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
1,2836,,,,,,
2,2743,,,,,,
3,5323,,,,,,
4,8560,,,,,,
5,3313,,,,,,
...,...,...,...,...,...,...,...
1728,11360,yes,West,5.25,,5.25,
1729,5361,,,,,,
1730,8217,,,,,,
1731,8057,yes,,,,,


In [9]:
# Combine info: attach each home's PV metadata to its solar readings.
# Inner join on dataid, so readings without a matching metadata row
# (and metadata rows without readings) are left out.
merged = pd.merge(solardata, metadata, how="inner", on="dataid")
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,,
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,,
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,,
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,,
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,,


In [10]:
# Create "Percent Output" column: measured solar output divided by the
# installed PV capacity. Note that the value is a fraction (e.g. 0.04),
# not multiplied by 100.
# Unit Assumptions. Near certain these are right:
#      -"total amount of pv" unit is kW (looked at average PV installation sizes)
#      -"solar" and "grid" units are kW (looked at average home consumption)
# IMPROVEMENT AREA: Does not account for orientation of panels

# A capacity of exactly 0 would turn the ratio into +/-inf (or NaN for 0/0).
# Treat it like a missing capacity so the result is NaN instead.
installed_capacity = merged["total_amount_of_pv"].replace(0, np.nan)
merged["Percent Output"] = merged["solar"] / installed_capacity
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv,Percent Output
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,,,0.04381
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,,,0.026508
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,,,0.028413
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,,,0.012063
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,,,0.010159


In [11]:
# merge weather and solar data (preparation: trim columns, load weather,
# and give both join columns a datetime dtype)
# IMPROVEMENT AREA: drops all solar info except 'percent output'
merged = merged[['dataid', 'local_15min', 'Percent Output']].assign(
    local_15min=lambda frame: pd.to_datetime(frame['local_15min'])
)

# The weather table's 'datetime' column is parsed the same way so that the
# two join keys have matching types.
weatherdata = pd.read_csv('Compiled Weather Data.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

In [12]:
# Join every solar reading to the weather record with the identical timestamp.
# NOTE(review): this is an exact-match inner join, so solar readings whose
# timestamp has no identical weather 'datetime' are dropped -- confirm the
# weather data has the same time resolution as the solar data.
trainingdata = pd.merge(
    merged,
    weatherdata,
    how="inner",
    left_on='local_15min',
    right_on='datetime',
)
trainingdata.head()

Unnamed: 0,dataid,local_15min,Percent Output,name,datetime,temp,feelslike,dew,humidity,precip,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,661,2018-01-01,,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
1,1642,2018-01-01,-0.000785,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
2,2335,2018-01-01,-0.001014,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
3,2361,2018-01-01,,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
4,2818,2018-01-01,-0.000928,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"


In [13]:
## IMPROVEMENT OPPORTUNITY: 'conditions' categorical variable is not used
# Columns removed from the training table.
non_feature_columns = ['name', 'datetime', 'dataid', 'stations', 'icon', 'conditions']

# Rebind to a new frame (no in-place mutation) and ignore labels that are
# already gone, so re-running this cell does not raise a KeyError.
trainingdata = trainingdata.drop(columns=non_feature_columns, errors='ignore')
trainingdata.columns.tolist()

['local_15min',
 'Percent Output',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk']

In [None]:
# Show the dtype of each column currently in trainingdata.
trainingdata.dtypes

In [14]:
def _local_epoch_seconds(timestamp):
    """Return `timestamp` as float POSIX seconds.

    Uses time.mktime, which interprets the time tuple in the local timezone
    of the machine running the notebook, so the numbers depend on that
    machine's timezone setting.
    """
    return time.mktime(timestamp.timetuple())


# Only convert while the column still holds datetimes. Once it has been
# converted to floats, re-running this cell would otherwise fail because
# floats have no timetuple().
if pd.api.types.is_datetime64_any_dtype(trainingdata['local_15min']):
    trainingdata['local_15min'] = trainingdata['local_15min'].map(_local_epoch_seconds)

In [27]:
# Move the predicted feature to the last position: remove the column, then
# assign it back, which appends it after all remaining columns.
column_to_reorder = trainingdata.pop('Percent Output')
trainingdata['Percent Output'] = column_to_reorder

trainingdata.head()

Unnamed: 0,dataid,local_15min,name,datetime,temp,feelslike,dew,humidity,precip,precipprob,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations,Percent Output
0,661,1514786000.0,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,,...,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958",
1,1642,1514786000.0,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,,...,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958",-0.000785
2,2335,1514786000.0,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,,...,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958",-0.001014
3,2361,1514786000.0,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,,...,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958",
4,2818,1514786000.0,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,,...,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958",-0.000928


In [29]:
# Preview the first five rows of the final training table.
trainingdata.head(n=5)

Unnamed: 0,local_15min,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,...,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,Percent Output
0,1514786000.0,26.9,17.2,21.1,78.65,0.0,,,0,0,...,10.3,4.0,1038.3,100.0,9.9,,,,,
1,1514786000.0,26.9,17.2,21.1,78.65,0.0,,,0,0,...,10.3,4.0,1038.3,100.0,9.9,,,,,-0.000785
2,1514786000.0,26.9,17.2,21.1,78.65,0.0,,,0,0,...,10.3,4.0,1038.3,100.0,9.9,,,,,-0.001014
3,1514786000.0,26.9,17.2,21.1,78.65,0.0,,,0,0,...,10.3,4.0,1038.3,100.0,9.9,,,,,
4,1514786000.0,26.9,17.2,21.1,78.65,0.0,,,0,0,...,10.3,4.0,1038.3,100.0,9.9,,,,,-0.000928


In [30]:
# Write the training table to disk. The row index is written as well
# (index=True is the pandas default), so the file starts with an unnamed
# index column.
trainingdata.to_csv("SolarTrainingData.csv", index=True)