In [1]:
import pandas as pd
import numpy as np
import datetime
import time

# COMBINING DAILY SOLAR DATA WITH CAPACITY INFO
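1. From the metadata, get each home's total solar capacity and its south-, east-, and west-facing solar capacity.
2. Iterate through the manipulated 15-minute data and append that capacity info to each row.
3. Every time a new `dataid` is reached, get the next matching row from the metadata.
4. Repeat until every row of the solar data has been processed.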

In [2]:
# import data
keeper_columns = ['dataid', 'pv', 'pv_panel_direction', 'total_amount_of_pv', 'amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']

# Skip the first row of metadata.csv (presumably a non-data description row -- TODO confirm)
# and keep only the PV-related columns. The explicit .copy() makes `metadata` an
# independent frame, so the numeric conversion below never assigns into a slice of
# the full table (which can trigger pandas' SettingWithCopyWarning).
metadata = pd.read_csv('metadata.csv').iloc[1:][keeper_columns].copy()
metadata[["dataid", "total_amount_of_pv"]] = metadata[["dataid", "total_amount_of_pv"]].apply(pd.to_numeric)

# 15-minute readings; `dataid` is made numeric so it can be joined against the metadata ids
solardata = pd.read_csv('manipulated_15minute_data_austin.csv')
solardata["dataid"] = pd.to_numeric(solardata["dataid"])

metadata

Unnamed: 0,dataid,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
1,2836,,,,,,
2,2743,,,,,,
3,5323,,,,,,
4,8560,,,,,,
5,3313,,,,,,
...,...,...,...,...,...,...,...
1728,11360,yes,West,5.25,,5.25,
1729,5361,,,,,,
1730,8217,,,,,,
1731,8057,yes,,,,,


In [3]:
# Combine info: attach the PV metadata to every reading with the same dataid.
# Inner join, so only ids present in both frames are kept.
merged = pd.merge(solardata, metadata, how="inner", on='dataid')
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,,
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,,
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,,
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,,
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,,


In [4]:
# Create "Percent Output" column: solar reading divided by installed PV capacity
# (stored as a plain ratio; it is not multiplied by 100).
# Unit Assumptions. Near certain these are right:
#      -"total amount of pv" unit is kW (looked at average PV installation sizes)
#      -"solar" and "grid" units are kW (looked at average home consumption)
# IMPROVEMENT AREA: Does not account for orientation of panels
installed_capacity = merged["total_amount_of_pv"]
merged["Percent Output"] = merged["solar"].div(installed_capacity)
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv,Percent Output
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,,,0.04381
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,,,0.026508
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,,,0.028413
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,,,0.012063
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,,,0.010159


In [5]:
# merge weather and solar data
# IMPROVEMENT AREA: drops all solar info except 'percent output'
# .copy() turns the column subset into an independent frame, so the datetime
# assignment below does not write into a slice of the previous `merged`
# (which can trigger pandas' SettingWithCopyWarning).
merged = merged[['dataid','local_15min','Percent Output']].copy()
weatherdata = pd.read_csv('Compiled Weather Data.csv')

# Converting join columns to matching datetime data types
merged['local_15min'] = pd.to_datetime(merged['local_15min'])
weatherdata['datetime'] = pd.to_datetime(weatherdata['datetime'])

In [6]:
# Inner join on timestamp: a solar reading is kept only when the weather
# table has a record with exactly the same datetime.
trainingdata = pd.merge(
    merged,
    weatherdata,
    how="inner",
    left_on='local_15min',
    right_on='datetime',
)
trainingdata.head()

Unnamed: 0,dataid,local_15min,Percent Output,name,datetime,temp,feelslike,dew,humidity,precip,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,661,2018-01-01,,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
1,1642,2018-01-01,-0.000785,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
2,2335,2018-01-01,-0.001014,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
3,2361,2018-01-01,,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"
4,2818,2018-01-01,-0.000928,"Austin, TX",2018-01-01,26.9,17.2,21.1,78.65,0.0,...,1038.3,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,72254013904,72254413958"


In [7]:
## IMPROVEMENT OPPORTUNITY: 'conditions' categorical variable is not used
# Remove identifier / descriptive columns, then list what is left
unused_columns = ['name', 'datetime', 'dataid', 'stations', 'icon', 'conditions']
trainingdata = trainingdata.drop(columns=unused_columns)
trainingdata.columns.tolist()

['local_15min',
 'Percent Output',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk']

In [8]:
# Inspect the dtype of every remaining column
trainingdata.dtypes

local_15min         datetime64[ns]
Percent Output             float64
temp                       float64
feelslike                  float64
dew                        float64
humidity                   float64
precip                     float64
precipprob                 float64
preciptype                 float64
snow                         int64
snowdepth                    int64
windgust                   float64
windspeed                  float64
winddir                    float64
sealevelpressure           float64
cloudcover                 float64
visibility                 float64
solarradiation             float64
solarenergy                float64
uvindex                    float64
severerisk                 float64
dtype: object

In [14]:
# turn datetime into unix timestamp (alternative time encoding)
# The result is kept in its own Series instead of overwriting 'local_15min':
# the following cell uses the `.dt` accessor on that column, which only works
# while it is still datetime64.
# Note: time.mktime interprets each naive timestamp in the machine's local timezone.
unix_timestamps = trainingdata['local_15min'].map(lambda x: time.mktime(x.timetuple()))

In [8]:
# turn datetime into 4 new columns: year, month, day, time
stamp = trainingdata['local_15min'].dt
for part in ('year', 'month', 'day'):
    trainingdata[part] = getattr(stamp, part)
# 'time' is the hour of day with minutes as a fraction (e.g. 15:30 -> 15.5)
trainingdata['time'] = stamp.hour + stamp.minute / 60

In [9]:
# reorder so the predicted feature is at the end:
# pop removes the column, and assigning it back appends it as the last column
target = trainingdata.pop('Percent Output')
trainingdata['Percent Output'] = target

trainingdata.dtypes

local_15min         datetime64[ns]
temp                       float64
feelslike                  float64
dew                        float64
humidity                   float64
precip                     float64
precipprob                 float64
preciptype                 float64
snow                         int64
snowdepth                    int64
windgust                   float64
windspeed                  float64
winddir                    float64
sealevelpressure           float64
cloudcover                 float64
visibility                 float64
solarradiation             float64
solarenergy                float64
uvindex                    float64
severerisk                 float64
year                         int64
month                        int64
day                          int64
time                       float64
Percent Output             float64
dtype: object

In [17]:
# Show the last five rows of the training table
trainingdata.tail()

Unnamed: 0,local_15min,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,...,visibility,solarradiation,solarenergy,uvindex,severerisk,year,month,day,time,Percent Output
73787,2018-08-31 23:00:00,83.8,88.1,69.9,63.08,0.0,,,0,0,...,9.9,,,,,2018,8,31,23.0,
73788,2018-08-31 23:00:00,83.8,88.1,69.9,63.08,0.0,,,0,0,...,9.9,,,,,2018,8,31,23.0,-0.001042
73789,2018-08-31 23:00:00,83.8,88.1,69.9,63.08,0.0,,,0,0,...,9.9,,,,,2018,8,31,23.0,-0.000222
73790,2018-08-31 23:00:00,83.8,88.1,69.9,63.08,0.0,,,0,0,...,9.9,,,,,2018,8,31,23.0,
73791,2018-08-31 23:00:00,83.8,88.1,69.9,63.08,0.0,,,0,0,...,9.9,,,,,2018,8,31,23.0,-0.001304


In [10]:
# Drop the raw timestamp column
trainingdata = trainingdata.drop(columns=['local_15min'])

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training table
trainingdata.describe()

Unnamed: 0,temp,feelslike,dew,humidity,precip,snow,snowdepth,windgust,windspeed,winddir,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,year,month,day,time,Percent Output
count,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,...,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0,9231.0
mean,79.676785,80.713541,54.160925,44.645994,0.003705,0.0,0.0,20.669527,9.639248,173.017983,...,40.603586,9.735337,387.443506,1.393078,3.840646,2018.0,4.846929,16.221211,14.345033,0.357038
std,17.517095,20.088386,17.012247,17.153555,0.053846,0.0,0.0,3.832644,2.811253,88.716513,...,36.135209,0.743708,263.732556,0.949083,2.683334,0.0,2.77289,8.719142,2.901415,0.231429
min,25.0,13.7,5.5,9.38,0.0,0.0,0.0,16.1,1.8,1.0,...,0.0,1.6,4.0,0.0,0.0,2018.0,1.0,1.0,8.0,-0.010518
25%,68.1,68.1,47.1,31.64,0.0,0.0,0.0,18.1,7.8,147.5,...,9.1,9.9,148.0,0.5,1.0,2018.0,3.0,9.0,12.0,0.144495
50%,81.1,82.3,61.1,41.88,0.0,0.0,0.0,19.7,9.3,176.0,...,23.0,9.9,372.0,1.3,4.0,2018.0,3.0,16.0,14.0,0.378507
75%,94.6,97.5,65.9,54.5,0.0,0.0,0.0,22.7,11.4,200.0,...,80.9,9.9,621.0,2.2,6.0,2018.0,8.0,23.0,17.0,0.567619
max,104.6,105.3,75.8,97.07,1.1,0.0,0.0,37.5,19.5,360.0,...,100.0,9.9,902.0,3.2,9.0,2018.0,8.0,31.0,21.0,0.829048


In [12]:
# Remove the precipprob, preciptype and severerisk weather columns.
# Labels must match the column names exactly: DataFrame.drop raises KeyError
# on an unknown label and then drops nothing.
trainingdata = trainingdata.drop(columns=['precipprob', 'preciptype', 'severerisk'])

In [14]:
# Discard every row that has a missing value in any column
trainingdata = trainingdata.dropna()

In [16]:
# Export the training table to CSV.
# NOTE(review): with to_csv's defaults the row index is written as the first column -- confirm downstream readers expect it.
trainingdata.to_csv("SolarTrainingData.csv")