In [114]:
import pandas as pd
import numpy as np
import datetime
import time

# COMBINING DAILY SOLAR DATA WITH CAPACITY INFO
1. Get solar capacity, S solar, E solar, W solar
2. Iterate through manipulated data and append to row
3. Every time you get to a new data id, get next row from metadata
4. Repeat

In [115]:
# import data
metadata = pd.read_csv('metadata.csv')
metadata.drop(index=metadata.index[0], axis=0, inplace=True)
keeper_columns = ['dataid', 'pv', 'pv_panel_direction', 'total_amount_of_pv', 'amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']
metadata = metadata[keeper_columns]
metadata[["dataid", "total_amount_of_pv"]] = metadata[["dataid", "total_amount_of_pv"]].apply(pd.to_numeric)

solardata = pd.read_csv('manipulated_15minute_data_austin.csv')
solardata[["dataid"]] = solardata[["dataid"]].apply(pd.to_numeric)
solardata.dropna(inplace=True)

metadata

Unnamed: 0,dataid,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
1,2836,,,,,,
2,2743,,,,,,
3,5323,,,,,,
4,8560,,,,,,
5,3313,,,,,,
...,...,...,...,...,...,...,...
1728,11360,yes,West,5.25,,5.25,
1729,5361,,,,,,
1730,8217,,,,,,
1731,8057,yes,,,,,


In [116]:
# Combine info

merged = solardata.merge(metadata, on='dataid', how="inner")
merged.dropna(inplace=True)
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
173447,3039,5/15/2018 10:15,0.427,1.298,123.58,123.757,yes,West,6.25,0.0,6.25,0.0
173448,3039,5/15/2018 10:30,0.775,0.942,123.441,123.528,yes,West,6.25,0.0,6.25,0.0
173449,3039,5/15/2018 10:45,-0.065,1.817,123.614,123.792,yes,West,6.25,0.0,6.25,0.0
173450,3039,5/15/2018 11:00,-0.474,2.197,123.648,123.8,yes,West,6.25,0.0,6.25,0.0
173451,3039,5/15/2018 11:15,-0.589,2.317,123.797,123.999,yes,West,6.25,0.0,6.25,0.0


In [117]:
# Create "Percent Output" column
# Unit Assumptions. Near certain these are right:
#      -"total amount of pv" unit is kW (looked at average PV installation sizes)
#      -"solar" and "grid" units are kW (looked at average home consumption)
# IMPROVEMENT AREA: Does not account for orientation of panels
merged["Percent Output"] = merged["solar"] / merged["total_amount_of_pv"]
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv,Percent Output
173447,3039,5/15/2018 10:15,0.427,1.298,123.58,123.757,yes,West,6.25,0.0,6.25,0.0,0.20768
173448,3039,5/15/2018 10:30,0.775,0.942,123.441,123.528,yes,West,6.25,0.0,6.25,0.0,0.15072
173449,3039,5/15/2018 10:45,-0.065,1.817,123.614,123.792,yes,West,6.25,0.0,6.25,0.0,0.29072
173450,3039,5/15/2018 11:00,-0.474,2.197,123.648,123.8,yes,West,6.25,0.0,6.25,0.0,0.35152
173451,3039,5/15/2018 11:15,-0.589,2.317,123.797,123.999,yes,West,6.25,0.0,6.25,0.0,0.37072


In [118]:
# Transforming output data to include time-series information
# IMPROVEMENT AREA: Fiddle with how far back you want the data to go to inform future predictions

current_house, onefourtyfive_ago, onethirty_ago, onefifteen_ago, one_ago, fourtyfive_ago, thirty_ago, fifteen_ago = None, None, None, None, None, None, None, None
merged['onefourtyfive_ago'] = None
merged['onethirty_ago'] = None
merged['onefifteen_ago'] = None
merged['one_ago'] = None
merged['fourtyfive_ago'] = None
merged['thirty_ago'] = None
merged['fifteen_ago'] = None

merged.head()

for i, row in merged.iterrows():
    
    if (current_house == row["dataid"]):              # if you are continuing with the same house
        merged.at[i,'onefourtyfive_ago'] = onefourtyfive_ago
        merged.at[i,'onethirty_ago'] = onethirty_ago
        merged.at[i,'onefifteen_ago'] = onefifteen_ago
        merged.at[i,'one_ago'] = one_ago
        merged.at[i,'fourtyfive_ago'] = fourtyfive_ago
        merged.at[i,'thirty_ago'] = thirty_ago
        merged.at[i,'fifteen_ago'] = fifteen_ago
        
        onefourtyfive_ago = onethirty_ago
        onethirty_ago = onefifteen_ago
        onefifteen_ago = one_ago
        one_ago = fourtyfive_ago
        fourtyfive_ago = thirty_ago                       # refresh previous data variables
        thirty_ago = fifteen_ago
        fifteen_ago = row["Percent Output"]
        
    else:                                             # if a row encounters a new house
        current_house = row["dataid"]                     # refresh previous data variables
        onefourtyfive_ago, onethirty_ago, onefiteen_ago, one_ago, fourtyfive_ago, thirty_ago = None, None, None, None, None, None        
        fifteen_ago = row["Percent Output"]           

In [119]:
merged.head(20)

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv,Percent Output,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago
173447,3039,5/15/2018 10:15,0.427,1.298,123.58,123.757,yes,West,6.25,0.0,6.25,0.0,0.20768,,,,,,,
173448,3039,5/15/2018 10:30,0.775,0.942,123.441,123.528,yes,West,6.25,0.0,6.25,0.0,0.15072,,,,,,,0.20768
173449,3039,5/15/2018 10:45,-0.065,1.817,123.614,123.792,yes,West,6.25,0.0,6.25,0.0,0.29072,,,,,,0.20768,0.15072
173450,3039,5/15/2018 11:00,-0.474,2.197,123.648,123.8,yes,West,6.25,0.0,6.25,0.0,0.35152,,,,,0.20768,0.15072,0.29072
173451,3039,5/15/2018 11:15,-0.589,2.317,123.797,123.999,yes,West,6.25,0.0,6.25,0.0,0.37072,,,,0.20768,0.15072,0.29072,0.35152
173452,3039,5/15/2018 11:30,-0.826,2.584,123.703,124.031,yes,West,6.25,0.0,6.25,0.0,0.41344,,,0.20768,0.15072,0.29072,0.35152,0.37072
173453,3039,5/15/2018 11:45,-0.986,2.731,123.596,123.726,yes,West,6.25,0.0,6.25,0.0,0.43696,,0.20768,0.15072,0.29072,0.35152,0.37072,0.41344
173454,3039,5/16/2018 7:15,1.4,0.098,122.649,122.805,yes,West,6.25,0.0,6.25,0.0,0.01568,0.20768,0.15072,0.29072,0.35152,0.37072,0.41344,0.43696
173455,3039,5/16/2018 7:30,1.422,0.131,122.503,122.702,yes,West,6.25,0.0,6.25,0.0,0.02096,0.15072,0.29072,0.35152,0.37072,0.41344,0.43696,0.01568
173456,3039,5/16/2018 7:45,1.347,0.164,122.501,122.67,yes,West,6.25,0.0,6.25,0.0,0.02624,0.29072,0.35152,0.37072,0.41344,0.43696,0.01568,0.02096


In [120]:
# merge weather and solar data
# IMPROVEMENT AREA: drops all solar info except 'percent output'
merged = merged[['dataid','local_15min','Percent Output', 'onefourtyfive_ago', 'onethirty_ago', 'onefifteen_ago', 'one_ago', 'fourtyfive_ago', 'thirty_ago', 'fifteen_ago']]
weatherdata = pd.read_csv('Compiled Weather Data.csv')

# Converting join columns to matching datetime data types
merged['local_15min'] = pd.to_datetime(merged['local_15min'])
weatherdata['datetime'] = pd.to_datetime(weatherdata['datetime'])

In [121]:
trainingdata = merged.merge(weatherdata, left_on='local_15min', right_on='datetime', how="inner")
trainingdata.head()

Unnamed: 0,dataid,local_15min,Percent Output,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,3039,2018-01-01 01:00:00,-0.00064,0.03328,0.036,0.04672,0.07392,0.10032,0.15872,0.1464,...,1038.1,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,KEDC,72254013904,7225441..."
1,3039,2018-01-01 02:00:00,-0.0008,0.10032,0.15872,0.1464,-0.00064,-0.0008,-0.0008,-0.0008,...,1038.6,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,KEDC,72254013904,7225441..."
2,3039,2018-01-01 05:00:00,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,...,1039.7,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,KEDC,72254013904,7225441..."
3,3039,2018-01-01 06:00:00,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,...,1040.1,100.0,9.9,,,,,Overcast,cloudy,"KATT,KAUS,72064800230,KEDC,72254013904,7225441..."
4,3039,2018-01-01 08:00:00,0.01696,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,...,1041.6,97.5,9.9,18.0,0.1,0.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."


In [122]:
merged.head()

Unnamed: 0,dataid,local_15min,Percent Output,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago
173447,3039,2018-05-15 10:15:00,0.20768,,,,,,,
173448,3039,2018-05-15 10:30:00,0.15072,,,,,,,0.20768
173449,3039,2018-05-15 10:45:00,0.29072,,,,,,0.20768,0.15072
173450,3039,2018-05-15 11:00:00,0.35152,,,,,0.20768,0.15072,0.29072
173451,3039,2018-05-15 11:15:00,0.37072,,,,0.20768,0.15072,0.29072,0.35152


In [123]:
## IMPROVEMENT OPPORTUNITY: 'conditions' categorical variable is not used
trainingdata.drop(['name', 'datetime', 'dataid', 'stations', 'icon', 'conditions'], axis=1, inplace=True) 
list(trainingdata.columns.values)

['local_15min',
 'Percent Output',
 'onefourtyfive_ago',
 'onethirty_ago',
 'onefifteen_ago',
 'one_ago',
 'fourtyfive_ago',
 'thirty_ago',
 'fifteen_ago',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk']

In [124]:
trainingdata.dtypes

local_15min          datetime64[ns]
Percent Output              float64
onefourtyfive_ago            object
onethirty_ago                object
onefifteen_ago               object
one_ago                      object
fourtyfive_ago               object
thirty_ago                   object
fifteen_ago                  object
temp                        float64
feelslike                   float64
dew                         float64
humidity                    float64
precip                      float64
precipprob                  float64
preciptype                  float64
snow                          int64
snowdepth                     int64
windgust                    float64
windspeed                   float64
winddir                     float64
sealevelpressure            float64
cloudcover                  float64
visibility                  float64
solarradiation              float64
solarenergy                 float64
uvindex                     float64
severerisk                  

In [125]:
# turn datetime into 4 new columns: year, month, day, time
trainingdata['year'] = trainingdata['local_15min'].dt.year
trainingdata['month'] = trainingdata['local_15min'].dt.month
trainingdata['day'] = trainingdata['local_15min'].dt.day
trainingdata['time'] = trainingdata['local_15min'].dt.hour + trainingdata['local_15min'].dt.minute / 60

In [126]:
# reorder so the predicted feature is at the end
column_to_reorder = trainingdata.pop('Percent Output')
trainingdata.insert(len(trainingdata. columns), 'Percent Output', column_to_reorder)

trainingdata.dtypes

local_15min          datetime64[ns]
onefourtyfive_ago            object
onethirty_ago                object
onefifteen_ago               object
one_ago                      object
fourtyfive_ago               object
thirty_ago                   object
fifteen_ago                  object
temp                        float64
feelslike                   float64
dew                         float64
humidity                    float64
precip                      float64
precipprob                  float64
preciptype                  float64
snow                          int64
snowdepth                     int64
windgust                    float64
windspeed                   float64
winddir                     float64
sealevelpressure            float64
cloudcover                  float64
visibility                  float64
solarradiation              float64
solarenergy                 float64
uvindex                     float64
severerisk                  float64
year                        

In [58]:
trainingdata.tail()

Unnamed: 0,local_15min,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,...,visibility,solarradiation,solarenergy,uvindex,severerisk,year,month,day,time,Percent Output
2872,2018-08-31 19:00:00,95.2,96.5,62.7,34.32,0.0,,,0,0,...,9.9,89.0,0.3,1.0,,2018,8,31,19.0,0.21984
2873,2018-08-31 20:00:00,92.8,94.2,63.3,37.72,0.0,,,0,0,...,9.9,14.0,0.1,0.0,,2018,8,31,20.0,-0.00448
2874,2018-08-31 21:00:00,90.5,92.4,64.7,42.72,0.0,,,0,0,...,9.9,,,,,2018,8,31,21.0,-0.00064
2875,2018-08-31 22:00:00,87.0,91.7,69.7,56.62,0.0,,,0,0,...,9.9,,,,,2018,8,31,22.0,-0.00064
2876,2018-08-31 23:00:00,83.8,88.1,69.9,63.08,0.0,,,0,0,...,9.9,,,,,2018,8,31,23.0,-0.00064


In [127]:
trainingdata.drop(['local_15min'], axis=1, inplace=True) 

In [132]:
trainingdata.describe()

Unnamed: 0,temp,feelslike,dew,humidity,precip,snow,snowdepth,windgust,windspeed,winddir,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,year,month,day,time,Percent Output
count,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,...,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0
mean,80.566071,81.720635,54.68631,44.245893,0.00377,0.0,0.0,20.681349,9.589881,170.744048,...,39.777579,9.731151,393.680556,1.415278,3.902778,2018.0,4.988095,16.545635,14.382937,0.353057
std,17.233098,19.705182,16.837449,17.230788,0.054365,0.0,0.0,3.860815,2.827702,87.112961,...,35.542186,0.752598,265.28248,0.954588,2.701239,0.0,2.756496,8.71129,2.889308,0.214073
min,25.0,13.7,5.5,9.38,0.0,0.0,0.0,16.1,1.8,1.0,...,0.0,1.6,4.0,0.0,0.0,2018.0,1.0,1.0,8.0,-0.00464
25%,69.075,69.075,48.175,31.325,0.0,0.0,0.0,18.1,7.6,146.0,...,9.5,9.9,153.75,0.6,2.0,2018.0,3.0,10.0,12.0,0.15308
50%,84.05,86.05,61.5,41.685,0.0,0.0,0.0,19.7,9.25,174.5,...,22.8,9.9,384.0,1.4,4.0,2018.0,7.0,16.0,14.0,0.41184
75%,95.1,97.825,66.125,54.01,0.0,0.0,0.0,22.55,11.3,198.0,...,75.1,9.9,624.0,2.2,6.0,2018.0,8.0,24.0,17.0,0.55276
max,104.6,105.3,75.8,97.07,1.1,0.0,0.0,37.5,19.5,360.0,...,100.0,9.9,902.0,3.2,9.0,2018.0,8.0,31.0,21.0,0.68304


In [129]:
trainingdata.drop(['precipprob','preciptype','severerisk'], axis=1, inplace=True) 

In [130]:
#IMPROVEMENT AREA: a lot of data is dropped here. This might be something from adding the new columns
trainingdata.dropna(inplace=True)

In [131]:
trainingdata.head()

Unnamed: 0,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago,temp,feelslike,dew,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,year,month,day,time,Percent Output
4,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,-0.0008,25.0,14.1,11.4,...,97.5,9.9,18.0,0.1,0.0,2018,1,1,8.0,0.01696
5,-0.0008,-0.0008,-0.0008,0.01696,0.02544,0.05808,0.08224,26.5,15.9,11.7,...,94.3,9.9,266.0,1.0,3.0,2018,1,1,11.0,0.2528
45,0.5184,0.524,0.5272,0.524,0.52176,0.51216,-0.00064,54.9,54.9,20.9,...,6.2,9.9,489.0,1.8,5.0,2018,1,3,15.0,0.50416
130,0.01216,0.02208,0.03056,0.04384,0.10528,0.08992,0.1624,58.1,58.1,53.9,...,43.3,9.9,175.0,0.6,2.0,2018,1,7,10.0,0.1536
131,0.10528,0.08992,0.1624,0.1536,0.12272,0.10432,0.12912,61.8,61.8,54.7,...,100.0,9.9,194.0,0.7,2.0,2018,1,7,11.0,0.148


In [134]:
trainingdata.to_csv("Memory_SolarTrainingData.csv")