In [98]:
import pandas as pd
import numpy as np
import datetime
import time

# COMBINING DAILY SOLAR DATA WITH CAPACITY INFO

In [99]:
# import metadata
metadata = pd.read_csv('metadata.csv')
metadata.drop(index=metadata.index[0], axis=0, inplace=True)
keeper_columns = ['dataid', 'pv', 'pv_panel_direction', 'total_amount_of_pv', 'amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']
metadata = metadata[keeper_columns]
metadata[["dataid", "total_amount_of_pv", 'amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']] = metadata[["dataid", "total_amount_of_pv", 'amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']].apply(pd.to_numeric)

# filling nans in certain metadata columns
metadata[['amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']] = metadata[['amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                 'amount_of_east_facing_pv']].fillna(0)

metadata

Unnamed: 0,dataid,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
1,2836,,,,0.0,0.00,0.0
2,2743,,,,0.0,0.00,0.0
3,5323,,,,0.0,0.00,0.0
4,8560,,,,0.0,0.00,0.0
5,3313,,,,0.0,0.00,0.0
...,...,...,...,...,...,...,...
1728,11360,yes,West,5.25,0.0,5.25,0.0
1729,5361,,,,0.0,0.00,0.0
1730,8217,,,,0.0,0.00,0.0
1731,8057,yes,,,0.0,0.00,0.0


In [100]:
# import solar data
solardata = pd.read_csv('manipulated_15minute_data_austin.csv')
solardata[["dataid"]] = solardata[["dataid"]].apply(pd.to_numeric)
solardata.dropna(inplace=True)

In [101]:
# Combine info

merged = solardata.merge(metadata, on='dataid', how="inner")
merged.dropna(inplace=True)
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,0.0,0.0
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,0.0,0.0
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,0.0,0.0
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,0.0,0.0
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,0.0,0.0


In [102]:
# NON-MEMORY: merge weather and solar data
# IMPROVEMENT AREA: drops all solar info except 'percent output'
merged = merged[['dataid','local_15min','total_amount_of_pv','solar']]
weatherdata = pd.read_excel('2018 Compiled Weather Data_Solar Nans Begone.xlsx')

# Converting join columns to matching datetime data types
merged['local_15min'] = pd.to_datetime(merged['local_15min'])
weatherdata['datetime'] = pd.to_datetime(weatherdata['datetime'])

  weatherdata = pd.read_excel('2018 Compiled Weather Data_Solar Nans Begone.xlsx')


In [103]:
trainingdata = merged.merge(weatherdata, left_on='local_15min', right_on='datetime', how="inner")
trainingdata.head()

Unnamed: 0,dataid,local_15min,total_amount_of_pv,solar,datetime,temp,feelslike,dew,humidity,precip,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,661,2018-11-21 16:00:00,6.3,0.076,2018-11-21 16:00:00,54.0,54.0,43.1,66.34,0.0,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
1,1642,2018-11-21 16:00:00,6.37,0.097,2018-11-21 16:00:00,54.0,54.0,43.1,66.34,0.0,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
2,2335,2018-11-21 16:00:00,5.92,0.079,2018-11-21 16:00:00,54.0,54.0,43.1,66.34,0.0,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
3,2818,2018-11-21 16:00:00,5.39,0.127,2018-11-21 16:00:00,54.0,54.0,43.1,66.34,0.0,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
4,3039,2018-11-21 16:00:00,6.25,0.152,2018-11-21 16:00:00,54.0,54.0,43.1,66.34,0.0,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."


In [104]:
## IMPROVEMENT OPPORTUNITY: 'conditions' categorical variable is not used
trainingdata.drop(['datetime', 'windgust', 'dataid', 'stations', 'icon', 'conditions'], axis=1, inplace=True) 
list(trainingdata.columns.values)

['local_15min',
 'total_amount_of_pv',
 'solar',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk']

In [105]:
# turn datetime into 4 new columns: year, month, day, time
trainingdata['year'] = trainingdata['local_15min'].dt.year
trainingdata['month'] = trainingdata['local_15min'].dt.month
trainingdata['day'] = trainingdata['local_15min'].dt.day
trainingdata['time'] = trainingdata['local_15min'].dt.hour + trainingdata['local_15min'].dt.minute / 60

In [106]:
trainingdata.drop(['local_15min', 'precipprob','preciptype','severerisk', 'snow', 'snowdepth', 'year'], axis=1, inplace=True) 

trainingdata[['solarradiation', 'solarenergy','uvindex']] = trainingdata[['solarradiation', 'solarenergy','uvindex']].fillna(0)

#IMPROVEMENT AREA: a lot of data is dropped here. This might be something from adding the new columns
trainingdata.dropna(inplace=True)
trainingdata.head()

Unnamed: 0,total_amount_of_pv,solar,temp,feelslike,dew,humidity,precip,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,month,day,time
0,6.3,0.076,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0
1,6.37,0.097,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0
2,5.92,0.079,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0
3,5.39,0.127,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0
4,6.25,0.152,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0


In [107]:
# Aggregate solar output by hour. Creates grid-level info rather than house-level
Start_index = 0
End_index = 0
Starting_fresh = True

Cumulative_capacity = 0
Cumulative_output = 0

Previous_month = 11
Previous_day = 12
Previous_time = 10


for i, row in trainingdata.iterrows():
    if Starting_fresh == True:
        Previous_month = trainingdata.at[i,'month']
        Previous_day = trainingdata.at[i,'day']
        Previous_time = trainingdata.at[i,'time']
        
        Cumulative_capacity = 0
        Cumulative_output = 0
        
        Start_index = i
        End_index = i
        Starting_fresh = False
    
    Current_month = trainingdata.at[i,'month']
    Current_day = trainingdata.at[i,'day']
    Current_time = trainingdata.at[i,'time']
    
    if (Current_month != Previous_month) or (Current_day != Previous_day) or (Current_time != Previous_time):
        trainingdata.at[Start_index,'total_amount_of_pv'] = Cumulative_capacity #overwrite first time with cumulative data
        trainingdata.at[Start_index,'solar'] = Cumulative_output
        
        if (Start_index != End_index):
            trainingdata.drop(list(range(Start_index+1, End_index+1)), inplace=True) #drop other rows with the same time after that. this might be a bug if dropping during iterrows causes it to skip
                      
        Previous_month = trainingdata.at[i,'month']
        Previous_day = trainingdata.at[i,'day']
        Previous_time = trainingdata.at[i,'time']
        
        Cumulative_capacity = trainingdata.at[i,'total_amount_of_pv']
        Cumulative_output = trainingdata.at[i,'solar']
        
        Start_index = i
        End_index = i
        
    else:
        Cumulative_capacity = Cumulative_capacity + trainingdata.at[i,'total_amount_of_pv']
        Cumulative_output = Cumulative_output + trainingdata.at[i,'solar']
        End_index = i

In [108]:
trainingdata.head(10)

Unnamed: 0,total_amount_of_pv,solar,temp,feelslike,dew,humidity,precip,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,month,day,time
0,88.93,1.609,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0
16,88.93,-0.273,53.1,53.1,44.0,71.14,0.0,2.6,334.0,1025.9,100.0,9.9,22.0,0.1,0.0,11,21,17.0
32,88.93,-0.113,51.3,51.3,45.2,79.54,0.0,1.1,296.0,1026.8,100.0,9.5,49.0,0.2,0.0,11,21,18.0
48,88.93,-0.101,51.2,51.2,46.4,83.72,0.0,0.0,0.0,1027.3,100.0,9.9,17.0,0.1,0.0,11,21,19.0
64,88.93,-0.1,51.1,51.1,45.5,81.1,0.0,1.3,337.0,1027.6,100.0,9.9,1.0,0.0,0.0,11,21,20.0
80,88.93,-0.002,60.6,60.6,46.6,59.73,0.0,1.3,80.0,1021.0,100.0,9.9,43.0,0.2,0.0,11,22,17.0
96,88.93,-0.122,59.5,59.5,47.2,63.79,0.0,0.8,60.0,1021.0,100.0,9.9,11.0,0.0,0.0,11,22,18.0
112,88.93,-0.102,59.3,59.3,47.3,64.5,0.0,0.8,60.0,1021.2,100.0,9.9,31.0,0.1,0.0,11,22,19.0
128,88.93,-0.099,58.9,58.9,47.5,65.86,0.0,3.2,160.0,1020.7,100.0,9.9,1.0,0.0,0.0,11,22,20.0
144,88.93,-0.097,58.9,58.9,47.8,66.51,0.0,6.8,148.0,1020.8,100.0,9.9,0.0,0.0,0.0,11,22,21.0


In [109]:
# Create "Percent Output" column
# Unit Assumptions. Near certain these are right:
#      -"total amount of pv" unit is kW (looked at average PV installation sizes)
#      -"solar" and "grid" units are kW (looked at average home consumption)
# IMPROVEMENT AREA: Does not account for orientation of panels
trainingdata["Percent Output"] = trainingdata["solar"] / trainingdata["total_amount_of_pv"]
trainingdata.head()

Unnamed: 0,total_amount_of_pv,solar,temp,feelslike,dew,humidity,precip,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,month,day,time,Percent Output
0,88.93,1.609,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0,0.018093
16,88.93,-0.273,53.1,53.1,44.0,71.14,0.0,2.6,334.0,1025.9,100.0,9.9,22.0,0.1,0.0,11,21,17.0,-0.00307
32,88.93,-0.113,51.3,51.3,45.2,79.54,0.0,1.1,296.0,1026.8,100.0,9.5,49.0,0.2,0.0,11,21,18.0,-0.001271
48,88.93,-0.101,51.2,51.2,46.4,83.72,0.0,0.0,0.0,1027.3,100.0,9.9,17.0,0.1,0.0,11,21,19.0,-0.001136
64,88.93,-0.1,51.1,51.1,45.5,81.1,0.0,1.3,337.0,1027.6,100.0,9.9,1.0,0.0,0.0,11,21,20.0,-0.001124


In [110]:
trainingdata.drop(columns=['solar','total_amount_of_pv'], inplace=True)
trainingdata.head()

Unnamed: 0,temp,feelslike,dew,humidity,precip,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,month,day,time,Percent Output
0,54.0,54.0,43.1,66.34,0.0,1.1,300.0,1026.0,100.0,9.5,65.0,0.2,1.0,11,21,16.0,0.018093
16,53.1,53.1,44.0,71.14,0.0,2.6,334.0,1025.9,100.0,9.9,22.0,0.1,0.0,11,21,17.0,-0.00307
32,51.3,51.3,45.2,79.54,0.0,1.1,296.0,1026.8,100.0,9.5,49.0,0.2,0.0,11,21,18.0,-0.001271
48,51.2,51.2,46.4,83.72,0.0,0.0,0.0,1027.3,100.0,9.9,17.0,0.1,0.0,11,21,19.0,-0.001136
64,51.1,51.1,45.5,81.1,0.0,1.3,337.0,1027.6,100.0,9.9,1.0,0.0,0.0,11,21,20.0,-0.001124


In [111]:
# reorder so the predicted feature is at the end
column_to_reorder = trainingdata.pop('Percent Output')
trainingdata.insert(len(trainingdata. columns), 'Percent Output', column_to_reorder)

trainingdata.dtypes

temp                float64
feelslike           float64
dew                 float64
humidity            float64
precip              float64
windspeed           float64
winddir             float64
sealevelpressure    float64
cloudcover          float64
visibility          float64
solarradiation      float64
solarenergy         float64
uvindex             float64
month                 int64
day                   int64
time                float64
Percent Output      float64
dtype: object

In [73]:
# Transforming output data to include time-series information
# IMPROVEMENT AREA: Fiddle with how far back you want the data to go to inform future predictions

current_house, onefourtyfive_ago, onethirty_ago, onefifteen_ago, one_ago, fourtyfive_ago, thirty_ago, fifteen_ago = None, None, None, None, None, None, None, None
trainingdata['onefourtyfive_ago'] = None
trainingdata['onethirty_ago'] = None
trainingdata['onefifteen_ago'] = None
trainingdata['one_ago'] = None
trainingdata['fourtyfive_ago'] = None
trainingdata['thirty_ago'] = None
trainingdata['fifteen_ago'] = None

trainingdata.head()

for i, row in trainingdata.iterrows():
    
    
    trainingdata.at[i,'onefourtyfive_ago'] = onefourtyfive_ago
    trainingdata.at[i,'onethirty_ago'] = onethirty_ago
    trainingdata.at[i,'onefifteen_ago'] = onefifteen_ago
    trainingdata.at[i,'one_ago'] = one_ago
    trainingdata.at[i,'fourtyfive_ago'] = fourtyfive_ago
    trainingdata.at[i,'thirty_ago'] = thirty_ago
    trainingdata.at[i,'fifteen_ago'] = fifteen_ago
        
    onefourtyfive_ago = onethirty_ago
    onethirty_ago = onefifteen_ago
    onefifteen_ago = one_ago
    one_ago = fourtyfive_ago
    fourtyfive_ago = thirty_ago                       # refresh previous data variables
    thirty_ago = fifteen_ago
    fifteen_ago = row['Percent Output']   

trainingdata[['onefourtyfive_ago', 'onethirty_ago', 'onefifteen_ago', 'one_ago', 
              'fourtyfive_ago', 'thirty_ago', 'fifteen_ago']] = trainingdata[['onefourtyfive_ago', 'onethirty_ago', 
                                                                              'onefifteen_ago', 'one_ago', 'fourtyfive_ago', 
                                                                              'thirty_ago', 'fifteen_ago']].apply(pd.to_numeric)

In [112]:
# reorder so the predicted feature is at the end
column_to_reorder = trainingdata.pop('Percent Output')
trainingdata.insert(len(trainingdata. columns), 'Percent Output', column_to_reorder)

trainingdata.dtypes

temp                float64
feelslike           float64
dew                 float64
humidity            float64
precip              float64
windspeed           float64
winddir             float64
sealevelpressure    float64
cloudcover          float64
visibility          float64
solarradiation      float64
solarenergy         float64
uvindex             float64
month                 int64
day                   int64
time                float64
Percent Output      float64
dtype: object

In [113]:
trainingdata.describe()

Unnamed: 0,temp,feelslike,dew,humidity,precip,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,month,day,time,Percent Output
count,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0
mean,68.685069,69.034292,54.797561,65.400261,0.003665,6.144434,155.847503,1016.861972,48.713147,9.223264,158.352574,0.569033,1.556297,6.316985,15.72153,11.492618,0.130228
std,16.734328,18.664735,15.959446,21.324862,0.037646,3.695078,107.556726,6.604588,41.007277,1.800442,242.718468,0.874376,2.450186,3.481339,8.869966,6.918455,0.201332
min,17.8,8.5,4.7,9.38,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.0037
25%,56.3,56.3,43.5,48.13,0.0,3.7,53.0,1012.4,6.6,9.8,0.0,0.0,0.0,3.0,8.0,5.0,-0.00108
50%,70.1,70.1,59.5,68.01,0.0,6.1,163.0,1015.6,33.9,9.9,17.0,0.1,0.0,6.0,16.0,11.0,-0.000798
75%,80.6,82.5,68.4,84.38,0.0,8.5,202.0,1020.7,97.5,9.9,229.0,0.8,2.0,10.0,23.0,17.0,0.201496
max,108.8,109.8,76.0,100.0,1.35,22.8,360.0,1043.5,100.0,9.9,984.0,3.5,10.0,12.0,31.0,23.0,0.743664


In [114]:
#Normalizing data
for column in trainingdata.columns:
    if column == 'Percent Output':
        break
    trainingdata[column] = trainingdata[column]
    trainingdata[column] = (trainingdata[column] - trainingdata[column].mean()) / trainingdata[column].std()

In [115]:
trainingdata.describe()

Unnamed: 0,temp,feelslike,dew,humidity,precip,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,month,day,time,Percent Output
count,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0,7789.0
mean,-1.423058e-14,-2.975039e-16,-1.768603e-15,7.823872e-15,8.572728e-16,-1.109288e-14,9.456495e-16,1.02808e-13,-2.10389e-14,-1.957739e-13,1.704433e-15,-1.550236e-16,3.878183e-15,-2.094027e-14,-3.244149e-16,6.117701000000001e-17,0.130228
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.201332
min,-3.04076,-3.243244,-3.139054,-2.626993,-0.09736502,-1.66287,-1.44898,-2.55307,-1.187915,-5.122776,-0.6524125,-0.6507876,-0.6351752,-1.527282,-1.659705,-1.661154,-0.0037
25%,-0.7400996,-0.6822648,-0.7078918,-0.8098651,-0.09736502,-0.661538,-0.9562164,-0.6755868,-1.026968,0.3203304,-0.6524125,-0.6507876,-0.6351752,-0.9527903,-0.8705254,-0.938449,-0.00108
50%,0.08455262,0.05709741,0.2946493,0.1223801,-0.09736502,-0.01202531,0.06649977,-0.1910751,-0.3612321,0.3758723,-0.5823726,-0.5364203,-0.6351752,-0.09105276,0.03139467,-0.07120343,-0.000798
75%,0.7120054,0.7214519,0.8523127,0.8900287,-0.09736502,0.6374874,0.4290991,0.5811154,1.189712,0.3758723,0.2910674,0.2641503,0.1810894,1.057931,0.8205747,0.7960422,0.201496
max,2.397164,2.184103,1.32852,1.622507,35.7628,4.5075,1.898091,4.033261,1.250677,0.3758723,3.401667,3.352065,3.446148,1.632422,1.722495,1.663288,0.743664


In [116]:
trainingdata.to_csv("Normalized Aggregated 2018 Compiled Weather Data.csv")