In [136]:
import pandas as pd
import numpy as np
import datetime
import time

# COMBINING 15-MINUTE SOLAR DATA WITH CAPACITY INFO

In [137]:
# import metadata
# Columns retained from the raw metadata file.
keeper_columns = ['dataid', 'pv', 'pv_panel_direction', 'total_amount_of_pv',
                  'amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                  'amount_of_east_facing_pv']
# Per-orientation capacity columns; missing values in these are replaced with 0 below.
facing_columns = ['amount_of_south_facing_pv', 'amount_of_west_facing_pv',
                  'amount_of_east_facing_pv']
# Columns that are parsed as numbers.
numeric_columns = ["dataid", "total_amount_of_pv"] + facing_columns

metadata = pd.read_csv('metadata.csv')
# The first row of the loaded frame is discarded before any parsing
# (presumably a non-data description row -- TODO confirm against metadata.csv).
metadata = metadata.drop(index=metadata.index[0])
metadata = metadata[keeper_columns]
metadata[numeric_columns] = metadata[numeric_columns].apply(pd.to_numeric)

# filling nans in certain metadata columns
metadata[facing_columns] = metadata[facing_columns].fillna(0)

metadata

Unnamed: 0,dataid,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
1,2836,,,,0.0,0.00,0.0
2,2743,,,,0.0,0.00,0.0
3,5323,,,,0.0,0.00,0.0
4,8560,,,,0.0,0.00,0.0
5,3313,,,,0.0,0.00,0.0
...,...,...,...,...,...,...,...
1728,11360,yes,West,5.25,0.0,5.25,0.0
1729,5361,,,,0.0,0.00,0.0
1730,8217,,,,0.0,0.00,0.0
1731,8057,yes,,,0.0,0.00,0.0


In [138]:
# import solar data
solardata = pd.read_csv('manipulated_15minute_data_austin.csv')
# Parse the household identifier as a number so it can be joined against the metadata.
solardata["dataid"] = pd.to_numeric(solardata["dataid"])
# Discard every reading that has a missing value in any column.
solardata = solardata.dropna()

In [139]:
# Combine info

# Inner join: only readings whose dataid also appears in the metadata survive.
# dropna() then removes every row with a missing value in any column, so rows
# whose metadata fields (e.g. pv, total_amount_of_pv) are empty are removed too.
merged = pd.merge(solardata, metadata, on='dataid', how="inner").dropna()
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,0.0,0.0
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,0.0,0.0
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,0.0,0.0
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,0.0,0.0
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,0.0,0.0


In [140]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns of the merged frame.
merged.describe()

Unnamed: 0,dataid,grid,solar,leg1v,leg2v,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv
count,556994.0,556994.0,556994.0,556994.0,556994.0,556994.0,556994.0,556994.0,556994.0
mean,4886.084917,0.685658,0.714319,122.361693,122.496614,5.555535,3.356332,2.062626,0.0
std,2596.528661,1.697746,1.165013,1.393875,1.37997,0.930482,1.793952,1.720272,0.0
min,661.0,-7.65,-0.068,0.483,0.483,2.88,0.0,0.0,0.0
25%,3039.0,0.148,-0.006,121.862,121.994,5.145,2.695,0.0,0.0
50%,4373.0,0.514,-0.001,122.443,122.561,5.92,3.185,2.035,0.0
75%,7719.0,1.304,1.029,122.944,123.067,6.25,4.42,2.94,0.0
max,9160.0,15.522,8.687,601.954,601.811,6.37,6.3,6.25,0.0


In [141]:
# Create "Percent Output" column: measured solar output divided by installed PV capacity.
# The stored value is a fraction (e.g. 0.276 / 6.3 = 0.04381), not a number scaled to 0-100.
# Unit Assumptions. Near certain these are right:
#      -"total amount of pv" unit is kW (looked at average PV installation sizes)
#      -"solar" and "grid" units are kW (looked at average home consumption)
# IMPROVEMENT AREA: Does not account for orientation of panels
solar_output = merged["solar"]
installed_capacity = merged["total_amount_of_pv"]
merged["Percent Output"] = solar_output.div(installed_capacity)
merged.head()

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv,Percent Output
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,0.0,0.0,0.04381
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,0.0,0.0,0.026508
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,0.0,0.0,0.028413
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,0.0,0.0,0.012063
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,0.0,0.0,0.010159


In [142]:
# Transforming output data to include time-series information
# IMPROVEMENT AREA: Fiddle with how far back you want the data to go to inform future predictions

# Each lag column holds the "Percent Output" observed a fixed number of rows earlier
# for the same house (1 row = one 15-minute reading in the source file).
# The dict order is the order in which the columns are appended to `merged`.
lag_steps = {
    'onefourtyfive_ago': 7,
    'onethirty_ago': 6,
    'onefifteen_ago': 5,
    'one_ago': 4,
    'fourtyfive_ago': 3,
    'thirty_ago': 2,
    'fifteen_ago': 1,
}

# A new run starts whenever dataid differs from the row directly above it, so the
# lags never reach across a house boundary. (The earlier row-by-row loop reset its
# carry variables by hand and misspelled one of them, which leaked the previous
# house's value into the first rows of the next house.)
house_run = merged['dataid'].ne(merged['dataid'].shift()).cumsum()
output_by_run = merged['Percent Output'].groupby(house_run)

# shift(k) inside a run yields NaN for the first k rows of that run, so the
# resulting columns are float64 without a separate numeric conversion.
for lag_column, steps_back in lag_steps.items():
    merged[lag_column] = output_by_run.shift(steps_back)

# NOTE(review): lags are positional, not time-aware. This assumes each house's rows
# are contiguous and ordered in 15-minute steps with no gaps -- verify against the
# source data, or sort by ['dataid', 'local_15min'] before this cell.

In [143]:
# Show the first 20 rows to inspect how the lag columns fill in at the start of a house's readings.
merged.head(20)

Unnamed: 0,dataid,local_15min,grid,solar,leg1v,leg2v,pv,pv_panel_direction,total_amount_of_pv,amount_of_south_facing_pv,amount_of_west_facing_pv,amount_of_east_facing_pv,Percent Output,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago
0,661,11/21/2018 15:15,0.124,0.276,123.915,124.277,yes,South,6.3,6.3,0.0,0.0,0.04381,,,,,,,
1,661,11/21/2018 15:30,0.251,0.167,123.959,124.293,yes,South,6.3,6.3,0.0,0.0,0.026508,,,,,,,0.04381
2,661,11/21/2018 15:45,0.419,0.179,123.886,124.24,yes,South,6.3,6.3,0.0,0.0,0.028413,,,,,,0.04381,0.026508
3,661,11/21/2018 16:00,0.833,0.076,123.88,124.175,yes,South,6.3,6.3,0.0,0.0,0.012063,,,,,0.04381,0.026508,0.028413
4,661,11/21/2018 16:15,1.105,0.064,123.633,124.226,yes,South,6.3,6.3,0.0,0.0,0.010159,,,,0.04381,0.026508,0.028413,0.012063
5,661,11/21/2018 16:30,0.869,0.048,123.641,124.142,yes,South,6.3,6.3,0.0,0.0,0.007619,,,0.04381,0.026508,0.028413,0.012063,0.010159
6,661,11/21/2018 16:45,1.324,-0.023,123.435,124.033,yes,South,6.3,6.3,0.0,0.0,-0.003651,,0.04381,0.026508,0.028413,0.012063,0.010159,0.007619
7,661,11/21/2018 17:00,0.993,-0.039,123.371,123.98,yes,South,6.3,6.3,0.0,0.0,-0.00619,0.04381,0.026508,0.028413,0.012063,0.010159,0.007619,-0.003651
8,661,11/21/2018 17:15,0.906,-0.022,123.161,123.682,yes,South,6.3,6.3,0.0,0.0,-0.003492,0.026508,0.028413,0.012063,0.010159,0.007619,-0.003651,-0.00619
9,661,11/21/2018 17:30,1.203,-0.023,122.831,123.563,yes,South,6.3,6.3,0.0,0.0,-0.003651,0.028413,0.012063,0.010159,0.007619,-0.003651,-0.00619,-0.003492


In [144]:
# merge weather and solar data
# IMPROVEMENT AREA: drops all solar info except 'percent output'
# Keep the house id, the timestamp, the target and the seven lag columns;
# the timestamp is parsed to datetime so it can be joined against the weather data.
merged = merged[[
    'dataid', 'local_15min', 'Percent Output',
    'onefourtyfive_ago', 'onethirty_ago', 'onefifteen_ago', 'one_ago',
    'fourtyfive_ago', 'thirty_ago', 'fifteen_ago',
]].assign(local_15min=lambda frame: pd.to_datetime(frame['local_15min']))

# Converting join columns to matching datetime data types
weatherdata = pd.read_csv('Complete Compiled Weather Data.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

In [122]:
# NON-MEMORY: merge weather and solar data
# IMPROVEMENT AREA: drops all solar info except 'percent output'
#
# This cell is an ALTERNATIVE to the memory variant of this step, not a follow-up:
# run unconditionally in notebook order it would strip the lag columns that the
# later lag-column to_numeric cell and the "Memory_" CSV export rely on.
# It therefore only drops them when explicitly asked to.
DROP_LAG_FEATURES = False   # set to True to build the non-memory training set instead
# NOTE(review): with DROP_LAG_FEATURES = True the later cell that converts the lag
# columns with pd.to_numeric must be skipped, and the export filename should be changed.

if DROP_LAG_FEATURES:
    merged = merged[['dataid','local_15min','Percent Output']]
weatherdata = pd.read_csv('Complete Compiled Weather Data.csv')

# Converting join columns to matching datetime data types
# (safe to repeat when local_15min has already been parsed)
merged['local_15min'] = pd.to_datetime(merged['local_15min'])
weatherdata['datetime'] = pd.to_datetime(weatherdata['datetime'])

In [145]:
# Attach the weather columns to every solar reading whose timestamp has an exact
# match in the weather data; readings without a matching weather timestamp are dropped.
trainingdata = pd.merge(
    merged,
    weatherdata,
    how="inner",
    left_on='local_15min',
    right_on='datetime',
)
trainingdata.head()

Unnamed: 0,dataid,local_15min,Percent Output,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,661,2018-11-21 16:00:00,0.012063,,,,,0.04381,0.026508,0.028413,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
1,1642,2018-11-21 16:00:00,0.015228,0.044898,0.0427,0.052747,0.024647,0.044741,0.030769,0.031083,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
2,2335,2018-11-21 16:00:00,0.013345,0.050338,0.047635,0.059291,0.025,0.050676,0.032264,0.033108,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
3,2818,2018-11-21 16:00:00,0.023562,0.057328,0.055102,0.065492,0.034323,0.057699,0.040631,0.041002,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
4,3039,2018-11-21 16:00:00,0.02432,0.05536,0.05136,0.06464,0.03392,0.05232,0.04272,0.03904,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."


In [146]:
# Preview of the joined frame. NOTE(review): repeats the display at the end of the previous cell.
trainingdata.head()

Unnamed: 0,dataid,local_15min,Percent Output,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,661,2018-11-21 16:00:00,0.012063,,,,,0.04381,0.026508,0.028413,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
1,1642,2018-11-21 16:00:00,0.015228,0.044898,0.0427,0.052747,0.024647,0.044741,0.030769,0.031083,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
2,2335,2018-11-21 16:00:00,0.013345,0.050338,0.047635,0.059291,0.025,0.050676,0.032264,0.033108,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
3,2818,2018-11-21 16:00:00,0.023562,0.057328,0.055102,0.065492,0.034323,0.057699,0.040631,0.041002,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."
4,3039,2018-11-21 16:00:00,0.02432,0.05536,0.05136,0.06464,0.03392,0.05232,0.04272,0.03904,...,1026.0,100.0,9.5,65.0,0.2,1.0,,Overcast,cloudy,"KATT,KAUS,72064800230,E4737,KEDC,72254013904,7..."


In [147]:
## IMPROVEMENT OPPORTUNITY: 'conditions' categorical variable is not used
# Identifier, duplicate-timestamp and descriptive text columns that are removed here.
unused_columns = ['name', 'datetime', 'dataid', 'stations', 'icon', 'conditions']
trainingdata = trainingdata.drop(columns=unused_columns)
trainingdata.columns.tolist()

['local_15min',
 'Percent Output',
 'onefourtyfive_ago',
 'onethirty_ago',
 'onefifteen_ago',
 'one_ago',
 'fourtyfive_ago',
 'thirty_ago',
 'fifteen_ago',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk']

In [161]:
# Column dtypes of the training frame.
# NOTE(review): the recorded output lists year/month/day/time, which are only created
# in cells further down -- it was captured out of order; re-run the notebook top to bottom.
trainingdata.dtypes

onefourtyfive_ago    float64
onethirty_ago        float64
onefifteen_ago       float64
one_ago              float64
fourtyfive_ago       float64
thirty_ago           float64
fifteen_ago          float64
temp                 float64
feelslike            float64
dew                  float64
humidity             float64
precip               float64
snow                   int64
snowdepth              int64
windgust             float64
windspeed            float64
winddir              float64
sealevelpressure     float64
cloudcover           float64
visibility           float64
solarradiation       float64
solarenergy          float64
uvindex              float64
year                   int64
month                  int64
day                    int64
time                 float64
Percent Output       float64
dtype: object

In [148]:
# turn datetime into 4 new columns: year, month, day, time
# 'time' is the hour of day as a decimal (e.g. 14:30 -> 14.5).
stamp = trainingdata['local_15min'].dt
trainingdata['year'] = stamp.year
trainingdata['month'] = stamp.month
trainingdata['day'] = stamp.day
trainingdata['time'] = stamp.hour + stamp.minute / 60

In [149]:
# reorder so the predicted feature is at the end
# pop() removes the column; assigning it back under the same name appends it as the last column.
trainingdata['Percent Output'] = trainingdata.pop('Percent Output')

trainingdata.dtypes

local_15min          datetime64[ns]
onefourtyfive_ago            object
onethirty_ago                object
onefifteen_ago               object
one_ago                      object
fourtyfive_ago               object
thirty_ago                   object
fifteen_ago                  object
temp                        float64
feelslike                   float64
dew                         float64
humidity                    float64
precip                      float64
precipprob                  float64
preciptype                   object
snow                          int64
snowdepth                     int64
windgust                    float64
windspeed                   float64
winddir                     float64
sealevelpressure            float64
cloudcover                  float64
visibility                  float64
solarradiation              float64
solarenergy                 float64
uvindex                     float64
severerisk                  float64
year                        

In [160]:
# Coerce the seven lag columns to a numeric dtype.
lag_columns = ['onefourtyfive_ago', 'onethirty_ago', 'onefifteen_ago', 'one_ago',
               'fourtyfive_ago', 'thirty_ago', 'fifteen_ago']
trainingdata[lag_columns] = trainingdata[lag_columns].apply(pd.to_numeric)

In [157]:
# Inspect the last rows of the training frame.
trainingdata.tail()

Unnamed: 0,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago,temp,feelslike,dew,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,year,month,day,time,Percent Output
127662,0.428829,0.526781,0.565274,0.672072,0.664701,0.663227,0.669124,73.9,73.9,60.2,...,6.2,9.8,302.0,1.1,3.0,2018,10,10,14.0,0.657658
127663,0.515714,0.547937,0.723492,0.733016,0.754286,0.754762,0.752857,73.9,73.9,60.2,...,6.2,9.8,302.0,1.1,3.0,2018,10,10,14.0,0.738095
127664,0.440039,0.531778,0.602721,0.739164,0.734694,0.727308,0.719145,73.9,73.9,60.2,...,6.2,9.8,302.0,1.1,3.0,2018,10,10,14.0,0.704956
127665,0.401736,0.539931,0.543056,0.518056,0.488889,0.477431,0.453819,73.9,73.9,60.2,...,6.2,9.8,302.0,1.1,3.0,2018,10,10,14.0,0.419792
127666,0.388,0.534,0.577333,0.618667,0.628444,0.635111,0.644444,73.9,73.9,60.2,...,6.2,9.8,302.0,1.1,3.0,2018,10,10,14.0,0.650444


In [151]:
# The timestamp has been expanded into year/month/day/time, so the raw column is removed.
trainingdata = trainingdata.drop(columns=['local_15min'])

In [163]:
# Summary statistics for the numeric columns of the training frame.
# NOTE(review): the execution count (163) is later than that of the cells below, so the
# recorded output was presumably captured after they ran -- confirm by re-running in order.
trainingdata.describe()

Unnamed: 0,onefourtyfive_ago,onethirty_ago,onefifteen_ago,one_ago,fourtyfive_ago,thirty_ago,fifteen_ago,temp,feelslike,dew,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,year,month,day,time,Percent Output
count,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,...,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0,24166.0
mean,0.303097,0.305465,0.309845,0.308735,0.309875,0.308469,0.306848,75.922689,76.757035,55.298548,...,48.097815,9.628395,357.351941,1.285604,3.547463,2018.0,6.190102,15.979061,14.229496,0.301592
std,0.234462,0.233799,0.234602,0.234489,0.235269,0.235064,0.237118,16.834525,19.631637,15.751385,...,38.207897,1.000527,274.014788,0.986949,2.7915,0.0,3.259789,8.792138,3.267704,0.238091
min,-0.008108,-0.007939,-0.008446,-0.008108,-0.00777,-0.010518,-0.008446,25.0,13.7,5.5,...,0.0,1.2,0.2,0.0,0.0,2018.0,1.0,1.0,6.0,-0.010518
25%,0.082722,0.083333,0.085528,0.086411,0.08734,0.083598,0.078576,65.8,65.8,44.2,...,11.3,9.9,110.0,0.4,1.0,2018.0,4.0,9.0,12.0,0.073925
50%,0.272966,0.281528,0.284848,0.277156,0.286486,0.287293,0.282787,78.0,78.0,62.3,...,36.2,9.9,307.0,1.1,3.0,2018.0,6.0,15.0,14.0,0.267486
75%,0.522621,0.519619,0.523833,0.526017,0.530172,0.52527,0.523111,89.7,94.2,67.2,...,92.1,9.9,582.0,2.1,6.0,2018.0,8.0,24.0,17.0,0.523391
max,0.837302,1.930444,0.832264,0.843651,0.836984,1.930444,0.880159,104.6,106.2,76.0,...,100.0,9.9,984.0,3.5,10.0,2018.0,12.0,31.0,21.0,0.843651


In [153]:
# Remove three weather columns before the row-wise dropna in the next step.
trainingdata = trainingdata.drop(columns=['precipprob', 'preciptype', 'severerisk'])

In [154]:
#IMPROVEMENT AREA: a lot of data is dropped here. This might be something from adding the new columns
# Keep only rows in which every remaining column has a value.
trainingdata = trainingdata.dropna(axis=0, how='any')

In [101]:
# Preview the first 20 rows of the cleaned training frame.
# NOTE(review): the recorded output still shows local_15min, which an earlier cell drops,
# so it comes from an older run (execution count 101) -- re-run to refresh it.
trainingdata.head(20)

Unnamed: 0,local_15min,temp,feelslike,dew,humidity,precip,snow,snowdepth,windgust,windspeed,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,year,month,day,time,Percent Output
8,2018-01-01 08:00:00,25.0,14.1,11.4,55.86,0.0,0,0,18.9,11.7,...,97.5,9.9,18.0,0.1,0.0,2018,1,1,8.0,0.01696
9,2018-01-01 11:00:00,26.5,15.9,11.7,53.11,0.0,0,0,19.7,11.9,...,94.3,9.9,266.0,1.0,3.0,2018,1,1,11.0,0.2528
49,2018-01-03 15:00:00,54.9,54.9,20.9,26.37,0.0,0,0,18.3,2.3,...,6.2,9.9,489.0,1.8,5.0,2018,1,3,15.0,0.50416
135,2018-01-07 10:00:00,58.1,58.1,53.9,85.91,0.0,0,0,17.2,7.8,...,43.3,9.9,175.0,0.6,2.0,2018,1,7,10.0,0.1536
136,2018-01-07 11:00:00,61.8,61.8,54.7,77.42,0.0,0,0,21.7,8.2,...,100.0,9.9,194.0,0.7,2.0,2018,1,7,11.0,0.148
137,2018-01-07 12:00:00,65.7,65.7,55.8,70.36,0.0,0,0,29.9,13.4,...,100.0,9.9,117.0,0.4,1.0,2018,1,7,12.0,0.07616
138,2018-01-07 13:00:00,66.8,66.8,56.5,69.54,0.0,0,0,25.7,13.6,...,100.0,9.9,137.0,0.5,1.0,2018,1,7,13.0,0.05632
139,2018-01-07 14:00:00,66.5,66.5,56.5,70.06,0.0,0,0,23.3,9.1,...,100.0,9.9,75.0,0.3,1.0,2018,1,7,14.0,0.04144
140,2018-01-07 15:00:00,66.4,66.4,56.8,71.11,0.0,0,0,17.8,8.5,...,100.0,9.9,58.0,0.2,1.0,2018,1,7,15.0,0.06048
172,2018-01-10 13:00:00,61.8,61.8,52.8,72.33,0.0,0,0,20.8,8.8,...,99.4,9.9,267.0,1.0,3.0,2018,1,10,13.0,0.30336


In [164]:
# Write the final training set to disk. index=False is not passed, so the frame's
# index is written as the first (unnamed) column of the CSV.
trainingdata.to_csv("Memory_SolarTrainingData.csv")