In [None]:
# Import Libraries
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential

# Maybe don't need
from sklearn.model_selection import train_test_split

In [None]:
# Other imports
!pip install meteostat
from datetime import datetime
from meteostat import Hourly
from meteostat import Point
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import regularizers

In [None]:
# Testing Variables
# numStations caps how many rows of countyData are processed: both the weather
# loop and the outage loop iterate over range(numStations).
numStations = 1    # 1 = quick test. Original note: "change to len(fips) for full test"; no `fips` is defined in the visible cells (presumably len(countyData) — TODO confirm)

In [None]:
# Set time period for data imports
# Both bounds are pandas Timestamps; timeSeries is the 15-minute grid between
# them (inclusive) that later cells use as the common index.
startTime = pd.to_datetime(datetime(2018, 1, 1))
endTime = pd.to_datetime(datetime(2018, 12, 31, 23, 59))
timeSeries = pd.date_range(startTime, endTime, freq='15min')

In [None]:
# Pull data for centroid of US counties
tempData = pd.read_excel("us-county-boundaries.xlsx")

# Take an explicit copy so the edits below act on an independent frame instead
# of a slice of tempData (avoids SettingWithCopyWarning / lost writes).
countyData = tempData[['GEOID', 'Geo Point']].copy()
countyData = countyData.sort_values(by=['GEOID'])

# 'Geo Point' is split on the comma into two string columns; the weather cell
# converts them with float() when it builds each Point.
countyData[['lat', 'long']] = countyData['Geo Point'].str.split(',', expand=True)
countyData = countyData.drop(columns=['Geo Point'])
print(countyData.head(10))

In [None]:
# Fetch weather data in US in time range
# Columns returned by the hourly query that are not used as model features.
unusedColumns = ['dwpt', 'snow', 'wpgt', 'tsun', 'coco']

# Collect one frame per county and concatenate once after the loop, instead of
# re-concatenating the growing frame on every iteration.
stationFrames = []
for i in range(numStations):
  row = countyData.iloc[i]
  tempPoint = Point(float(row['lat']), float(row['long']))
  dataTemp = Hourly(tempPoint, startTime, endTime).fetch()
  if not dataTemp.empty:
    # errors='ignore': a response that lacks one of these columns must not abort the loop.
    dataTemp = dataTemp.drop(columns=unusedColumns, errors='ignore')

    # Rename Data to unique identifiers: prefix every remaining column with the
    # county GEOID (e.g. '<GEOID>temp') so columns from different counties never collide.
    dataTemp = dataTemp.add_prefix(str(row['GEOID']))

    # Begin Joining Data w/ unique column names
    if not stationFrames:
      print('reset the Dataframe')
    else:
      print('concat with dataframe')
    stationFrames.append(dataTemp)
  else:
    print('Empty dataTemp')
  # Add delay in fetching data to prevent query errors. Experimentation required for reasonable amount of sleep
  time.sleep(.25)

# Column-wise concat aligns the per-county frames on their time index.
# Falls back to an empty frame when no county returned data.
weatherData = pd.concat(stationFrames, axis=1) if stationFrames else pd.DataFrame()

# TODO: Store this data so it doesn't need to be loaded every time

# Print DataFrame
print(weatherData)

In [None]:
# Clean Data
# Resample onto a 15-minute grid, interpolate between the known samples,
# back-fill anything still missing at the start, then discard every column
# that still contains NaN.
weatherData15min = (
    weatherData
    .resample('15min').mean()
    .interpolate()
    .bfill()
    .dropna(axis=1)
)

# Ensure index matches necessary format, including the last 45 minutes.
#   The extra forward/back fill covers the first and last hour when the
#   fetched data does not include them.
#   TODO: add a check that we are not forward/back-filling most of a column.
weatherData15min = weatherData15min.reindex(timeSeries).ffill().bfill()

In [None]:
# Standardize the Weather Data set
# NOTE(review): the scaler is fit on the full series, including the rows a
# later cell holds out as the test split — confirm whether fitting on the
# training rows only is wanted.
scaler = StandardScaler().fit(weatherData15min)
# From here on weatherData15min holds the transformed values (the split cell
# calls .reshape on it, so it is used as an array rather than a DataFrame).
weatherData15min = scaler.transform(weatherData15min)

In [None]:
# Pull Outage Data into Program
# Only the record timestamp and the county FIPS code are kept from the CSV.
tempData = pd.read_csv("eaglei_outages_2018.csv")
outageData = tempData.loc[:, ['run_start_time', 'fips_code']]

In [None]:
# Fetching Outage Data
# Build one indicator column per county on the shared 15-minute timeSeries
# index: 1 where outageData has a row for that county at that timestamp,
# 0 everywhere else. Columns are named str(GEOID).
outageColumns = []
for i in range(numStations):
  # Get Data
  curFips = countyData.iloc[i]['GEOID']
  outageTimes = pd.to_datetime(
      outageData.loc[outageData['fips_code'] == curFips, 'run_start_time'],
      format='%Y-%m-%d %H:%M:%S')

  # De-duplicate the timestamps first: reindexing on a duplicated index raises,
  # and joining on one would multiply rows.
  flags = pd.Series(1, index=pd.DatetimeIndex(outageTimes.unique()), name=str(curFips))

  # Align on the common index; timestamps without a record become NaN for now.
  outageColumns.append(flags.reindex(timeSeries))

# Concatenate the per-county columns once. With numStations == 0 fall back to
# an empty frame on the same index so outageData2 is always defined.
if outageColumns:
  outageData2 = pd.concat(outageColumns, axis=1)
else:
  outageData2 = pd.DataFrame(index=timeSeries)

# Fill NaN with 0, to be readable by ML model
outageData2 = outageData2.fillna(0)

# Explore the Output
print(outageData2.head())
print(outageData2.shape)
outageData2.describe()

In [None]:
# Create training and testing sets
# shuffle=False keeps the rows in time order: the first 80% train, the last 20% test.
x = weatherData15min
y = outageData2
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)
inputShape = x_train.shape[1]
print(inputShape)
outputShape = y_train.shape[1]
print(outputShape)

# Reshape the input data to have correct shape for CNN:
# (samples, features) -> (samples, features, 1), i.e. one channel per feature.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
print(x_train.shape)

# Compute Class Weights
# Count the 0 and 1 labels across all target columns of the training split.
labelValues = y_train.to_numpy()
count = {0: int((labelValues == 0).sum()), 1: int((labelValues == 1).sum())}
print(count)

if count[0] == 0 or count[1] == 0:
  # Only one class is present (e.g. a county with no outages in the training
  # window). Dividing by the zero count would give an infinite weight, so
  # fall back to neutral weights instead.
  print('Warning: only one class present in y_train; using equal class weights')
  class_weights = {0: 1.0, 1: 1.0}
else:
  # The majority class gets weight 1; the minority class is up-weighted by the
  # majority/minority ratio.
  majorityCount = max(count[0], count[1])
  class_weights = {0: majorityCount / count[0], 1: majorityCount / count[1]}
print(class_weights)

# Convert y data into array
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [None]:
# Define callbacks
# Stop after 30 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
# Multiply the learning rate by 0.1 after 15 stagnant epochs, never below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=15, min_lr=1e-6)
# Monitor 'val_loss', the same logged metric the two callbacks above use; with
# save_best_only=True a monitor name that fit() never logs would mean no
# checkpoint is ever written.
# NOTE: this callback is defined here but not passed to fit() in the grid-search cell.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='/tmp/ckpt/checkpoint.model.keras', save_best_only=True, monitor='val_loss', mode='min')

In [None]:
def createModel(cnnLayersParam, denseLayersParam, cnnFilterParam, denseUnitsParam, dropoutParam):
  """Build an uncompiled Conv1D -> Flatten -> Dense Sequential model.

  Reads the notebook-level globals ``inputShape`` (input is ``(inputShape, 1)``)
  and ``outputShape`` (units of the sigmoid output layer).

  Args:
    cnnLayersParam: number of Conv1D + BatchNormalization pairs.
    denseLayersParam: number of dense layers *including* the output layer, so
      ``denseLayersParam - 1`` hidden Dense + Dropout pairs are added.
    cnnFilterParam: filters of the first Conv1D layer; layer ``i`` (0-based)
      uses ``cnnFilterParam * (i + 1)``.
    denseUnitsParam: units of the first hidden dense layer; layer ``i``
      (0-based) uses ``denseUnitsParam * (i + 1)``.
    dropoutParam: dropout rate applied after every hidden dense layer.

  Returns:
    The assembled ``tf.keras.models.Sequential`` model (not compiled).
  """
  modelTemp = tf.keras.models.Sequential()

  # Declare the input shape once, up front, rather than passing input_shape to
  # every Conv1D layer (only the first layer's value is meaningful).
  modelTemp.add(tf.keras.Input(shape=(inputShape, 1)))

  # Add CNN Layers
  for i in range(cnnLayersParam):
    modelTemp.add(tf.keras.layers.Conv1D(filters=cnnFilterParam*(i+1), kernel_size=3, padding='same', activation='relu'))
    modelTemp.add(tf.keras.layers.BatchNormalization())

  # Add Flatten Layer
  modelTemp.add(tf.keras.layers.Flatten())

  # Add Dense Layers (the output layer below counts as the last dense layer)
  for i in range(denseLayersParam-1):
    modelTemp.add(tf.keras.layers.Dense(units=denseUnitsParam*(i+1), activation='relu'))
    modelTemp.add(tf.keras.layers.Dropout(dropoutParam))

  # Add Output Layer: one sigmoid unit per target column
  modelTemp.add(tf.keras.layers.Dense(units=outputShape, activation='sigmoid'))

  return modelTemp

In [None]:
# Hyper-parameter grids for the search cell below. Each list is iterated there
# and one value from each is passed, in this order, to
# createModel(cnnLayersParam, denseLayersParam, cnnFilterParam, denseUnitsParam, dropoutParam).

# Define model params: Big test (disabled; 5*5*3*3*4 = 900 combinations)
#cnnLayers = [2,3,4,5,6]
#denseLayers = [2,3,4,5,6]
#cnnFilter = [16,32,64]
#denseUnits = [32,64,128]
#dropout = [0.2,0.3,0.4,0.5]

# Define model params: Small Test (2*2*2*1*2 = 16 combinations)
cnnLayers = [3,4]
denseLayers = [3,4]
cnnFilter = [16,32]
denseUnits = [32]
dropout = [0.2,0.4]

In [None]:
# Grid search: train one model per combination of the hyper-parameter lists.
# scores maps run index -> [cnnLayers, denseLayers, cnnFilter, denseUnits, dropout,
#                           val_accuracy, val_loss, accuracy, loss]
# where the four metrics are the values logged for the LAST epoch of the run.
# NOTE(review): with early stopping the last epoch is not necessarily the best
# one — confirm whether best-epoch metrics should be recorded instead.
scores = {}
count = 0
# itertools.product yields the combinations in the same order as nested loops
# over the lists (the right-most list varies fastest).
for nCnn, nDense, nFilters, nUnits, dropRate in itertools.product(cnnLayers, denseLayers, cnnFilter, denseUnits, dropout):
  # Release the previous model's global Keras state so memory does not grow
  # with every model created in this loop.
  tf.keras.backend.clear_session()

  model = createModel(nCnn, nDense, nFilters, nUnits, dropRate)
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  history = model.fit(x_train, y_train,
                      epochs=100,
                      batch_size=128,
                      validation_split = 0.2,
                      callbacks=[early_stopping, reduce_lr],
                      class_weight = class_weights,
                      shuffle=False,
                      verbose=0)
  lastEpoch = {name: values[-1] for name, values in history.history.items()}
  scores[count] = [nCnn, nDense, nFilters, nUnits, dropRate,
                   lastEpoch['val_accuracy'], lastEpoch['val_loss'],
                   lastEpoch['accuracy'], lastEpoch['loss']]
  count += 1
  print(f"Validation accuracy: {lastEpoch['val_accuracy']}    \tValidation loss: {lastEpoch['val_loss']}")

# Compare the recorded rows (dict VALUES); iterating the dict itself yields the
# integer run indices, which cannot be subscripted.
if scores:
  print(f"The parameters with the best validation accuracy is: {max(scores.values(), key = lambda s : s[5])[:]}")
  print(f"The parameters with the best validation loss is: {min(scores.values(), key = lambda s : s[6])[:]}")


In [None]:
# Re-print the best runs. scores maps run index -> result row, so compare the
# rows (scores.values()); index 5 is val_accuracy and index 6 is val_loss.
print(f"The parameters with the best validation accuracy is: {max(scores.values(), key = lambda s : s[5])[:]}")
print(f"The parameters with the best validation loss is: {min(scores.values(), key = lambda s : s[6])[:]}")

In [None]:
# Save the grid-search results: one row per run, indexed by run number.
# scores is a dict of run index -> 9-element list, so it must be read with
# orient='index' (each dict value becomes a row) and needs all nine column names.
df = pd.DataFrame.from_dict(
    scores,
    orient='index',
    columns=['cnnLayers', 'denseLayers', 'cnnFilter', 'denseUnits', 'dropout',
             'val_accuracy', 'val_loss', 'accuracy', 'loss'])
df.to_csv('scores.csv')