# Wind Machine Learning Model
This notebook creates a machine learning model 

In [1]:
# Initial Imports
from path import Path
import requests
import json

# Data manipulation
import pandas as pd
import numpy as np

# Database Connection
import config
import pymongo

# datetime
from datetime import datetime
from datetime import timedelta

# ML Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score

# don't show warnings
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

# Import Data from Database

In [2]:
# set string variables
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

#create connection to database
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    client.server_info()
    print("Mongodb connected")
except:
    print("The Mongodb failed to connect. Check username/password in connection string.")

Mongodb connected


In [3]:
# select database
db = client.get_database('wind_solar_data')
# select collection
collection = db.wind_data

# pull collection into dataframe
wind_df = pd.DataFrame(list(collection.find()))
wind_df

Unnamed: 0,_id,Date_Time,Year,Month,Day,Hour,MWH,MWH_perTurbine,Temperature_F,Humidity_percent,WindSpeed_mph,WindGust_mph,WindDirection_degrees,WindDirection_compass,Weather_Description
0,5f98662ac1c5e33be427ce93,2019-01-01 00:00:00,2019,1,1,0,5.009100,0.069571,35,73,12,24,126,SE,Clear
1,5f98662ac1c5e33be427ce94,2019-01-01 01:00:00,2019,1,1,1,110.487950,1.534555,35,74,13,23,89,E,Clear
2,5f98662ac1c5e33be427ce95,2019-01-01 02:00:00,2019,1,1,2,72.020225,1.000281,35,76,14,23,53,NE,Clear
3,5f98662ac1c5e33be427ce96,2019-01-01 03:00:00,2019,1,1,3,67.639475,0.939437,35,77,15,22,17,NNE,Clear
4,5f98662ac1c5e33be427ce97,2019-01-01 04:00:00,2019,1,1,4,63.718900,0.884985,35,77,14,21,18,NNE,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13866,5f98662ac1c5e33be42804bd,2020-07-31 19:00:00,2020,7,31,19,10.764125,0.149502,82,35,8,11,104,ESE,Patchy rain possible
13867,5f98662ac1c5e33be42804be,2020-07-31 20:00:00,2020,7,31,20,4.998600,0.069425,82,39,8,12,78,ENE,Patchy rain possible
13868,5f98662ac1c5e33be42804bf,2020-07-31 21:00:00,2020,7,31,21,16.390275,0.227643,82,43,7,13,52,NE,Patchy rain possible
13869,5f98662ac1c5e33be42804c0,2020-07-31 22:00:00,2020,7,31,22,20.637800,0.286636,82,47,7,13,55,NE,Patchy rain possible


### Drop Columns
The first cleaning is to drop the columns we dont't need. We'll be dropping the _id column because this is an artifact of the Mongodb storage and isn't a feature of the dataset. The time column will be dropped because there is not a linear relationship between time and wind power. The winddirection compas is dropped because this data is less granular than the winddirection degrees.

In [4]:
# Drop uneeded columns
wind_clean_df = wind_df.drop(['_id', "WindDirection_compass"], axis=1)

### Type Data

In [5]:
# Check for NaN values
wind_clean_df.isnull().sum()

Date_Time                0
Year                     0
Month                    0
Day                      0
Hour                     0
MWH                      0
MWH_perTurbine           0
Temperature_F            0
Humidity_percent         0
WindSpeed_mph            0
WindGust_mph             0
WindDirection_degrees    0
Weather_Description      0
dtype: int64

In [6]:
wind_clean_df.dtypes

Date_Time                 object
Year                       int64
Month                      int64
Day                        int64
Hour                       int64
MWH                      float64
MWH_perTurbine           float64
Temperature_F              int64
Humidity_percent           int64
WindSpeed_mph              int64
WindGust_mph               int64
WindDirection_degrees      int64
Weather_Description       object
dtype: object

In [7]:
# Drop any NaN values
wind_clean_df = wind_clean_df.dropna()

In [8]:
#wind_clean_df["WindSpeed_mph"] = wind_clean_df["WindSpeed_mph"].round(0).astype(int)
#wind_clean_df["WindDirection_degrees"] = wind_clean_df["WindDirection_degrees"].round(0).astype(int)
#wind_clean_df["WindGust_mph"] = wind_clean_df["WindGust_mph"].round(0).astype(int)
wind_clean_df['Date_Time'] = pd.to_datetime(wind_clean_df['Date_Time'])
#wind_clean_df["MWH"] = wind_clean_df["MWH"].round(0).astype(int)
wind_clean_df.dtypes

Date_Time                datetime64[ns]
Year                              int64
Month                             int64
Day                               int64
Hour                              int64
MWH                             float64
MWH_perTurbine                  float64
Temperature_F                     int64
Humidity_percent                  int64
WindSpeed_mph                     int64
WindGust_mph                      int64
WindDirection_degrees             int64
Weather_Description              object
dtype: object

# ML Models

## Multiple Linear Regression

Date time not supported in linear Regression.

### Split Data

In [9]:
# Define the features set.
#wind_clean_df = wind_clean_df.reset_index()
X = wind_clean_df.drop(["MWH", 'Year','Weather_Description', 'Date_Time', "MWH_perTurbine"], axis=1)
y = wind_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [10]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
from sklearn.linear_model import LinearRegression
# Train

regr = LinearRegression()
regr.fit(X_train_scaled,y_train)


LinearRegression()

In [12]:
# test
y_pred = regr.predict(X_test)
y_pred

array([769.03932463, 915.75265318, 531.20662696, ..., 796.86851729,
       532.66949697, 837.46556021])

In [13]:
accuracy = regr.score(X_test_scaled,y_test)
print(accuracy*100,'%')
print(f"R^2 Value:{regr.score(X_test_scaled,y_test)}")


39.27884124948253 %
R^2 Value:0.3927884124948253


## Neural Network

In [18]:
import tensorflow as tf
# Define the features set.
X = wind_clean_df.drop(["MWH", 'Year','Weather_Description', 'Date_Time', 'MWH_perTurbine'], axis=1)
y = wind_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
number_input_features = 8
hidden_nodes_layer1 =  number_input_features*3
hidden_nodes_layer2 =  number_input_features*3
hidden_nodes_layer3 =  number_input_features*2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2,activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Compile the Sequential model together and customize metrics
nn.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=20)

y_pred = nn.predict(X_test_scaled)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
results = pd.DataFrame()
results['test'] = y_test
results['pred'] = y_pred
results[:10]

Unnamed: 0,test,pred
0,54.55395,57.344769
1,12.98925,76.038986
2,12.85315,19.75844
3,4.598025,12.051669
4,31.905375,52.389866
5,59.23925,59.591732
6,29.0798,30.648155
7,108.195925,86.139885
8,9.414775,24.735331
9,48.5876,41.214073


## Random Forrest

In [None]:
# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)
Counter(y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)