## Machine Learning Multivariate Analysis for Weather Forecast

In [1]:
%matplotlib inline

# import required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Apply seaborn's default plotting theme to matplotlib figures created after this point.
sns.set()
# Directory holding the input CSV files, relative to the notebook's working
# directory. Built with os.path.join so the separator matches the host OS
# (a hard-coded "..\\data\\" only resolves on Windows). The empty last
# component keeps the trailing separator, so code that appends a file name
# directly to DATA_PATH keeps working.
DATA_PATH = os.path.join("..", "data", "")

In [2]:
# suppress warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

In [3]:
# Collects one metrics record (a dict) per evaluated model.
test_ar = list()

In [4]:
# Accuracy metrics
def accuracy_metrics(forecast, actual):
    """Print and return MAPE, MAE and RMSE of a forecast against observed values.

    Parameters
    ----------
    forecast : array-like
        Predicted values (list, tuple, ndarray, or anything np.asarray accepts).
    actual : array-like
        Observed values; compared with ``forecast`` element by element, by
        position (NumPy broadcasting rules apply).

    Returns
    -------
    tuple
        ``(mape, mae, rmse)``, each averaged over all elements. MAPE is
        returned as a fraction (it is not multiplied by 100).

    Notes
    -----
    MAPE divides by ``|actual|``; any zero in ``actual`` makes it inf or nan.
    """
    # Coerce once so plain Python sequences work too (list - list raises TypeError).
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    error = forecast - actual

    mape = np.mean(np.abs(error) / np.abs(actual))  # MAPE
    mae = np.mean(np.abs(error))                    # MAE
    rmse = np.mean(error ** 2) ** .5                # RMSE
    print('Mean Absolute Percent Error ', mape, 'Mean Absolute Error', mae, 'Root Mean Square Error (rmse) ' , rmse)
    return mape, mae, rmse

### Machine Learning - Weather all Cities

In [5]:
# Read the 'weather_all.csv' dataset and index it by its timestamp column.
df = pd.read_csv(os.path.join(DATA_PATH, 'weather_all.csv'), low_memory=False)
# Parse the timestamps. `infer_datetime_format` is deprecated since pandas 2.0
# (consistent-format inference is now the default), so it is not passed.
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index('datetime')

Features = df.columns
print(Features)

print(df.shape)

# Scale every column to the range [0.1, 1.1]. The 0.1 lower bound keeps all
# scaled values strictly positive, so the |actual| denominator used by the
# MAPE in accuracy_metrics stays away from zero.
# The fitted scaler is kept (instead of being discarded) so scaled values can
# be mapped back to their original units later.
# NOTE(review): the scaler is fitted on the full dataset, including the rows
# that are later held out for testing -- consider fitting on training rows only.
scaler = MinMaxScaler(feature_range=(0.1, 1.1))
df_Scaled = scaler.fit_transform(df.values)

Index(['Vancouver.temperature', 'Vancouver.pressure', 'Vancouver.humidity',
       'Vancouver.wind_speed', 'Vancouver.wind_direction',
       'Portland.temperature', 'Portland.pressure', 'Portland.humidity',
       'Portland.wind_speed', 'Portland.wind_direction',
       ...
       'Tel Aviv District.latitude', 'Tel Aviv District.longitude',
       'Eilat.latitude', 'Eilat.longitude', 'Haifa.latitude',
       'Haifa.longitude', 'Nahariyya.latitude', 'Nahariyya.longitude',
       'Jerusalem.latitude', 'Jerusalem.longitude'],
      dtype='object', length=252)
(44460, 252)


In [6]:
lags_size = 6  # 6hrs
# The trailing 36 * 2 columns are treated as static inputs (judging by the
# column listing they are the per-city latitude/longitude pairs); every column
# before them is a series that is both lagged as input and predicted as output.
n_static = 36 * 2
n_outputs = df.shape[1] - n_static
# NOTE(review): df.shape[0] - lags_size complete windows are available; the
# extra "- 1" drops the last one. Kept as-is so existing shapes do not change.
num_samples = df.shape[0] - lags_size - 1

# Feature layout per sample i:
#   column j * lags_size + k      -> series j at row i + k (k = 0 is the oldest lag)
#   column n_outputs * lags_size + m -> static column m at row i
# Filled with one strided slice assignment per lag offset instead of a
# Python loop over every (sample, series, lag) cell.
a = np.empty((num_samples, lags_size * n_outputs + n_static))
for k in range(lags_size):
    a[:, k:n_outputs * lags_size:lags_size] = df_Scaled[k:k + num_samples, :n_outputs]
a[:, n_outputs * lags_size:] = df_Scaled[:num_samples, n_outputs:n_outputs + n_static]

# Target for sample i: every series at the row right after its lag window.
y_new = df_Scaled[lags_size:lags_size + num_samples, :n_outputs].copy()

X_new = pd.DataFrame(a)
print(X_new.shape)


# Chronological split: the last `test_size` samples form the test set and
# everything before them is used for training (this is not an 80/20 split).
test_size = 10    # 10 hrs for each city
training_size = X_new.shape[0] - test_size

x_train = X_new.iloc[:training_size]
x_test = X_new.iloc[training_size: training_size + test_size]
y_train = y_new[:training_size]
y_test = y_new[training_size:training_size + test_size]
print(x_test.shape)
print(x_train.shape)

(44453, 1152)
(10, 1152)
(44443, 1152)


### Random Forest

In [7]:
# Random forest fitted on the lagged features; y_train carries one column per
# target series, so a single model predicts all of them.
rf_params = dict(
    random_state=0,
    n_estimators=500,
    max_depth=None,
    max_features=20,
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=False,
)
model_rf = RandomForestRegressor(**rf_params).fit(x_train, y_train)
y_pred_rf = model_rf.predict(x_test)

# Score on the held-out rows and record the result for the final comparison.
mape, mae, rmse = accuracy_metrics(y_pred_rf, y_test)
test_ar.append(dict(label='RandomForestRegressor', mape=mape, mae=mae, rmse=rmse))

Mean Absolute Percent Error  0.3440666998513223 Mean Absolute Error 0.11797807630725914 Root Mean Square Error (rmse)  0.1746045568586359


### Linear Regression

In [8]:
# Ordinary least squares without an intercept term, fitted on the same
# lagged features and targets as the other models.
model_lr = LinearRegression(fit_intercept=False).fit(x_train, y_train)
y_pred_lr = model_lr.predict(x_test)

# Score on the held-out rows and record the result for the final comparison.
mape, mae, rmse = accuracy_metrics(y_pred_lr, y_test)
test_ar.append(dict(label='LinearRegression', mape=mape, mae=mae, rmse=rmse))

Mean Absolute Percent Error  0.20294193334454128 Mean Absolute Error 0.057657808165392815 Root Mean Square Error (rmse)  0.1178985613119973


#### Print Test results

In [9]:
# Show the metrics collected in test_ar (one dict per evaluated model).
print(test_ar)

[{'label': 'RandomForestRegressor', 'mape': 0.3440666998513223, 'mae': 0.11797807630725914, 'rmse': 0.1746045568586359}, {'label': 'LinearRegression', 'mape': 0.20294193334454128, 'mae': 0.057657808165392815, 'rmse': 0.1178985613119973}]
