In [1]:
import pandas as pd
import numpy as np
from utils import *
from df_utils import *
from training_utils import *
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# City whose entries are looked up in the project-wide city table.
city = 'Chicago'
city_info = get_city_info()

# Data-source paths for the selected city (all five are passed to load_all_dfs below).
noaa_path, om_path, wrh_path, aq_path, solar_path = (
    city_info[city][key] for key in ('noaa', 'om', 'wrh', 'aq', 'ss')
)

# Remaining per-city paths; scaler_features_path is later handed to load_scaler.
attn_lstm_path, lstm_path, scaler_features_path = (
    city_info[city][key] for key in ('attn_lstm', 'lstm', 'scaler')
)

# Load the data for the city.
# all_df is the main dataframe holding all the data combined; the daily_df* frames
# are subsets kept around in case they are needed later. predictor_final is simply
# the last row, extracted separately so that it does not get deleted by dropna.
daily_df, daily_df_2, daily_df_3, all_df, predictor_final = load_all_dfs(
    noaa_path,
    om_path,
    solar_path,
    wrh_path,
    aq_path,
)



In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# The target is defined once and reused, so the ignore list cannot drift from it.
target_column = 'next_day_max_temp'
columns_to_ignore = ['date', target_column]

data = all_df

# Target vector and feature matrix (everything except the date and the target).
y = data[target_column]
X = data.drop(columns=columns_to_ignore)

# Order-preserving 70/15/15 split (shuffle=False): the trailing 30% of the rows
# is held out first, then cut in half into validation and test sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, shuffle=False
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, shuffle=False
)

# Scale every split with the previously saved scaler (it is loaded, not re-fitted here).
scaler = load_scaler(scaler_features_path)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [3]:
# XGBoost
from xgboost import XGBRegressor

# Hyperparameters: up to 1000 trees at a small learning rate, with training
# stopped early after 15 rounds without improvement.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'n_jobs': 4,
    'early_stopping_rounds': 15,
    'min_child_weight': 1,
}
model = XGBRegressor(**xgb_params)

# Both splits are scored every round (validation_0 = train, validation_1 = validation).
# NOTE: per the XGBoost docs the last eval_set entry drives early stopping,
# so the validation split must stay last.
eval_sets = [(X_train_scaled, y_train), (X_val_scaled, y_val)]
model.fit(X_train_scaled, y_train, eval_set=eval_sets)


[0]	validation_0-rmse:21.29946	validation_1-rmse:20.91279
[1]	validation_0-rmse:21.11211	validation_1-rmse:20.72916
[2]	validation_0-rmse:20.92679	validation_1-rmse:20.54777
[3]	validation_0-rmse:20.74360	validation_1-rmse:20.36835
[4]	validation_0-rmse:20.56235	validation_1-rmse:20.19072
[5]	validation_0-rmse:20.38290	validation_1-rmse:20.01513
[6]	validation_0-rmse:20.20550	validation_1-rmse:19.84080
[7]	validation_0-rmse:20.03021	validation_1-rmse:19.66986
[8]	validation_0-rmse:19.85672	validation_1-rmse:19.49953
[9]	validation_0-rmse:19.68516	validation_1-rmse:19.33145
[10]	validation_0-rmse:19.51546	validation_1-rmse:19.16515
[11]	validation_0-rmse:19.34770	validation_1-rmse:19.00056
[12]	validation_0-rmse:19.18174	validation_1-rmse:18.83817
[13]	validation_0-rmse:19.01772	validation_1-rmse:18.67724
[14]	validation_0-rmse:18.85539	validation_1-rmse:18.51865
[15]	validation_0-rmse:18.69480	validation_1-rmse:18.36171
[16]	validation_0-rmse:18.53613	validation_1-rmse:18.20723
[17]	va

In [4]:
# import explained variance score
from sklearn.metrics import explained_variance_score
import plotly.graph_objs as go

# Evaluate the fitted model on the held-out test split and plot predictions
# against actual values with an uncertainty band.

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Explained variance is NOT the same statistic as R-squared: it ignores any
# systematic bias in the residuals, so it is reported under its own name.
print("Explained Variance on Test Data:", explained_variance_score(y_test, y_pred))

# Print mean squared error on the test data
mse = np.mean((y_pred - y_test)**2)
print("Mean Squared Error (MSE) on Test Data:", mse)

mae = np.mean(np.abs(y_pred - y_test))
print("Mean Absolute Error (MAE) on Test Data:", mae)

# Standard deviation of the residuals (prediction minus actual).
std_error = np.std(y_pred - y_test)
print("Standard Error on Test Data:", std_error)

# Half-width of an approximate 95% band around each individual prediction.
# This is a prediction interval (it describes where actual values fall), not a
# confidence interval for a mean; it assumes roughly normal, zero-mean residuals.
# The variable name is kept as-is in case later cells refer to it.
confidence_interval = 1.96 * std_error
print("95% Prediction Interval Half-Width on Test Data:", confidence_interval)

# Create upper and lower bounds for uncertainty shading
upper_bound = y_pred + confidence_interval
lower_bound = y_pred - confidence_interval

# Plot the predicted vs. actual values
# Create traces for actual and predicted values
trace_actual = go.Scatter(
    x=np.arange(len(y_test)),
    y=y_test,
    mode='lines',
    name='Actual'
)

trace_predicted = go.Scatter(
    x=np.arange(len(y_pred)),
    y=y_pred,
    mode='lines',
    name='Predicted'
)

# Create traces for the uncertainty shading.
# The upper bound is drawn as an invisible line (width 0); the lower bound then
# fills up to it via fill='tonexty', so it must directly follow the upper bound
# in the figure's trace list.
trace_upper_bound = go.Scatter(
    x=np.arange(len(y_pred)),
    y=upper_bound,
    mode='lines',
    marker=dict(color="#444"),
    line=dict(width=0),
    showlegend=False
)

trace_lower_bound = go.Scatter(
    x=np.arange(len(y_pred)),
    y=lower_bound,
    mode='lines',
    marker=dict(color="#444"),
    line=dict(width=0),
    fillcolor='rgba(68, 68, 68, 0.3)',
    fill='tonexty',
    showlegend=False
)

# Create the layout for the figure
layout = go.Layout(
    title='Predicted vs. Actual Values with 95% Prediction Interval',
    xaxis=dict(title='Data Points'),
    yaxis=dict(title='Values')
)

# Create the figure
fig = go.Figure(data=[trace_actual, trace_predicted, trace_upper_bound, trace_lower_bound], layout=layout)

# Display the figure
fig.show()

Explained Variance (R-squared) on Test Data: 0.8943732730417925
Mean Squared Error (MSE) on Test Data: 46.70410537739629
Mean Absolute Error (MAE) on Test Data: 5.281347216489479
Standard Error on Test Data: 6.821178087520376
95% Confidence Interval on Test Data: 13.369509051539936


In [5]:
# Save the trained model, creating the output directory first so the cell also
# works on a fresh checkout where ./models does not exist yet.
import os
import joblib

model_path = f'./models/{city}_xgb.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['./models/Chicago_xgb.pkl']