In [2]:
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import rioxarray as rio
from sklearn import linear_model
import statsmodels.api as ssm #for detail description of linear coefficients, intercepts, deviations, and many more
import xarray as xr

In [3]:

# --- Water-quality inputs (EPD monitoring data) ---
# Full record; any row that contains a missing value is discarded.
water_quality_fp = "../data/epd_water_quality/prc/epd_water_quality_1986_2022.csv"
water_quality_df = pd.read_csv(water_quality_fp).dropna()

# Summer-mean table; this is the frame the regression cells below read from.
summer_mean = pd.read_csv("../data/epd_water_quality/prc/summer_mean_1986_2022.csv")

# Measurement column names: every quantity at the surface first, then at the bottom.
# NOTE(review): `vars` shadows the Python builtin of the same name; the name is
# kept because other cells may reference it.
vars = [
    f"{quantity}_{layer}"
    for layer in ("surf", "bott")
    for quantity in (
        "chla", "diss_o", "ph", "salinity",
        "turbidity", "temp", "suspended_solids", "nitrates",
    )
]

# --- EPD station metadata ---
epd_stations_fp = "../data/epd_water_quality/prc/epd_stations.csv"
epd_stations = pd.read_csv(epd_stations_fp)

## Linear Regression

##### Check for Collinearity with VIF

In [4]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrices for the candidate model: temp_bott is the response variable.
# The resulting X also carries an Intercept column (see the printed table).
y, X = dmatrices(
    'temp_bott ~ temp_surf+depth_m+chla_surf+ph_surf',
    data=summer_mean,
    return_type='dataframe',
)

# One variance inflation factor per design-matrix column, in column order.
design = X.values
vif_df = pd.DataFrame({
    'variable': X.columns,
    'VIF': [variance_inflation_factor(design, col) for col in range(design.shape[1])],
})

# Plain-text dump of the VIF table
print(vif_df)


    variable          VIF
0  Intercept  7382.533872
1  temp_surf     1.932516
2    depth_m     1.753399
3  chla_surf     1.500366
4    ph_surf     1.375298


### Ordinary Least Squares Model

In [19]:
# Wider predictor set that was considered: ["chla_surf","depth_m","temp_surf","ph_surf"];
# only depth and surface temperature are used in this fit.
X = summer_mean[["depth_m", "temp_surf"]]
y = summer_mean["temp_bott"]

# Ordered (not shuffled) split: the first 42 rows train, the remainder is held out.
# add_constant prepends the intercept column to each predictor subset.
X_train, X_test = (ssm.add_constant(part) for part in (X[:42], X[42:]))
y_train, y_test = y[:42], y[42:]

# Fit on the training rows only, then display the regression summary.
model = ssm.OLS(y_train, X_train).fit()
predictions = model.summary()  # NOTE: this is the summary table, not model predictions
predictions

0,1,2,3
Dep. Variable:,temp_bott,R-squared:,0.833
Model:,OLS,Adj. R-squared:,0.825
Method:,Least Squares,F-statistic:,97.41
Date:,"Tue, 17 Sep 2024",Prob (F-statistic):,6.8e-16
Time:,09:37:09,Log-Likelihood:,-35.925
No. Observations:,42,AIC:,77.85
Df Residuals:,39,BIC:,83.06
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,22.4863,4.215,5.335,0.000,13.960,31.012
depth_m,-0.1560,0.013,-12.447,0.000,-0.181,-0.131
temp_surf,0.1911,0.146,1.307,0.199,-0.105,0.487

0,1,2,3
Omnibus:,18.15,Durbin-Watson:,1.328
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.747
Skew:,1.376,Prob(JB):,6.97e-06
Kurtosis:,5.449,Cond. No.,1490.0


# Spatial Regression

Apply the fitted regressor to the spatial raster datasets to predict bottom temperature for every raster cell.

In [24]:
# Refit on every row of the summer-mean table (no hold-out); the raster
# prediction cell further down calls `model.predict` on this fit.
y = summer_mean["temp_bott"]
X = ssm.add_constant(summer_mean[["depth_m", "temp_surf"]])  # prepend the intercept column

model = ssm.OLS(y, X).fit()
predictions = model.summary()  # summary table, despite the variable name
predictions

0,1,2,3
Dep. Variable:,temp_bott,R-squared:,0.845
Model:,OLS,Adj. R-squared:,0.839
Method:,Least Squares,F-statistic:,146.9
Date:,"Tue, 17 Sep 2024",Prob (F-statistic):,1.45e-22
Time:,09:38:44,Log-Likelihood:,-48.608
No. Observations:,57,AIC:,103.2
Df Residuals:,54,BIC:,109.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,19.9228,3.657,5.448,0.000,12.591,27.254
depth_m,-0.1647,0.011,-15.043,0.000,-0.187,-0.143
temp_surf,0.2882,0.127,2.271,0.027,0.034,0.543

0,1,2,3
Omnibus:,18.272,Durbin-Watson:,1.579
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.773
Skew:,1.188,Prob(JB):,4.17e-06
Kurtosis:,5.188,Cond. No.,1490.0


In [25]:

# Prepare raster datasets.
# Both inputs are addressed relative to the notebook, like every other data path
# used here, so the cell does not depend on one user's machine.
# NOTE(review): assumes the notebook sits one level below the project root
# (next to `data/`), as the other `../data/...` paths imply.
sst_fp = "../data/sea_surface_temperature/sst_summer_mean_2002_2020.tif"
depth_fp = "../data/bathymetry/gebco_2024_n23.2842_s21.6651_w112.5659_e114.5956.tif"

# Sea-surface temperature raster, tagged as EPSG:4326.
sst_summer_mean = rio.open_rasterio(sst_fp).rio.write_crs(4326)

# Bathymetry: negate the `elevation` variable so that depth is the positive
# quantity, then mask every cell whose depth is not strictly positive.
depth = xr.open_dataset(depth_fp).rio.write_crs(4326)
depth = depth.elevation * -1
depth = depth.where(depth > 0).rio.write_crs(4326)

# Resample the SST raster onto the depth raster's grid, then mask
# non-positive SST values.
sst_summer_mean = sst_summer_mean.rio.reproject_match(depth)
sst_summer_mean = sst_summer_mean.where(sst_summer_mean > 0)


In [26]:
# Predict bottom temperature for every raster cell with the fitted OLS model.
# The whole grid goes through a single `model.predict` call rather than one
# call per pixel; masked (NaN) cells simply propagate NaN into the prediction.
depth_vals = depth.values.flatten()
ss_vals = sst_summer_mean.values.flatten()

# Cells are paired by flat index, one SST value per depth cell. Fail loudly if
# the SST grid is too small; surplus SST values are ignored.
n_cells = depth_vals.size
if ss_vals.size < n_cells:
    raise ValueError(
        f"SST raster has {ss_vals.size} cells but the depth raster has {n_cells}"
    )

# One row per cell: [depth, surface temperature] as float64.
# (Now an (n, 2) ndarray; X_predict[i][0] / X_predict[i][1] index as before.)
X_predict = np.column_stack([depth_vals, ss_vals[:n_cells]]).astype(float)

# Design matrix in the column order used when fitting: [const, depth_m, temp_surf].
exog = np.column_stack([np.ones(n_cells), X_predict])
y_predict = np.asarray(model.predict(exog), dtype=float).reshape(depth.shape)
y_predict_ds = xr.DataArray(data=y_predict, coords=depth.coords).rio.write_crs(4326)

In [48]:
from sklearn.metrics import (
    max_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    root_mean_squared_error,
)

# Loss metrics against the EPD data: for each row of the summer-mean table,
# look up the predicted raster value nearest to its longitude/latitude, then
# discard rows where that lookup yields NaN.
bott_temp_validation = summer_mean.assign(
    pred_temp_bott=summer_mean[['latitude', 'longitude']].apply(
        lambda st: float(y_predict_ds.sel(x=st.longitude, y=st.latitude, method="nearest")),
        axis=1,
    )
).dropna(subset=["pred_temp_bott"])

# Observed vs. raster-predicted bottom temperature.
y_true = bott_temp_validation.temp_bott
y_pred = bott_temp_validation.pred_temp_bott

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = root_mean_squared_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
me = max_error(y_true, y_pred)

# One metric per output line.
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"Mean Absolute Percentage Error: {mape}",
    f"Max Error: {me}",
    sep="\n",
)

Mean Absolute Error: 0.6330958350129284
Mean Squared Error: 0.9470119830329263
Root Mean Squared Error: 0.9731454069320403
Mean Absolute Percentage Error: 0.02478054812679395
Max Error: 4.990657935492745


In [46]:
# Write the prediction and both model inputs out as GeoTIFFs. The prediction
# and the SST raster are restricted to cells with strictly positive depth.
positive_depth = depth > 0

y_predict_ds.where(positive_depth).rio.to_raster(
    "../data/bottom_sst_predictions/predicted_bottom_temp.tif"
)
depth.rio.to_raster("../data/bottom_sst_predictions/depth.tif")
sst_summer_mean.where(positive_depth).rio.to_raster(
    "../data/bottom_sst_predictions/sst_summer_mean.tif"
)

## Spatial Regression