In [1]:
import pandas as pd
import numpy as np
import elapid
from elapid import MaxentModel
import xarray as xr
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import joblib
import rasterio
from rasterio.transform import from_origin
from rasterio.enums import Resampling
from sklearn.model_selection import GroupKFold, GridSearchCV
import pickle
from sklearn.model_selection import train_test_split


In [6]:
# Load the pipeline6_v2 train/test features and labels from CSV.
# Order of the paths matches the unpacking order on the last line.
_v2_split_files = (
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/X_train2.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/X_test2.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/y_train2.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/y_test2.csv",
)
X_train, X_test, y_train, y_test = [pd.read_csv(csv_path) for csv_path in _v2_split_files]

In [2]:
# Full table used later for grid-wide prediction; the file name suggests it
# still contains NaN rows and a label column (not verified here).
combined_df_nans = pd.read_parquet(
    "/cluster/home/maikents/sinmod_features_masters/dataframes_pipeline_6/total_df_with_nans_and_labels.parquet"
)


In [3]:
# Pull the coordinate columns out of the full table as two separate Series.
x_coords, y_coords = combined_df_nans['x'], combined_df_nans['y']

In [3]:
# Predictor columns removed from the full table before prediction.
# NOTE: this cell rebinds `combined_df_nans`, so running it twice raises a
# KeyError because the columns are already gone.
_dropped_predictors = [
    'chlor_a_10th_percentile',
    'sediment_nitrate_mean',
    'sediment_nitrate_90th_percentile',
    'temperature_max',
    'salinity_10th_percentile',
    'salinity_90th_percentile',
    'current_speed_90th_percentile',
    'chlor_a_90th_percentile',
    'sediment_silicate_mean',
]
combined_df_nans = combined_df_nans.drop(columns=_dropped_predictors)

In [8]:
#Set up and run GridSearchCV for MaxEnt

model = elapid.MaxentModel()

# Regularization settings to search: 5 x 4 x 4 = 80 combinations.
param_grid = {
    'beta_multiplier': [0.5, 1.0, 2.0, 3.0, 4.0],
    'beta_hinge': [0.5, 1.0, 1.5, 2.0],
    'beta_threshold': [0.5, 1.0, 1.5, 2.0]
}

# ROC AUC computed from predict_proba output.
# `needs_proba=True` is deprecated since scikit-learn 1.4 and removed in 1.6;
# `response_method="predict_proba"` is the supported equivalent.
roc_scorer = make_scorer(roc_auc_score, response_method="predict_proba")

#Perform Grid Search CV
cv_model = GridSearchCV(
    model, param_grid,
    scoring={'ROC': roc_scorer},
    refit='ROC',  # refit the best 'ROC' configuration on the full training set
    cv=10,
    return_train_score=True
)
cv_model.fit(X_train, y_train)



KeyboardInterrupt: 

In [13]:
# Persist the whole GridSearchCV object (not just the best estimator) to disk.
v3_model_path = '/cluster/home/maikents/maxent_model_pipeline_6_v3.pkl'
joblib.dump(cv_model, v3_model_path)

['/cluster/home/maikents/maxent_model_pipeline_6_v3.pkl']

In [4]:
#If using spatial block cv:
# Replaces the previously loaded splits with the spatially blocked ones and
# additionally loads the per-row group tables used for grouped CV.
_spatial_split_files = (
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/X_train_spatial.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/X_test_spatial.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/y_train_spatial.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/y_test_spatial.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/groups_train_spatial.csv",
    "/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/groups_test_spatial.csv",
)
X_train, X_test, y_train, y_test, groups_train, groups_test = [
    pd.read_csv(csv_path) for csv_path in _spatial_split_files
]

In [6]:
model = elapid.MaxentModel()

# Grouped folds: rows sharing a group label never appear in both the training
# and validation part of the same split.
# NOTE(review): groups_train comes from pd.read_csv and is therefore a
# DataFrame; GroupKFold expects one group label per sample — confirm the CSV
# holds a single column.
gkf = GroupKFold(n_splits=6)
cv_splits = list(gkf.split(X_train, y_train, groups=groups_train))

# Regularization settings to search: 5 x 4 x 4 = 80 combinations.
param_grid = {
    'beta_multiplier': [0.5, 1.0, 2.0, 3.0, 4.0],
    'beta_hinge': [0.5, 1.0, 1.5, 2.0],
    'beta_threshold': [0.5, 1.0, 1.5, 2.0]
}

# ROC AUC computed from predict_proba output.
# `needs_proba=True` is deprecated since scikit-learn 1.4 and removed in 1.6;
# `response_method="predict_proba"` is the supported equivalent.
roc_scorer = make_scorer(roc_auc_score, response_method="predict_proba")

cv_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv_splits,  # precomputed (train, validation) index pairs from GroupKFold
    scoring={'ROC': roc_scorer},
    refit='ROC',
    return_train_score=True
)

# Fit on every column except x_bin/y_bin. Index.difference returns the names
# sorted, so anything predicting with this model must select `feature_cols`
# the same way to get matching columns and column order.
feature_cols = X_train.columns.difference(['x_bin', 'y_bin'])
X_train_clean = X_train[feature_cols]

cv_model.fit(X_train_clean, y_train)


Traceback (most recent call last):
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/utils/_response.py", line 214, in _get_response_values
    y_pred = _process_predict_proba(
             ^^^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/utils/_response.py", line 57, in _process_predict_proba
    col_idx = np.flatnonzero(class

In [7]:
# Persist the spatial-CV GridSearchCV object (not just the best estimator).
spatial_model_path = '/cluster/home/maikents/maxent_model_pipeline_6_spatial.pkl'
joblib.dump(cv_model, spatial_model_path)


['/cluster/home/maikents/maxent_model_pipeline_6_spatial.pkl']

In [5]:
#Next: reload the saved spatial-CV search, evaluate it, and predict over the full grid

In [8]:
# Reload the saved spatial-CV search object.
# joblib.load unpickles the file, so only load files you produced yourself.
saved_search_path = '/cluster/home/maikents/maxent_model_pipeline_6_spatial.pkl'
cv_model = joblib.load(saved_search_path)


In [11]:
#Predict probabilities on the training set
# Take the refitted estimator from the search here so the cell does not depend
# on `best_model` having been defined by a cell that runs later in the notebook.
best_model = cv_model.best_estimator_

# Score on the same column subset (and order) the search was fitted on:
# everything except the x_bin / y_bin columns.
train_feature_cols = X_train.columns.difference(['x_bin', 'y_bin'])
y_train_pred = best_model.predict_proba(X_train[train_feature_cols])[:, 1]

train_auc_score = roc_auc_score(y_train, y_train_pred)

print(f"Train AUC: {train_auc_score:.3f}")


Train AUC: 0.966


In [10]:
#Predict across full model area
# Take the refitted estimator from the search here so the cell does not depend
# on `best_model` having been defined by a cell that runs later in the notebook.
best_model = cv_model.best_estimator_

x_coords = combined_df_nans['x']
y_coords = combined_df_nans['y']
#10-fold
#features_only = combined_df_nans.drop(columns=['x', 'y', 'labels'], errors='ignore')
#X_grid_clean = features_only.dropna()


#spatial block
# Use exactly the columns the model was fitted on (all of X_train except
# x_bin / y_bin). No further columns are dropped afterwards: removing any
# would make the prediction input differ from the training input.
feature_cols = X_train.columns.difference(['x_bin', 'y_bin'])
X_grid_clean = combined_df_nans[feature_cols].dropna()
# Index labels of the rows that survived dropna(); used later to place the
# predictions back into the full-length array.
valid_idx = X_grid_clean.index


# Predict in slices of `chunk_size` rows instead of one call on the full grid.
chunk_size = 2000
predicted_probabilities_list = []

for start in range(0, len(X_grid_clean), chunk_size):
    chunk = X_grid_clean.iloc[start:start + chunk_size]
    # Column 1 of predict_proba is taken as the presence probability.
    probs = best_model.predict_proba(chunk)[:, 1]
    predicted_probabilities_list.append(probs)

# np.concatenate raises on an empty list, so handle a grid with no complete rows.
if predicted_probabilities_list:
    predicted_probs = np.concatenate(predicted_probabilities_list)
else:
    predicted_probs = np.empty(0)


In [12]:
#Evaluate the test performance
best_model = cv_model.best_estimator_

# Score on the same column subset (and order) the model was fitted on.
# The former `X_grid_clean = X_grid_clean[feature_cols]` line was removed: it
# is unrelated to test evaluation and its result was not used afterwards.
y_test_pred = best_model.predict_proba(X_test[feature_cols])[:, 1]

auc_score = roc_auc_score(y_test, y_test_pred)

print(f"Test AUC: {auc_score:.3f}")
print("Best regularization parameters found:", cv_model.best_params_)

Test AUC: 0.938
Best regularization parameters found: {'beta_hinge': 0.5, 'beta_multiplier': 0.5, 'beta_threshold': 0.5}


In [13]:
# One slot per row of the full table; rows without a prediction stay NaN.
all_probabilities = np.full((len(combined_df_nans),), np.nan)

# `valid_idx` holds index *labels*, while a NumPy array is indexed by position.
# Translate labels to row positions so this is also correct when the table
# does not have a default 0..n-1 index.
valid_positions = combined_df_nans.index.get_indexer(valid_idx)
assert (valid_positions >= 0).all(), "valid_idx contains labels missing from combined_df_nans"
assert len(valid_positions) == len(predicted_probs), "prediction count does not match valid rows"
all_probabilities[valid_positions] = predicted_probs

# Grid dimensions: number of distinct x and y coordinate values.
num_x = len(np.unique(x_coords))
num_y = len(np.unique(y_coords))

In [14]:
# np.lexsort treats the LAST key as the primary one: order rows by y first,
# then by x within each y.
_sort_keys = (x_coords, y_coords)
sorted_idx = np.lexsort(_sort_keys)
all_probs_sorted = np.take(all_probabilities, sorted_idx)


In [15]:
# Fold the y-then-x ordered vector into a (rows = y, columns = x) array.
# Raises ValueError unless the vector has exactly num_y * num_x entries.
grid_shape = (num_y, num_x)
predicted_grid = np.reshape(all_probs_sorted, grid_shape)


In [16]:
# Store the 2-D probability grid as a .npy file for later export.
grid_save_path = "/cluster/home/maikents/sinmod_features_masters/predicted_probabilities_pipeline6_spatial.npy"
np.save(grid_save_path, predicted_grid)


In [17]:
# Read the stored probability grid back from disk.
grid_load_path = "/cluster/home/maikents/sinmod_features_masters/predicted_probabilities_pipeline6_spatial.npy"
predicted_probabilities_grid = np.load(grid_load_path)

In [24]:
# Pixel size: n evenly spaced coordinates span (max - min) in n - 1 steps, so
# the spacing is (max - min) / (n - 1), not (max - min) / n.
n_rows, n_cols = predicted_probabilities_grid.shape
res_x = (x_coords.max() - x_coords.min()) / (n_cols - 1)
res_y = (y_coords.max() - y_coords.min()) / (n_rows - 1)

# from_origin takes the outer (west, north) corner of the upper-left pixel.
# Assumes x/y are cell-centre coordinates, hence the half-pixel offset — TODO confirm.
transform = from_origin(
    x_coords.min() - res_x / 2,
    y_coords.max() + res_y / 2,
    res_x,
    res_y,
)

#Save to GeoTIFF with correct CRS
# NOTE(review): if the grid rows are in ascending-y order (as produced by the
# lexsort/reshape cells), this file is upside-down relative to the north-up
# transform until the later vertical-flip step is applied.
with rasterio.open(
    "/cluster/home/maikents/sinmod_features_masters/coral_prediction_emodnet_pipeline_6_spatial_1.tif",
    "w",
    driver="GTiff",
    height=n_rows,
    width=n_cols,
    count=1,
    dtype="float32",
    crs="EPSG:25833",  #EMODnet UTM Zone 33N / ETRS89
    transform=transform,
) as dst:
    dst.write(predicted_probabilities_grid.astype("float32"), 1)

In [25]:
# Flip the first raster top-to-bottom and write it to a second file with the
# same metadata (size, dtype, CRS, transform).
flip_src_path = "/cluster/home/maikents/sinmod_features_masters/coral_prediction_emodnet_pipeline_6_spatial_1.tif"
flip_dst_path = '/cluster/home/maikents/sinmod_features_masters/coral_prediction_emodnet_pipeline_6_spatial_2.tif'

with rasterio.open(flip_src_path) as src:
    data = src.read(1)
    meta = src.meta.copy()

# Reverse the row order (first row becomes last).
flipped_data = np.flipud(data)

with rasterio.open(flip_dst_path, 'w', **meta) as dst:
    dst.write(flipped_data, 1)

# Report the path that was actually written (the old message named a
# non-existent 'flipped_vertical.tif').
print(f"Vertical flip complete. File saved as '{flip_dst_path}'.")


Vertical flip complete. File saved as 'flipped_vertical.tif'.


In [29]:
# Sentinel written in place of NaN and registered as the raster's NoData value.
nodata_value = -9999

with rasterio.open("/cluster/home/maikents/sinmod_features_masters/coral_prediction_emodnet_pipeline_6_spatial_2.tif") as src:
    profile = src.profile
    data = src.read(1)

#Replace NaNs with -9999
data_clean = data.copy()
data_clean[np.isnan(data_clean)] = nodata_value

#Update NoData value in metadata
profile.update(nodata=nodata_value)

with rasterio.open('/cluster/home/maikents/sinmod_features_masters/coral_prediction_emodnet_pipeline_6_spatial_3.tif', "w", **profile) as dst:
    dst.write(data_clean, 1)