In [3]:
import pandas as pd
import numpy as np
import elapid
from elapid import MaxentModel
import xarray as xr
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import joblib

In [2]:
X_train = pd.read_csv("/cluster/home/maikents/sinmod_features_masters/ready_for_training/X_train.csv")
X_test = pd.read_csv("/cluster/home/maikents/sinmod_features_masters/ready_for_training/X_test.csv")
y_train = pd.read_csv("/cluster/home/maikents/sinmod_features_masters/ready_for_training/y_train.csv")
y_test = pd.read_csv("/cluster/home/maikents/sinmod_features_masters/ready_for_training/y_test.csv")

In [2]:
combined_df_nans = pd.read_parquet("/cluster/home/maikents/sinmod_features_masters/dataframes_pipeline_1/total_df_with_nans_and_labels.parquet")


NB!!! For de andre pipelinene: spatial block cv!!! ikke 10-fold 

In [5]:
#Set up and run GridSearchCV for MaxEnt

model = elapid.MaxentModel()
param_grid = {
    'beta_multiplier': [0.5, 1.0, 2.0, 3.0, 4.0],
    'beta_hinge': [0.5, 1.0, 1.5, 2.0],
    'beta_threshold': [0.5, 1.0, 1.5, 2.0]
}

#Perform Grid Search CV
cv_model = GridSearchCV(
    model, param_grid, 
    scoring={
        'ROC': make_scorer(roc_auc_score, needs_proba=True)
    }, 
    refit='ROC', 
    cv=10,
    return_train_score=True
)
cv_model.fit(X_train, y_train)

Traceback (most recent call last):
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/utils/_response.py", line 214, in _get_response_values
    y_pred = _process_predict_proba(
             ^^^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/maikents/PyEnvCoralMapping/lib/python3.12/site-packages/sklearn/utils/_response.py", line 57, in _process_predict_proba
    col_idx = np.flatnonzero(class

In [7]:
joblib.dump(cv_model, '/cluster/home/maikents/maxent_model_pipeline_1.pkl')

['/cluster/home/maikents/maxent_model_pipeline_1.pkl']

In [4]:
cv_model = joblib.load('/cluster/home/maikents/maxent_model_pipeline_1.pkl')

In [8]:
#Evaluate the test performance
best_model = cv_model.best_estimator_

y_test_pred = best_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_test_pred)

print(f"Test AUC: {auc_score:.3f}")
print("Best regularization parameters found:", cv_model.best_params_)

Test AUC: 0.912
Best regularization parameters found: {'beta_hinge': 0.5, 'beta_multiplier': 0.5, 'beta_threshold': 0.5}


In [9]:
#Predict probabilities on the training set
y_train_pred = best_model.predict_proba(X_train)[:, 1]

train_auc_score = roc_auc_score(y_train, y_train_pred)

print(f"Train AUC: {train_auc_score:.3f}")

Train AUC: 0.946


In [10]:
#Predict across full model area 
x_coords = combined_df_nans['x']
y_coords = combined_df_nans['y']
features_only = combined_df_nans.drop(columns=['x', 'y', 'labels'], errors='ignore')
X_grid_clean = features_only.dropna()
valid_idx = X_grid_clean.index

chunk_size = 2000
num_chunks = int(np.ceil(len(X_grid_clean) / chunk_size))
predicted_probabilities_list = []

for i in range(num_chunks):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, len(X_grid_clean))
    chunk = X_grid_clean.iloc[start:end]
    probs = best_model.predict_proba(chunk)[:, 1]
    predicted_probabilities_list.append(probs)

predicted_probs = np.concatenate(predicted_probabilities_list)


In [6]:
features_only = combined_df_nans.drop(columns=['x', 'y', 'labels'], errors='ignore')

In [None]:
valid_mask = features_only.notna().all(axis=1)

predictions = cv_model.predict_proba(features_only[valid_mask])[:, 1]  
full_predictions = np.full(len(features_only), np.nan)
full_predictions[valid_mask] = predictions


In [11]:
all_probabilities = np.full((len(combined_df_nans),), np.nan)

all_probabilities[valid_idx] = predicted_probs

num_x = len(np.unique(x_coords))
num_y = len(np.unique(y_coords))

In [12]:
sorted_idx = np.lexsort((x_coords, y_coords))  
all_probs_sorted = all_probabilities[sorted_idx]

In [13]:
predicted_grid = all_probs_sorted.reshape((num_y, num_x))

In [None]:
import matplotlib.pyplot as plt

x_sorted = np.sort(np.unique(x_coords))
y_sorted = np.sort(np.unique(y_coords))

# Plot
plt.figure(figsize=(10, 8))
plt.imshow(predicted_grid, cmap='Blues', origin='lower', extent=[x_sorted[0], x_sorted[-1], y_sorted[0], y_sorted[-1]])
plt.colorbar(label='Predicted Probability')
plt.title('Predicted Probability of Presence')
plt.xlabel('X')
plt.ylabel('Y')
plt.tight_layout()
plt.show()

In [25]:
np.save("/cluster/home/maikents/sinmod_features_masters/predicted_probabilities_pipeline1.npy", predicted_grid)


In [2]:
coral_filepath_emod = '/cluster/home/maikents/features_midnor_2019/coral_data/midnor_coral_data_emod_grid.parquet'
coral_data_emod = pd.read_parquet(coral_filepath_emod)

In [3]:
coral_x = coral_data_emod['x']
coral_y = coral_data_emod['y']

In [None]:
combined_df = pd.DataFrame({
    'x': combined_df_nans['x'],
    'y': combined_df_nans['y'],
    'predicted_probability': all_probs_sorted
})

In [None]:
combined_df_not_sorted = pd.DataFrame({
    'x': combined_df_nans['x'],
    'y': combined_df_nans['y'],
    'predicted_probability': all_probabilities.reshape((num_y, num_x))
})

In [48]:
combined_df.to_parquet('/cluster/home/maikents/sinmod_features_masters/predicted_probabilities_with_coordinates.parquet', index=False)


In [None]:
combined_df_not_sorted.to_parquet('/cluster/home/maikents/sinmod_features_masters/predicted_probabilities_with_coordinates_not_sorted.parquet', index=False)


In [4]:
combined_df = pd.read_parquet('/cluster/home/maikents/sinmod_features_masters/predicted_probabilities_with_coordinates.parquet')

In [None]:
def jackknife_test(model, X_train, y_train, X_test, y_test):
    n_features = X_train.shape[1]
    
    auc_scores_individual_features = {}
    auc_scores_leave_one_out = {}

    for i in range(n_features):
        X_train_single = X_train.iloc[:, i].values.reshape(-1, 1)
        X_test_single = X_test.iloc[:, i].values.reshape(-1, 1)

        model.fit(X_train_single, y_train)

        y_pred_single = model.predict(X_test_single)
        auc_single = roc_auc_score(y_test, y_pred_single)
        auc_scores_individual_features[X_train.columns[i]] = auc_single
    
    for i in range(n_features):
        X_train_loo = X_train.drop(X_train.columns[i], axis=1)
        X_test_loo = X_test.drop(X_test.columns[i], axis=1)

        model.fit(X_train_loo, y_train)

        y_pred_loo = model.predict(X_test_loo)
        auc_loo = roc_auc_score(y_test, y_pred_loo)
        auc_scores_leave_one_out[X_train.columns[i]] = auc_loo
    
    model.fit(X_train, y_train)
    y_pred_full = model.predict(X_test)
    full_auc = roc_auc_score(y_test, y_pred_full)

    return full_auc, auc_scores_individual_features, auc_scores_leave_one_out


full_auc, auc_scores_individual_features, auc_scores_leave_one_out = jackknife_test(best_model, X_train, y_train, X_test, y_test)

print(f"Full AUC score = {full_auc:.3f}")


for feature, auc in auc_scores_leave_one_out.items():
    print(f"{feature} dropped: AUC = {auc:.3f}")


for feature, auc in auc_scores_individual_features.items():
    print(f"{feature} only: AUC = {auc:.3f}")