In [184]:
import pandas as pd
import numpy as np
import elapid
from elapid import MaxentModel
import xarray as xr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
import elapid
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn import metrics



In [185]:
data_filepath = '/cluster/home/maikents/coral-mapping/processed_data/df_ready_for_training.parquet'
df = pd.read_parquet(data_filepath)

In [186]:
print(df.columns)

Index(['x', 'y', 'bottom_statistical_northness_features',
       'bottom_statistical_eastness_features', 'current_aspect_angle',
       'bottom_salinity_features_mean', 'bottom_current_features_mean',
       'bottom_temperature_features_10th_percentile', 'label',
       'aspect_cos_clipped', 'aspect_sin_clipped',
       'bathymetry_32N_Clip_sample_clipped', 'broad_BPI_std_clipped',
       'fine_BPI_std_clipped', 'slope_clipped'],
      dtype='object')


In [187]:
absence_df = df[df['label'] == 0]
downsampled_absences = absence_df.sample(n=2000, random_state=42)

presence_df = df[df['label'] == 1]

In [188]:
balanced_df = pd.concat([downsampled_absences, presence_df])
print(balanced_df.columns)

Index(['x', 'y', 'bottom_statistical_northness_features',
       'bottom_statistical_eastness_features', 'current_aspect_angle',
       'bottom_salinity_features_mean', 'bottom_current_features_mean',
       'bottom_temperature_features_10th_percentile', 'label',
       'aspect_cos_clipped', 'aspect_sin_clipped',
       'bathymetry_32N_Clip_sample_clipped', 'broad_BPI_std_clipped',
       'fine_BPI_std_clipped', 'slope_clipped'],
      dtype='object')


In [189]:
#Define features and labels
X = balanced_df.drop(columns=['label', 'x', 'y'])  #Drop label and coordinates
y = balanced_df['label']

In [190]:
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [191]:
#Initialize MaxEnt model
model = elapid.MaxentModel()


In [192]:
#Set up K fold cross validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)


In [225]:
scores = []
auc_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
balanced_accuracy_scores = []

# Loop through each cross-validation fold
for train_index, test_index in kf.split(X):
    X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
    y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

    # Initialize the MaxEnt model
    model = elapid.MaxentModel(transform='cloglog'#,
                               #beta_multiplier=4.0,     #adjusts regularization scale
                               #beta_hinge = 3.0,        #controls hinge regularization
                               #beta_threshold = 1.0    #applies to threshold regularization
    )

    # Fit the model on the training data
    model.fit(X_train_cv, y_train_cv)

    # Predict on the test data
    y_pred = model.predict(X_test_cv)
    y_test_pred_labels = (y_pred > 0.5).astype(int)  # Convert probabilities to binary labels

    

    # Evaluate the model using AUC score

    auc_elapid = metrics.roc_auc_score(y_test_cv, y_pred)
    auc_scores.append(auc_elapid)

    f1 = metrics.f1_score(y_test_cv, y_test_pred_labels)
    f1_scores.append(f1)
    
    precision = metrics.precision_score(y_test_cv, y_test_pred_labels)
    precision_scores.append(precision)    
    
    recall = metrics.recall_score(y_test_cv, y_test_pred_labels)
    recall_scores.append(recall)

    balanced_accuracy_score = metrics.balanced_accuracy_score(y_test_cv, y_test_pred_labels)
    balanced_accuracy_scores.append(balanced_accuracy_score)


    # Optionally, calculate accuracy
    #accuracy = accuracy_score(y_test_cv, y_pred)
    #scores.append(accuracy)

# Calculate the average AUC and accuracy across all folds
avg_auc = sum(auc_scores) / len(auc_scores)
avg_f1 = sum(f1_scores) / len(f1_scores)
avg_recall = sum(recall_scores) / len(recall_scores)
avg_precision = sum(precision_scores) / len(precision_scores)
avg_balanced_accuracy = sum(balanced_accuracy_scores) / len(balanced_accuracy_scores)




print(f"Average AUC: {avg_auc:.4f}")
print(f"Average F1: {avg_f1:.4f}")
print(f"Average Precison: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average balanced accuracy score: {avg_balanced_accuracy:.4f}")





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

Average AUC: 0.9590
Average F1: 0.8004
Average Precison: 0.8431
Average Recall: 0.7639
Average balanced accuracy score: 0.8562


In [198]:
# Train the model on the full training data
model = elapid.MaxentModel(transform='cloglog', beta_multiplier=2.0)
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test)
y_test_pred_labels = (y_test_pred > 0.5).astype(int)  # Convert probabilities to binary labels


# Evaluate using accuracy and AUC
auc_elapid = metrics.roc_auc_score(y_test, y_test_pred)
f1 = metrics.f1_score(y_test, y_test_pred_labels)
precision = metrics.precision_score(y_test, y_test_pred_labels)
recall = metrics.recall_score(y_test, y_test_pred_labels)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_test_pred_labels)


print(f"Test AUC score: {auc_elapid:0.3f}")
print(f"Test F1 score: {f1:0.3f}")
print(f"Test precision score: {precision:0.3f}")
print(f"Test recall score: {recall:0.3f}")
print(f"Balanced accuracy score: {balanced_accuracy:0.3f}")


Test AUC score: 0.957
Test F1 score: 0.772
Test precision score: 0.824
Test recall score: 0.725
Balanced accuracy score: 0.835


In [226]:
#Regularization tuning

model = elapid.MaxentModel(
    transform='cloglog',
    beta_multiplier=4.0,     #adjusts regularization scale
    beta_hinge = 3.0,        #controls hinge regularization
    beta_threshold = 1.0    #applies to threshold regularization
)
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test)
y_test_pred_labels = (y_test_pred > 0.5).astype(int)  # Convert probabilities to binary labels


# Evaluate using accuracy and AUC
auc_elapid = metrics.roc_auc_score(y_test, y_test_pred)
f1 = metrics.f1_score(y_test, y_test_pred_labels)
precision = metrics.precision_score(y_test, y_test_pred_labels)
recall = metrics.recall_score(y_test, y_test_pred_labels)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_test_pred_labels)


print(f"Test AUC score: {auc_elapid:0.3f}")
print(f"Test F1 score: {f1:0.3f}")
print(f"Test precision score: {precision:0.3f}")
print(f"Test recall score: {recall:0.3f}")
print(f"Balanced accuracy score: {balanced_accuracy:0.3f}")

Test AUC score: 0.958
Test F1 score: 0.779
Test precision score: 0.832
Test recall score: 0.732
Balanced accuracy score: 0.840
