In [1]:
import os
import pandas as pd
import warnings

import matplotlib.pyplot as plt



from numpy import mean
from numpy import std

warnings.simplefilter(action='ignore', category=FutureWarning)
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold ### delete this
from sklearn.model_selection import StratifiedKFold #### add this
from sklearn.model_selection import GridSearchCV
from pkg_resources import resource_filename
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score, confusion_matrix #RDCOMM adeed roc_auc_score and confusion_matrix

from xgboost import XGBClassifier
from xgboost import plot_importance

### RDCOMM added
import pickle
import seaborn as sns


In [2]:
# Location of the seed-measurement CSV, relative to the notebook's working directory.
# (A single-argument os.path.join returns its argument unchanged, so a literal is equivalent.)
seed_data_path = 'data/ml_paper_seeds_all.csv'



In [3]:
## Create output folders, then tune, save and evaluate one XGBoost classifier
## for every (species, feature set) combination.

# Feature sets. In each list the LAST entry ("Bin_germ") is the target column;
# every entry before it is used as a predictor (see the X / y split below).
names_all_features= ["Area", "Perim.", "Feret", "MinFeret", "Circ.", "AR", "Round", "Solidity", "Mean_L", "Mean_a","Mean_b", "Mean_grey", "Mean_core_grey","Dissimilarity","Contrast","Homogeneity","Energy","Correlation","ASM","Bin_germ"]
names_colour_features= ["Area", "Perim.", "Feret", "MinFeret", "Circ.", "AR", "Round", "Solidity", "Mean_L", "Mean_a","Mean_b","Dissimilarity","Contrast","Homogeneity","Energy","Correlation","ASM", "Bin_germ"]
names_xray_features= ["Mean_grey", "Mean_core_grey", "Bin_germ"]


species = ["Alnus_glutinosa","Betula_pendula","Betula_pubescens","Pinus_sylvestris","Sorbus_aucuparia","all_species"]
traits = [names_all_features,names_colour_features,names_xray_features]

# Sub-folder name for each entry of `traits`, in the same order. Spelled out
# explicitly instead of being recovered by scanning globals() for the list
# object, which depends on namespace insertion order.
trait_labels = ["names_all_features", "names_colour_features", "names_xray_features"]

# XGBoost objective used for every model.
# NOTE(review): per the XGBoost parameter docs, 'binary:hinge' predicts 0/1 rather
# than probabilities, so the "AUC" reported below is computed on hard labels.
# Switching to 'binary:logistic' would give a probability-based AUC but changes
# the fitted models -- left unchanged here so results stay comparable.
xgb_objective = 'binary:hinge'

# The CSV is the same for every combination, so read it once up front.
seeds_all_data = pd.read_csv(seed_data_path)

# Fail fast: every species needs both "train" and "Hold out" rows. Checking now
# avoids discovering a missing subset only after a long grid search has run.
for species_name in species:
    species_rows = seeds_all_data[seeds_all_data.Species == species_name]
    for set_label in ("train", "Hold out"):
        if not (species_rows.Set == set_label).any():
            raise ValueError(
                f"No rows with Species == {species_name!r} and Set == {set_label!r} "
                f"in {seed_data_path}"
            )

for species_name in species:

    # Rows for this species only; shared by all three feature sets.
    seeds_sp = seeds_all_data[seeds_all_data.Species == species_name]
    folder_path = os.path.join("Outputs/XGBoost/", species_name)

    for list_name, names in zip(trait_labels, traits):

        folder_path2 = os.path.join(folder_path, list_name)

        # Create <species>/ and <species>/<feature set>/ if missing, reporting either way.
        for label, path in ((species_name, folder_path), (list_name, folder_path2)):
            if not os.path.exists(path):
                os.makedirs(path)
                print(f"Folder '{label}' created successfully at {path}")
            else:
                print(f"Folder '{label}' already exists at {path}")

        #RDCOMM changed this section to include the holdout

        # Training split: predictors are all selected columns but the last, target is the last.
        seeds = seeds_sp[seeds_sp.Set == "train"]
        seeds = seeds[names]
        X, y = seeds[names[:-1]], seeds[names[-1]]

        # Hold-out split: never seen by the grid search, used only for the final metrics.
        holdout = seeds_sp[seeds_sp.Set == "Hold out"]
        holdout = holdout[names]
        X_test, y_test = holdout[names[:-1]], holdout[names[-1]]

        gridsearch_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  #RDCOMM Try 10?

        # define the model
        gbm = xgb.XGBClassifier(objective=xgb_objective)

        # params (4 * 3 * 4 * 4 * 3 * 3 * 6 = 10368 candidate combinations)
        gbm_param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.3],
                          'n_estimators': [100, 500, 1000],
                          'subsample': [0.3, 0.5, 0.9, 1],
                          'min_child_weight': [1, 5, 9, 11],
                          'gamma': [0, 1, 4],
                          'colsample_bytree': [0.6, 0.8, 1],
                          'max_depth': [2, 3, 5, 6, 8, 10]}

        # define search: exhaustive grid scored by F1; refit=True retrains the best
        # configuration on the whole training split once the search is finished.
        # (The name `grid_mse` is historical -- the scoring metric is F1, not MSE.)
        grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                                scoring='f1', cv=gridsearch_cv, verbose=3, n_jobs=-1,
                                return_train_score=True, refit=True)

        model_trained_on_all_data_except_holdout = grid_mse.fit(X, y)

        # Hard class predictions and positive-class scores on the hold-out set.
        yhat = model_trained_on_all_data_except_holdout.predict(X_test)
        yhat_probs = model_trained_on_all_data_except_holdout.predict_proba(X_test)[:, 1]

        # Persist the fitted search object (includes the refitted best estimator).
        modname = 'cv_model.dat'
        file_path3 = os.path.join(folder_path2, modname)

        with open(file_path3, 'wb') as file:
            pickle.dump(model_trained_on_all_data_except_holdout, file)

        # Persist the full cross-validation results table.
        filename = 'output_cv.csv'
        tosave = pd.DataFrame(model_trained_on_all_data_except_holdout.cv_results_)
        saving_path = os.path.join(folder_path2, filename)
        tosave.to_csv(saving_path, index=False, mode='w+')

        #evaluate the model
        acc = accuracy_score(y_test, yhat)
        prec = precision_score(y_test, yhat)
        f1 = f1_score(y_test, yhat)
        rec = recall_score(y_test, yhat) #same as sensitivity
        auc = roc_auc_score(y_test, yhat_probs) # see the note on xgb_objective above

        # Calculate specificity.
        # labels=[0, 1] forces a 2x2 matrix even when the hold-out set or the
        # predictions contain a single class; without it the 4-way unpacking fails.
        # Assumes Bin_germ is coded 0/1 -- TODO confirm against the CSV.
        tn, fp, fn, tp = confusion_matrix(y_test, yhat, labels=[0, 1]).ravel()
        specif = tn / (tn + fp) if (tn + fp) > 0 else float('nan')

        print('Accuracy: %.3f, Precision: %.3f, f1: %.3f, Recall: %.3f, AUC: %.3f, Specificity: %.3f' % (acc, prec, f1, rec, auc, specif))
    
    





Folder 'Alnus_glutinosa' already exists at Outputs/XGBoost/Alnus_glutinosa
Folder 'names_all_features' already exists at Outputs/XGBoost/Alnus_glutinosa\names_all_features
Fitting 5 folds for each of 10368 candidates, totalling 51840 fits
Accuracy: 0.975, Precision: 0.974, f1: 0.984, Recall: 0.993, AUC: 0.956, Specificity: 0.918
Folder 'Alnus_glutinosa' already exists at Outputs/XGBoost/Alnus_glutinosa
Folder 'names_colour_features' already exists at Outputs/XGBoost/Alnus_glutinosa\names_colour_features
Fitting 5 folds for each of 10368 candidates, totalling 51840 fits
Accuracy: 0.751, Precision: 0.804, f1: 0.844, Recall: 0.888, AUC: 0.607, Specificity: 0.327
Folder 'Alnus_glutinosa' already exists at Outputs/XGBoost/Alnus_glutinosa
Folder 'names_xray_features' already exists at Outputs/XGBoost/Alnus_glutinosa\names_xray_features
Fitting 5 folds for each of 10368 candidates, totalling 51840 fits
Accuracy: 0.980, Precision: 0.974, f1: 0.987, Recall: 1.000, AUC: 0.959, Specificity: 0.918