First, load the data,in this notebook we are using the smokoy mountain data as the example. And will use basic AE to get the transformation of the data and do prediction using the same models.

In [1]:
import sys
sys.path.append('../../../Code')
import loadData 
import RunML
import RunML_continue
import FS
import metric

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn import svm
import pickle
import matplotlib.pyplot as plt

In [66]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

TensorFlow version: 2.18.0
Num GPUs Available:  0


### 1. Load the data and data preprocess
After reading the data, convert the abundance matrix into relative abundance matrix; for the response variables, since we have multiple variables, organize them into an array with each variable as a column.

In [4]:
data,burn_label,un_label,duration_label,ASVs,df=loadData.loadSoilData("../data/count_table/merge_proportion_asv_smoky_moutain.csv")

In [5]:
df = pd.DataFrame(data,columns = ASVs)# for function later

In [6]:
print(data.shape)
print(burn_label.shape)

(59, 1768)
(59,)


In [7]:
yList= np.column_stack((burn_label,un_label,duration_label))# y list is a 2D array, each column is a response outcome

In [8]:
y_index=['burn_label', 'un_label', 'duration_label']

In [9]:
print(yList[:5])
      
for i in range(yList.shape[1]):
    print(pd.Series(yList[:,i]).value_counts())

[['No' 'Natural' 'Annual']
 ['No' 'Natural' 'Perennial']
 ['No' 'Urban' 'Perennial']
 ['No' 'Urban' 'Annual']
 ['No' 'Urban' 'Perennial']]
Yes    42
No     17
Name: count, dtype: int64
Urban      33
Natural    26
Name: count, dtype: int64
Perennial    37
Annual       22
Name: count, dtype: int64


### 2.AE


In [64]:

# Define the Autoencoder Model
def create_AE(input_dim=1, latent_dim=100, activation='relu', loss='mae', optimizer='adam'):
    autoencoder = tf.keras.Sequential([
        tf.keras.Input(shape=(input_dim,)),
        layers.Flatten(),
        layers.Dense(latent_dim, activation=activation),
        layers.Dense(input_dim, activation='sigmoid')  # Decoder to reconstruct the input
    ])
    
    autoencoder.compile(loss=loss, optimizer=optimizer, metrics=['mse'])
    
    return autoencoder

# Manual GridSearchCV function
def run_AE(X_train_scaled, X_test_scaled, param_grid=None):
    if param_grid is None:
        param_grid = {
            'input_dim': [X_train_scaled.shape[1]], 
            'latent_dim': [10, 25, 50, 100],
            'activation': ['relu', 'sigmoid', 'tanh'],
            'loss': ['mae'],#, 'binary_crossentropy'],#
            'optimizer': ['sgd', 'adam'],
            'epochs': [10],
            'batch_size': [32]
        }

    # Manually define a function for model training
    def fit_model(input_dim, latent_dim, activation, loss, optimizer, epochs, batch_size):
        autoencoder = create_AE(input_dim=input_dim, latent_dim=latent_dim, activation=activation, 
                                 loss=loss, optimizer=optimizer)
        
        # Fit the model with validation data
        autoencoder.fit(X_train_scaled, X_train_scaled, epochs=epochs, batch_size=batch_size, 
                        validation_data=(X_test_scaled, X_test_scaled), verbose=0)
        return autoencoder

    # Custom GridSearchCV logic
    best_model = None
    best_params = None
    best_score = float('inf')

    for latent_dim in param_grid['latent_dim']:
        for activation in param_grid['activation']:
            for loss in param_grid['loss']:
                for optimizer in param_grid['optimizer']:
                    for epochs in param_grid['epochs']:
                        for batch_size in param_grid['batch_size']:
                            
                            # Fit the model with a specific combination of hyperparameters
                            autoencoder = fit_model(X_train_scaled.shape[1], latent_dim, activation, loss, optimizer, epochs, batch_size)
                            
                            # Get the validation loss (first item in the returned list)
                            val_loss = autoencoder.evaluate(X_test_scaled, X_test_scaled, verbose=0)[0]
                            # Track the best parameters based on validation loss
                            if val_loss < best_score:
                                best_score = val_loss
                                best_params = {
                                    'latent_dim': latent_dim,
                                    'activation': activation,
                                    'loss': loss,
                                    'optimizer': optimizer,
                                    'epochs': epochs,
                                    'batch_size': batch_size
                                }
                                best_model = autoencoder
    print(f"latent dimension: {latent_dim}, activation:{activation}, opt:{optimizer}")               
    # Ensure the best model is built
    best_model.predict(X_train_scaled[:1])  # Call the model on one sample to build it

    # **New Fix: Call the encoder with proper outputs**
    encoder_layer = tf.keras.Sequential(best_model.layers[:2])  # Extract only the encoder layers
    
    # Ensure the encoder is also built
    encoder_layer.predict(X_train_scaled[:1])  # Trigger the encoder to initialize

    # Generate latent features from encoder representation
    AE_train = pd.DataFrame(encoder_layer.predict(X_train_scaled))
    AE_train = AE_train.add_prefix('feature_')


    AE_test = pd.DataFrame(encoder_layer.predict(X_test_scaled))
    AE_test = AE_test.add_prefix('feature_')
    
    return AE_train, AE_test


In [24]:
data_train, data_test, y_train, y_test = RunML_continue.split_and_scale_data(data,burn_label)
print(data_train.shape)
print(data_test.shape)

(41, 1768)
(18, 1768)


In [65]:
AE_train, AE_test = run_AE(data_train,data_test)

  super().__init__(**kwargs)


latent dimension: 100, activation:tanh, opt:adam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


In [51]:
data_AE = pd.concat([AE_train, AE_test], axis=0).to_numpy()


In [30]:
iter =30
cls = ["RF","SVM", "CatBoost","NB"]

In [52]:
data_subset = {"AllFeatures":data, 
               "AE": data_AE
              }

In [53]:
print(np.shape(data))
print(np.shape(data_AE))

(59, 1768)
(59, 100)


The  function will print out the accuracy and AUC for each dataset using each classifier, and also will return the y_actual, y_predict, y_predprob for future use.

In [56]:
targetLabel=burn_label
dict_cm = RunML_continue.runClassifier_FScompare(data_subsets= data_subset,y= targetLabel,N=iter,classifiers=cls,SMOTE=True)

             RF_Accuracy    RF_AUC  SVM_Accuracy   SVM_AUC  CatBoost_Accuracy  \
AllFeatures     0.796970  0.879630      0.746970  0.850000           0.795455   
AE              0.709091  0.664352      0.354545  0.417593           0.693939   

             CatBoost_AUC  NB_Accuracy    NB_AUC  
AllFeatures      0.921528     0.828788  0.797222  
AE               0.664352     0.522727  0.659259  


In [58]:
print(metric.metric_sum(dict_cm))

                      Accuracy  Precision    Recall  Specification       Mcc
AllFeatures_RF        0.796610   0.857143  0.352941       0.976190  0.460965
AllFeatures_SVM       0.745763   0.571429  0.470588       0.857143  0.348896
AllFeatures_CatBoost  0.796610   0.857143  0.352941       0.976190  0.460965
AllFeatures_NB        0.830508   0.705882  0.705882       0.880952  0.586835
AE_RF                 0.711864   0.500000  0.352941       0.857143  0.236376
AE_SVM                0.355932   0.294118  0.882353       0.142857  0.033350
AE_CatBoost           0.694915   0.466667  0.411765       0.809524  0.230163
AE_NB                 0.525424   0.342857  0.705882       0.452381  0.145912


In [63]:
dict_cm_0 = RunML_continue.ML_model_SCV(data, burn_label, "RF", SMOTE=True,k=5)
print(dict_cm_0)

{'mean_accuracy': 0.796969696969697, 'mean_auc': 0.8796296296296298, 'y_true': ['No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No'], 'y_pred': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'], 'y_pred_prob': [0.77, 0.5, 0.85, 0.99, 0.96, 0.98, 0.91, 0.48, 0.64, 1.0, 0.98, 0.95, 0.96, 0.95, 0.97, 0

In [62]:
dict_cm_svm = RunML_continue.ML_model_SCV(data, burn_label, "SVM", SMOTE=True,k=5)
print(dict_cm_svm)

{'mean_accuracy': 0.746969696969697, 'mean_auc': 0.85, 'y_true': ['No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No'], 'y_pred': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'], 'y_pred_prob': [0.5092169307792195, 0.6210576446537301, 0.5171072123455642, 0.844217133419973, 0.9283800461852867, 0.942338920

In [55]:
targetLabel=un_label
dict_cm_un = RunML_continue.runClassifier_FScompare(data_subsets= data_subset,y= targetLabel,N=iter,classifiers=cls,SMOTE=True)

             RF_Accuracy    RF_AUC  SVM_Accuracy   SVM_AUC  CatBoost_Accuracy  \
AllFeatures     0.624242  0.703175      0.556061  0.424444           0.577273   
AE              0.524242  0.558810      0.422727  0.647460           0.422727   

             CatBoost_AUC  NB_Accuracy    NB_AUC  
AllFeatures      0.684921     0.625758  0.608571  
AE               0.438730     0.457576  0.403333  


In [57]:
targetLabel=duration_label
dict_cm_dur = RunML_continue.runClassifier_FScompare(data_subsets= data_subset,y= targetLabel,N=iter,classifiers=cls,SMOTE=True)

             RF_Accuracy    RF_AUC  SVM_Accuracy   SVM_AUC  CatBoost_Accuracy  \
AllFeatures     0.610606  0.344554      0.490909  0.471607           0.490909   
AE              0.487879  0.387946      0.409091  0.538929           0.490909   

             CatBoost_AUC  NB_Accuracy    NB_AUC  
AllFeatures      0.322857     0.578788  0.558571  
AE               0.399643     0.442424  0.418304  


####  Model 2

##### Annual/ Perennial
For comparison, only use the selected features for Annual/ Perennial classification. 

Even though the performance of our method is still not as good as Lasso, but it gives us better prediction using only 19 features compared with the union selection which has 86 featues.

In [38]:
indices_3 = np.where(weights[2,:] > stats.chi2.ppf(1 - 0.1, 1))[0]

In [39]:
indices_3_sort = sorted(indices_3, key=lambda col: weights[2,col], reverse=True)

In [40]:
len(indices_3_sort)

26

In [41]:
X_FS_dur = data[:,indices_3_sort]

In [42]:
data_subset2 = {"AllFeatures":data, 
               "SelectMicro": X_FS_dur,
               "Lasso":X_lasso_dur,
               "Lasso_finetune":X_lasso_ft_dur,
               "Random":data
              }
print(np.shape(data))
print(np.shape(X_FS_dur))
print(np.shape(X_lasso_dur))
print(np.shape(X_lasso_ft_dur))

(59, 1768)
(59, 26)
(59, 54)
(59, 9)


In [43]:
dict_cm_dur2 = RunML_continue.runClassifier_FScompare(data_subsets= data_subset2,y= duration_label,N=iter,classifiers=cls,SMOTE=True)

Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <clas

##### burn
Let's also try it on burn label (expected to have better performance than Lasso, but ......)

So, we check the feature ratio for burn label.

In [44]:
weights[0,:]
indices_1 = np.where(weights[0,:] > stats.chi2.ppf(1 - 0.1, 1))[0]
indices_1_sort = sorted(indices_1, key=lambda col: weights[0,col], reverse=True)
len(indices_1_sort)

X_FS_burn = data[:,indices_1_sort]
X_lasso = data[:,xlabel_lasso_burn]

In [45]:
data_subset3 = {"AllFeatures":data, 
               "SelectMicro": X_FS_burn,
               "Lasso":X_lasso_burn,
               "Lasso_finetune":X_lasso_ft_burn,
               "Random":data
              }
print(np.shape(data))
print(np.shape(X_FS_burn))
print(np.shape(X_lasso))

(59, 1768)
(59, 141)
(59, 16)


In [46]:
dict_cm_burn2 = RunML_continue.runClassifier_FScompare(data_subsets= data_subset3,y= burn_label,N=iter,classifiers=cls,SMOTE=True)

Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <class 'list'>
Key: mean_accuracy, Type: <class 'numpy.float64'>
Key: mean_auc, Type: <class 'numpy.float64'>
Key: y_true, Type: <class 'list'>
Key: y_pred, Type: <class 'list'>
Key: y_pred_prob, Type: <clas

In [47]:
targetLabel=burn_label
print(set(targetLabel))
selectedASV1 = [ASVs[i] for i in indices_1_sort]
selectedASV1_lasso = [ASVs[i] for i in xlabel_lasso_burn]
FS.plotPresenseRatio(X_FS_burn,targetLabel,selectedASV1,posLabel="Yes",posText="Burned",negText="Not Burned",entries=len(selectedASV1))

{'Yes', 'No'}


TypeError: plotPresenseRatio() got an unexpected keyword argument 'entries'

In [None]:
FS.plotPresenseRatio(X_lasso,targetLabel,selectedASV1_lasso,posLabel="Yes",posText="Burned",negText="Not Burned",entries=len(selectedASV1_lasso))


In [None]:
print(np.asarray(selectedASV1))
print(np.asarray(selectedASV1_lasso))
print(np.isin(selectedASV1_lasso, selectedASV1))

#### Model3
To compare, also consider the union of the features selected by Lasso: the result shows that
1. for burn label, lasso's RF performance is worse, SVM is much better
2. for un_label,lasso's RF-accuracy SVM-AUC are better, RF-AUC and SVM-accuracy are worse.
3. for dur label, lasso's RF performance is worse, SVM-accuracy is worse SVM-AUC is  better

In [None]:
# select the union of lasso feature selection
xlabel_lasso_union = pd.concat([pd.Series(xlabel_lasso), pd.Series(xlabel_lasso_burn), pd.Series(xlabel_lasso_dur)]).drop_duplicates().reset_index(drop=True)
len(xlabel_lasso_union)

In [None]:
X_lasso_union = data[:,xlabel_lasso_union]
X_lasso_union.shape

In [None]:
data_subset = {
    #"AllFeatures":data, 
            #   "SelectMicro": X_FS,
               "Lasso":X_lasso_union
              #, "Random":data
              }

print(np.shape(X_lasso_union))

In [None]:
for i in range(yList.shape[1]):
    targetLabel=yList[:,i]
    dict_cm_lasso = RunML_continue.runClassifier_FScompare(data_subsets= data_subset,y= targetLabel,N=iter,classifiers=cls)
    for dataset_name, classifiers in dict_cm_lasso.items():
        for classifier_name, labels in classifiers.items():
            actual_labels = labels[0]
            predicted_labels = labels[1]
            metric.plot_confusion_matrices(actual_labels, predicted_labels,f"{dataset_name} - {classifier_name}")

### SHARP value
Compare the SHARP value of the significant features selected by our method and Lasso.(pending)

In [None]:
#df_X_FS = pd.DataFrame(X_FS,columns=[ASVs[i] for i in selectedOTU_index] )
#df_X_FS.columns

In [None]:
#RunML_continue.sharp_value(df_X_FS,burn_label,classifier_name)

In [None]:
#df_X_lasso = pd.DataFrame(X_lasso_union,columns=[ASVs[i] for i in xlabel_lasso_union] )
#RunML_continue.sharp_value(df_X_lasso,burn_label,classifier_name)

### compare the first 15 index by their present ratio

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

entries=15
selectedOTU_index_15=selectedOTU_index[:entries]
selectedASVs_15=[ASVs[i] for i in selectedOTU_index_15]

X_FS_15=data[:,selectedOTU_index_15]


targetLabel=burn_label
print(set(targetLabel))
FS.plotPresenseRatio(X_FS_15,burn_label,selectedASVs_15,posLabel="Yes",posText="Burned",negText="Not Burned")

targetLabel=un_label
print(set(targetLabel))
FS.plotPresenseRatio(X_FS_15,un_label,selectedASVs_15,posLabel="Natural",posText="Natural",negText="Urban")


targetLabel=duration_label
print(set(targetLabel))
FS.plotPresenseRatio(X_FS_15,duration_label,selectedASVs_15,posLabel="Annual",posText="Annual",negText="Perennial")

In [None]:
selectedASVs=[ASVs[i] for i in selectedOTU_index]

In [None]:
print(np.asarray(selectedASVs))
print(len(selectedASVs))

### Negative Gini Impurity
Gini Impurity is the probability of incorrectly classifying a randomly chosen element in the dataset if it were randomly labeled according to the class distribution in the dataset. It’s calculated as:

$G = 1- \sum_{i=1}^C p_i^2$

where C is the number of classes. (which means it can be used to measure for multiple level classification)

Here I will use the negative Gini Impurity to measure each OTU, if NG is large (1) which means the OTU only exist in one class, if NG value is small($1/c$) which means the OTU is evenly distributed among  the classes.

$NG = \sum_{i=1}^C p_i^2$

In [None]:
# NG for selected OTU
NG_selected = metric.Neg_GINI(X_FS,yList)
print(NG_selected.shape)
# NG for Not selected OTU
X_FS_none = np.delete(data, selectedOTU_index, axis=1)
NG_noselected = metric.Neg_GINI(X_FS_none,yList)
print(NG_noselected.shape)

In [None]:
label_lasso = [xlabel_lasso_burn,xlabel_lasso,xlabel_lasso_dur]

In [None]:
ng_lasso_list = []
for i in range(len(y_index)):
    X_lasso_ng = data[:,label_lasso[i]]
    print(X_lasso_ng.shape)
    X_lasso_none_ng = np.delete(data, label_lasso[i], axis=1)
    print(X_lasso_none_ng.shape)
    Ng1 = metric.Neg_GINI(X_lasso_ng,np.transpose(yList)[i])
    Ng2 = metric.Neg_GINI(X_lasso_none_ng,np.transpose(yList)[i])
    ng_lasso_list.append([Ng1,Ng2])


#### Compare lasso and SelectMicro

In [None]:
# Number of subplots
num_plots = len(y_index)

# Create a figure with a grid of subplots
plt.figure(figsize=(4, 4 * num_plots))

# Loop through each index and create a subplot
for i in range(num_plots):
    plt.subplot(num_plots, 1, i + 1)  # (nrows, ncols, index)
    plt.boxplot([NG_selected[i, :], ng_lasso_list[i][0]], tick_labels=['SelectMicro', 'Lasso'])
    plt.title(f'NG results of the selected OTU vs. non-selected OTUs - {y_index[i]}')
    plt.ylabel('NG')
    plt.grid(axis='y')
    #print(len(NG_selected[i, :]))
# Adjust layout
plt.tight_layout()  # Adjusts the subplots to fit into the figure area.
plt.show()  # Show all plots at once

In [None]:
# compare the selected and non select by our method
for i in range(len(y_index)):
    plt.figure(figsize=(4, 4))
    plt.boxplot([NG_selected[i,:], NG_noselected[i,:]], tick_labels=['SelectMicro', 'Not selected'])
    plt.title(f'NG results of the selected OTU vs. non selected OTUs by our method - {y_index[i]}')
    plt.ylabel('NG')
    plt.grid(axis='y')
    
# Show the plot
plt.show()

In [None]:
# compare the selected and non select by lasso
# Number of subplots
num_plots = len(y_index)

# Create a figure with a grid of subplots
plt.figure(figsize=(4, 4 * num_plots))

# Loop through each index and create a subplot
for i in range(num_plots):
    plt.subplot(num_plots, 1, i + 1)  # (nrows, ncols, index)
    plt.boxplot([ng_lasso_list[i][0], ng_lasso_list[i][1]], tick_labels=['Lasso', 'Not selected'])
    plt.title(f'NG results of the selected OTU vs. non-selected OTUs by Lasso - {y_index[i]}')
    plt.ylabel('NG')
    plt.grid(axis='y')
# Adjust layout
plt.tight_layout()  # Adjusts the subplots to fit into the figure area.
plt.show()  # Show all plots at once

### Use LOOCV for fine tune model parameters and also the model's result

In [None]:
targetLabel=burn_label
