In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from lavaset.lavaset import LAVASET

### Load the HNMR metabolomics dataset

Here we load the publicly available MTBLS1 dataset, define the feature matrix `X` and the classification target `y`, and split the data into training and test sets.

In [None]:
# Read the MTBLS1 table: column 0 is used as the class label, every remaining
# column as a feature.
mtbls1 = pd.read_csv('example_data/MTBLS1.csv')
X = np.array(mtbls1.iloc[:, 1:])
y = np.array(mtbls1.iloc[:, 0], dtype=np.double)

# Re-encode the labels to 0/1 only when they are not already 0-based.
# The previous check (`np.unique(y).any() != 0`) was true whenever *any* label
# was non-zero, so labels that were already 0/1 were silently swapped.
# NOTE(review): assumes a binary target in which label 1 should become class 0
# and every other label class 1 (e.g. 1/2 coding) — confirm against the CSV.
if 0 not in np.unique(y):
    y = np.where(y == 1, 0, 1).astype(np.double)

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=180)

### Run LAVASET

Here we instantiate the LAVASET model and call its `knn_calculation` function, which calculates the nearest neighbors based either on a number of neighbors set by the user or on a distance matrix.

In [None]:
# Configure the LAVASET model; one argument per line so each setting is easy to tune.
model = LAVASET(
    ntrees=100,
    n_neigh=10,
    distance=False,
    nvartosample='sqrt',
    # NOTE(review): the earlier inline comment spoke of "1/3 of samples for
    # bootstrapping", but the value passed here is 0.5 — confirm which is intended.
    nsamtosample=0.5,
    oobe=True,
)

# Compute the neighbour structure from the feature columns (all columns except
# the first, which holds the label).
# An earlier note showed a distance-matrix variant, `model.knn_calculation(dist)`;
# presumably that goes together with `distance=True` — verify in the LAVASET docs.
knn = model.knn_calculation(mtbls1.columns[1:], data_type='1D')

Here, we fit LAVASET to the training data by calling the LAVASET-specific fitting function, `fit_lavaset`.

In [None]:
# Fit on the training split using the neighbour structure computed above;
# the fixed random_state keeps the fit repeatable.
lavaset = model.fit_lavaset(
    X_train,
    y_train,
    knn,
    random_state=5,
)

Here, we predict the test data with the LAVASET-specific predict function. The output of `predict_lavaset` consists of three items: the y predictions, the votes for each tree, and the out-of-bag (oobe) score.

In [None]:
# Predict the held-out samples with the fitted model and unpack the three
# returned items: predictions, per-tree votes and the out-of-bag score.
y_preds, votes, oobe = model.predict_lavaset(
    X_test,
    lavaset,
)

Below is just an example of different metrics that can be called via the `sklearn` package for testing the performance of LAVASET.

In [None]:
# Cast the predictions to integer class labels once and reuse them for every metric.
y_pred_labels = np.array(y_preds, dtype=int)

# Standard sklearn classification metrics on the held-out test split.
accuracy = accuracy_score(y_test, y_pred_labels)
precision = precision_score(y_test, y_pred_labels)
recall = recall_score(y_test, y_pred_labels)
f1 = f1_score(y_test, y_pred_labels)

Finally, to get the feature importance calculations from LAVASET, you can call the `feature_evaluation` function.

In [None]:
# Evaluate the features on the training data with the fitted model and collect
# the result in a DataFrame.
feature_importances = pd.DataFrame(model.feature_evaluation(X_train, lavaset))

# Same destination as before. '~' is expanded explicitly and the parent folder
# is created up front, so the export no longer fails on a machine where
# ~/Documents/lavaset_local does not exist yet.
results_path = Path('~/Documents/lavaset_local/test_results.csv').expanduser()
results_path.parent.mkdir(parents=True, exist_ok=True)
feature_importances.to_csv(results_path)