this is me caving and making a jupyter notebook because it's easier to see my outputs this way (don't think I'm changing my ways, casey)

## **Machine Learning Analysis** 
**what I want to predict:**
- **can the deltas (difference in relative abundance of taxonomic families between day -15 and day 3) be used to predict which mice had blooms at day 3 or not?**

**Needed libraries**

In [1]:
## for data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import time

## visualization
import seaborn as sns
import matplotlib.pyplot as plt
#matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier
from collections import Counter
from BorutaShap import BorutaShap

**Functions**

smaller functions that are nested in the larger overall function (they can operate by themselves too)

In [2]:
## generate x and y dataframes for machine learning and/or boruta shap/kfold cross validation
def make_xy_tables(meta_df,
                   otu_df,
                   merge_on,
                   y_col):
    """Split a feature table into model inputs (x) and the target column (y).

    The target column is pulled from ``meta_df`` and attached to ``otu_df``
    with a left merge on ``merge_on``, so the output has one row per row of
    ``otu_df`` in the same order.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Metadata table; must contain ``merge_on`` and ``y_col``.
    otu_df : pandas.DataFrame
        Feature table; must contain ``merge_on``. Every other column becomes
        a feature.
    merge_on : str
        Name of the key column shared by both tables.
    y_col : str
        Name of the metadata column to be predicted.

    Returns
    -------
    dict
        ``"x_dataframe"``: ``otu_df`` columns without the merge key,
        ``"y_dataframe"``: the ``y_col`` Series aligned row-for-row with x.
    """
    ## only the join key and the target are needed from the metadata
    mini_meta = meta_df.loc[:, [merge_on, y_col]]
    comb_df = otu_df.merge(mini_meta, how="left", on=[merge_on])

    ## x - the side that has the data I want the model to use to predict y
    ## the merge key is an identifier, not a feature, so it is dropped as-is
    ## (no numeric cast, so string ids work too)
    x_df = comb_df.drop(columns=[y_col, merge_on])
    ## y - what is to be predicted
    y_df = comb_df[y_col]

    return {"x_dataframe": x_df,
            "y_dataframe": y_df}

## will run the desired ml model (or list of models via a for loop)
## output is a python dictionary (aka "named list") with the predictions and a score
def run_models(wanted_model,
               x_train,
               y_train,
               x_test,
               y_test=None):
    """Fit one model and return its test-set predictions and an accuracy score.

    Parameters
    ----------
    wanted_model : estimator
        Any object exposing ``fit``, ``predict`` and ``score``.
    x_train, y_train
        Data the model is fitted on.
    x_test
        Data the predictions are made for.
    y_test : optional
        True labels for ``x_test``. When given, ``acc_score`` is measured on
        the held-out data. When omitted, the score is measured on the training
        data the model has already seen, so it is an optimistic figure and
        should not be read as predictive performance.

    Returns
    -------
    dict
        ``"y_pred"``: output of ``wanted_model.predict(x_test)``,
        ``"acc_score"``: ``score`` as a percentage rounded to 2 decimals.
    """
    wanted_model.fit(x_train, y_train)
    model_y_pred = wanted_model.predict(x_test)

    if y_test is None:
        raw_score = wanted_model.score(x_train, y_train)
    else:
        raw_score = wanted_model.score(x_test, y_test)

    return {"y_pred": model_y_pred,
            "acc_score": round(raw_score * 100, 2)}

## will shuffle the rows of the input table
def shuffle_index(wanted_table, seed=None):
    """Return ``wanted_table`` with its rows in a random order.

    Rows are shuffled by position, so a table whose index contains repeated
    labels keeps exactly one copy of every row.

    Parameters
    ----------
    wanted_table : pandas.DataFrame or pandas.Series
        Table to shuffle; it is not modified.
    seed : optional
        Passed to ``numpy.random.default_rng``. Give a value to make the
        shuffle reproducible; ``None`` draws a fresh order on every call.

    Returns
    -------
    Same type as ``wanted_table``, rows reordered, original index labels kept.
    """
    rng = np.random.default_rng(seed)
    new_positions = rng.permutation(len(wanted_table))
    shuff_table = wanted_table.iloc[new_positions]
    ## show the new order of the index labels
    print(shuff_table.index.to_numpy())
    return shuff_table

## gathers what run_models produced for several models into two comparison tables
## output is a python dictionary ("named list" for all intents and purposes)
def model_results(model_name_list,
                  model_scores,
                  model_y_preds,
                  y_test,
                  value_dict):
    """Summarise predictions and scores of several models.

    Parameters
    ----------
    model_name_list : list of str
        Model labels, in the same order as the other two lists.
    model_scores : list
        One score per model.
    model_y_preds : list of array-like
        One prediction vector per model, each as long as ``y_test``.
    y_test : pandas.Series
        True values; its index becomes the index of the prediction table.
    value_dict : dict
        Maps the raw predicted/true values to the labels to report.

    Returns
    -------
    dict
        ``"y_preds"``: one column per model plus ``"key"`` (the true values),
        all mapped through ``value_dict``.
        ``"acc_count_table"``: one row per model plus a ``"key"`` row, holding
        the score and how many times each mapped label occurs.
    """
    ## score per model
    scores_table = pd.DataFrame({"model": model_name_list,
                                 "score": model_scores})

    ## predictions side by side (one column per model) next to the true values
    preds_table = pd.DataFrame(model_y_preds).T
    preds_table.columns = model_name_list
    preds_table["key"] = y_test.values
    preds_table = preds_table.set_index(y_test.index)

    ## swap raw values for their labels and tally the labels column by column
    label_counts = []
    for col_name in list(preds_table.columns):
        labelled = preds_table[col_name].map(value_dict)
        preds_table[col_name] = labelled
        label_counts.append(Counter(labelled))

    ## the tallies include the "key" column, so it gets a row of its own
    counts_table = pd.DataFrame(label_counts)
    counts_table["model"] = [*model_name_list, "key"]
    combined_table = scores_table.merge(counts_table, how="right", on=["model"])

    return {"y_preds": preds_table,
            "acc_count_table": combined_table}

## grid search to optimize chosen models
## chooses the best parameters for the model to use for prediction
def model_grid_search(model,
                      param_dict,
                      x_dev,
                      y_dev,
                      model_name,
                      x_eval,
                      y_eval,
                      cv=4):
    """Tune ``model`` with a cross-validated grid search and score the winner.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    param_dict : dict
        Parameter grid handed to ``GridSearchCV``.
    x_dev, y_dev
        Development data the grid search is fitted on.
    model_name : str
        Key under which the results are stored in ``dict_clf``.
    x_eval, y_eval
        Evaluation data, used only to score the best estimator.
    cv : int, default 4
        Number of cross-validation folds used by the grid search.

    Returns
    -------
    dict
        ``"acc_eval"``: accuracy of the best estimator on the evaluation data.
        ``"dict_clf"``: ``{model_name: {...}}`` with the best parameters
        (``best_par``), fitted best estimator (``best_clf``), its
        cross-validated score (``best_score``), the evaluation accuracy
        (``score_eval``) and the grid-search wall time in seconds
        (``fit_time``).
    """
    GS = GridSearchCV(model,
                      param_dict,
                      cv=cv)

    # Fit the data and record time taken to train
    t0 = time.time()
    GS.fit(x_dev, y_dev)
    t = time.time() - t0

    # With the default refit=True the best estimator has already been
    # re-fitted on all of x_dev/y_dev, so no second fit is needed here
    best_clf = GS.best_estimator_
    acc_eval = accuracy_score(y_eval, best_clf.predict(x_eval))

    dict_clf = {
        model_name: {
            'best_par': GS.best_params_,
            'best_clf': best_clf,
            'best_score': GS.best_score_,
            'score_eval': acc_eval,
            'fit_time': t,
        }
    }

    ## saving my outputs
    return {"acc_eval": acc_eval,
            "dict_clf": dict_clf}

## hard-voting ensemble that produces the final y predictions
def model_voteClass(estimator_list,
                    x_train,
                    y_train,
                    x_test,
                    y_test):
    """Fit a hard-voting ensemble and pair its predictions with the truth.

    Parameters
    ----------
    estimator_list : list of (str, estimator) tuples
        Members of the ensemble, as expected by ``VotingClassifier``.
    x_train, y_train
        Data the ensemble is fitted on.
    x_test
        Data to predict.
    y_test
        True values for ``x_test``, stored in the ``"key"`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``"key"`` (``y_test``) and ``"final_y_pred"`` (the ensemble
        predictions cast to int).
    """
    ensemble = VotingClassifier(estimators=estimator_list, voting='hard')
    ensemble.fit(x_train, y_train)
    votes = ensemble.predict(x_test)
    comparison = pd.DataFrame({"key": y_test,
                               "final_y_pred": votes.astype(int)})
    return comparison

larger overall function that does the brunt of the work for you (yay!)

In [3]:
## puts above functions together to run grid search on kfold cross validated x/y train to get
## the accuracy of the given model in every fold
def kfold_model_predict(x_dataframe,
                        y_dataframe,
                        k_fold,
                        wanted_model,
                        paramgrid,
                        wanted_model_name,
                        add_estimator_list):
    """Tune, ensemble and score a model on every fold of a k-fold split.

    For each fold the training rows are split 80/20 into development and
    evaluation sets, ``wanted_model`` is tuned on the development set with
    ``model_grid_search``, and the tuned estimator plus the estimators in
    ``add_estimator_list`` are combined by ``model_voteClass``, which is
    fitted on the full training rows and predicts the test rows. The
    evaluation accuracy reported by the grid search is not used here.

    Parameters
    ----------
    x_dataframe : pandas.DataFrame
        Feature table.
    y_dataframe : pandas.Series
        Target values, aligned row-for-row with ``x_dataframe``.
    k_fold : splitter
        Object whose ``split(x, y)`` yields (train, test) row positions.
    wanted_model : estimator
        Model to tune.
    paramgrid : dict
        Parameter grid for ``wanted_model``.
    wanted_model_name : str
        Label of ``wanted_model`` in the ensemble.
    add_estimator_list : list of (str, estimator) tuples
        Extra ensemble members; may be empty.

    Returns
    -------
    dict
        ``"model_mean_acc"``: ``{"f<i>": fraction of test rows predicted
        correctly}``.
        ``"model_y_pred"``: ``{"f<i>_yPred": DataFrame}`` with columns
        ``key``, ``final_y_pred``, ``model_correct`` and ``fold``.
    """
    model_y_pred = {}
    mean_acc = {}
    for i, (train_index, test_index) in enumerate(k_fold.split(x_dataframe, y_dataframe)):
        print(f"Fold {i}:")
        print(f"Training dataset index: {train_index}")
        print(f"Testing dataset index: {test_index}")
        ## setting up test/train datasets
        ## the splitter hands back row positions, so rows are picked by position
        ## (picking by label would silently drop rows on a non 0..n-1 index)
        x_train = x_dataframe.iloc[train_index]
        x_test = x_dataframe.iloc[test_index]
        y_train = y_dataframe.iloc[train_index]
        y_test = y_dataframe.iloc[test_index]

        ## splitting training set to development and evaluation dfs
        x_dev, x_eval, y_dev, y_eval = train_test_split(x_train,
                                                        y_train,
                                                        test_size=0.2,
                                                        random_state=42)

        ## grid search
        grid_search = model_grid_search(model=wanted_model,
                                        param_dict=paramgrid,
                                        x_dev=x_dev,
                                        y_dev=y_dev,
                                        model_name=wanted_model_name,
                                        x_eval=x_eval,
                                        y_eval=y_eval)

        ## tuned model first, then whatever else should vote alongside it
        tuned_model = grid_search["dict_clf"][wanted_model_name]['best_clf']
        estimators = [(wanted_model_name, tuned_model)] + add_estimator_list

        y_pred = model_voteClass(estimator_list=estimators,
                                 x_train=x_train,
                                 y_train=y_train,
                                 x_test=x_test,
                                 y_test=y_test)

        ## seeing how accurate the model was at predicting which mice had blooms v not
        y_pred["model_correct"] = np.where(y_pred["key"] == y_pred["final_y_pred"], 1, 0)
        y_pred["fold"] = f"f{i}"

        ## accuracy and predictions for this fold
        mean_acc[f"f{i}"] = y_pred["model_correct"].mean()
        model_y_pred[f"f{i}_yPred"] = y_pred

    ## output list
    return {"model_mean_acc": mean_acc,
            "model_y_pred": model_y_pred}

**File paths**

In [20]:
## input tables (tab-separated), given as relative paths
## family deltas in wide format
family_deltas_fp = "../data/family_deltas_wide.tsv"
## metadata used for the merge and the y column
meta_fp = "../data/ml_approved_metadata.tsv"
## key table used to build the lookup dictionaries
meta_keys_fp = "../data/meta_dict_keys.tsv"

**Reading in files and data wrangling**

In [21]:
## reading the three tab-separated input tables
## NOTE(review): the merged tables printed further down carry an "Unnamed: 0" column that
## comes from the metadata, which suggests the files were written with their index;
## confirm family_deltas_df has no such column, otherwise it is used as a feature
family_deltas_df = pd.read_csv(family_deltas_fp, sep='\t')
meta = pd.read_csv(meta_fp, sep='\t')
meta_keys = pd.read_csv(meta_keys_fp, sep='\t')

In [22]:
## pulling out mouse id assigned numbers as a dictionary to map it to the deltas table
## mouse_id_dict goes name -> number, every "inverse" dictionary goes number -> label
## NOTE(review): all inverse dictionaries are keyed on the same assigned_num column,
## paired row by row with a different column; confirm meta_keys is laid out so that
## assigned_num really is the code for each of these fields
mouse_id_dict = dict(zip(meta_keys.mouse_id, meta_keys.assigned_num))
inverse_mouseID_dict = dict(zip(meta_keys.assigned_num, meta_keys.mouse_id))

## bloom status dictionary
inverse_bloom_dict = dict(zip(meta_keys.assigned_num, meta_keys.bloom_status))

## diet dictionary
inverse_diet_dict = dict(zip(meta_keys.assigned_num, meta_keys.diet))

## vendor dictionary
inverse_vendor_dict = dict(zip(meta_keys.assigned_num, meta_keys.vendor))

In [23]:
## generating my x and y dataframes
## x - the side that has the data I want the model to use to predict y
##     (every column of the family deltas table except mouse_id)
## y - what is to be predicted (bloom_status, pulled from the metadata by mouse_id)
bloom_xy = make_xy_tables(meta_df=meta,
                          otu_df=family_deltas_df,
                          merge_on="mouse_id",
                          y_col="bloom_status")

## unpacking the two tables from the returned dictionary
family_deltas_x = bloom_xy["x_dataframe"]
family_deltas_y = bloom_xy["y_dataframe"]

**Yay! Let's finally run some machine learning models**

In [24]:
## one instance of every candidate classifier, each kept under its own name
logreg = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier(n_neighbors=3)
gaussian = GaussianNB()
perceptron = Perceptron()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=100)
grad_boost = GradientBoostingClassifier(n_estimators=100)
ridge_class = RidgeClassifierCV()

## label -> model lookup; the two parallel lists are read off the dictionary
## so labels and models always stay in the same order
model_dict = {
    "logreg": logreg,
    "svc": svc,
    "knn": knn,
    "gaussian": gaussian,
    "perceptron": perceptron,
    "decision_tree": decision_tree,
    "random_forest": random_forest,
    "grad_boost": grad_boost,
    "ridge_class": ridge_class,
}
model_labs = list(model_dict)
model_list = list(model_dict.values())

In [25]:
## kfold cross validation attempt
## 5 splits gives 80/20 distribution
## NOTE(review): KFold is not shuffled, so every fold is a block of consecutive rows
## (see the printed indices); if the rows are grouped by diet/vendor the folds are
## confounded with those groups - confirm this is intended
kf = KFold(n_splits=5)

test_output = {}

for i, (train_index, test_index) in enumerate(kf.split(family_deltas_x, family_deltas_y)):
    print(f"Fold {i}:")
    print(f"Training dataset index: {train_index}")
    print(f"Testing dataset index: {test_index}")
    ## setting up test/train datasets
    ## KFold hands back row positions, so rows are picked by position
    x_train = family_deltas_x.iloc[train_index]
    x_test = family_deltas_x.iloc[test_index]
    y_train = family_deltas_y.iloc[train_index]
    y_test = family_deltas_y.iloc[test_index]

    ## running my various models on all 5 datasets
    ## note: the acc_score returned by run_models here is measured on x_train/y_train
    y_pred_output = []
    score_output = []
    model_name_output = []
    for label, model in model_dict.items():
        model_scores = run_models(model,
                                  x_train=x_train,
                                  y_train=y_train,
                                  x_test=x_test)
        y_pred_output.append(model_scores["y_pred"])
        score_output.append(model_scores["acc_score"])
        model_name_output.append(label)

    ## non shuffled data model results - y preds and accuracy scores by model
    ## so I can decide which one to use moving forward w my actual ML model
    ## y is bloom_status, so predictions are labelled with the bloom dictionary
    test_model_results = model_results(model_name_list=model_name_output,
                                       model_scores=score_output,
                                       model_y_preds=y_pred_output,
                                       y_test=y_test,
                                       value_dict=inverse_bloom_dict)
    test_yPreds = test_model_results["y_preds"]
    test_countScore = test_model_results["acc_count_table"].fillna(0)
    test_countScore["fold"] = f"f{i}"

    test_output.update({f"fold{i}_results": test_countScore})


Fold 0:
Training dataset index: [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [0 1 2 3 4 5 6 7 8 9]
Fold 1:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [10 11 12 13 14 15 16 17 18 19]
Fold 2:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [20 21 22 23 24 25 26 27 28 29]
Fold 3:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [30 31 32 33 34 35 36 37 38 39]
Fold 4:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
Testing dataset

In [26]:
## stacking the per-fold score tables into one long table
fold_model_results = pd.concat(test_output, ignore_index=True)

## keeping only the (model, fold) rows whose score is above 95
high_score_mask = fold_model_results['score'] > 95
best_model_results = fold_model_results.loc[high_score_mask]

## counting in how many folds each model cleared that threshold
## this is all to inform which models to use moving forward
model_fold_counts = np.unique(best_model_results["model"], return_counts=True)
top_models = pd.DataFrame(model_fold_counts).T
top_models.columns = ["model", "num_occurences"]
top_models = top_models.sort_values(by="num_occurences", ascending=False)

top_models

Unnamed: 0,model,num_occurences
0,decision_tree,5
1,grad_boost,5
4,random_forest,5
5,ridge_class,4
6,svc,4
2,knn,2
3,perceptron,1


In [30]:
## only decision_tree is kept for now; the commented-out line is the broader filter
## that keeps every model that cleared the score threshold in all 5 folds
##int_top_models = top_models.loc[top_models['num_occurences'] == 5]
int_top_models = top_models.loc[top_models['model'] == 'decision_tree']
## labels of the selected models as a plain list
top_model_list = int_top_models['model'].to_list()
top_model_list

['decision_tree']

i want to take the filtered dictionary of the highest performing ml models and run each of them through the `kfold_model_predict()` function

**current issues:**
- I know how to use a for loop to iterate through each model in the filtered dictionary, however, I need to find a way to put the rest of the models in the filtered dictionary in a list of tuples for the estimators list ex: [('model_abbreviation', model_function), etc]
- maybe do:
    - esti_list = [] \
        for label, model in filt_dict.items(): \
            model_tuple = (label, model) \
            esti_list.append(model_tuple)
    - yay!! that worked

- I think I need to filter the dictionary even further to not include the model that's currently being run in the `kfold_model_predict()` function prior to the for loop above 
    - but howwwwww????
    - like so:
    - test_model = 'decision_tree' \
        test_dict = {} \
        for value in test_model: \
            int_test_dict = {k:v for (k,v) in filt_dict.items() if value not in k} \
            test_dict.update(int_test_dict)

- I need to maybe create a dictionary of parameters for every single model I include so I can pull those into the `kfold_model_predict()` function
    - I'm just lazy and I don't want to do all this work but I have to 
    - so now that the dictionary is put together I can access it via:
        - paramgrid_dict["model_label"]

In [31]:
## keeping only the selected models, in the order they appear in top_model_list
## labels are matched exactly: a substring test would also pull in any model
## whose label merely contains a selected label
filt_dict = {name: model_dict[name] for name in top_model_list if name in model_dict}

filt_dict

{'decision_tree': DecisionTreeClassifier()}

In [32]:
## creating a dictionary of parameters for every single model :(
## every value must be something the estimator itself accepts: "no value" is the
## python object None, never the string 'None'/'none'

## random forest
## 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer accepted by
## current scikit-learn
rf_paramgrid = {
    'n_estimators':      [100, 150, 200, 250, 300, 400, 500],
    'criterion':         ['gini', 'entropy'],
    'max_features':      ['sqrt', 'log2'],
    'min_samples_leaf':  list(range(2, 8)),
    'random_state':      [42]
}

## gradient boosted classifier
gb_paramgrid = {
    'n_estimators':      [100, 150, 200, 250, 300, 400, 500],
    'criterion':         ['friedman_mse', 'squared_error'],
    'max_features':      ['sqrt', 'log2'],
    'min_samples_leaf':  list(range(2, 8)),
    'random_state':      [42]
}

## decision tree
dt_paramgrid = {
    'criterion':         ['gini', 'entropy'],
    'max_features':      ['sqrt', 'log2'],
    'min_samples_leaf':  list(range(2, 8)),
    'random_state':      [42]
}

## support vector machine
## this one is funky idk what the best way to set this up is
svc_paramgrid = {
    'C':                [0.0001, 0.001, 0.01, 0.1, 1.0],
    'gamma':            [0.01, 0.1],
    'random_state':      [42]
}

## ridge classifier cv
## NOTE(review): both scorers are regression metrics and RidgeClassifierCV already
## searches over its own alphas internally; confirm this grid is what is wanted
## before using ridge_class in the grid search
rc_paramgrid = {
    'alphas':        [0.0001, 0.001, 0.01, 0.1, 1.0],
    'scoring':       ['neg_mean_squared_error', 'neg_mean_squared_log_error']
}

## logistic regression
## penalty=None (no regularisation) needs scikit-learn >= 1.2
lr_paramgrid = {
    'penalty':      [None, 'l2'],
    'C':            [0.0001, 0.001, 0.01, 0.1, 1.0],
    'solver':       ['lbfgs', 'newton-cg', 'sag'],
    'random_state':      [42]
}

## knn
knn_paramgrid = {
    'n_neighbors':  [3, 4, 5],
    'weights':      ['uniform', 'distance'],
    'algorithm':    ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size':    [10, 20, 30, 40, 50],
    'p':            [2]
}

## gaussian nb classifier
## there are like no hyperparameters for this model: only priors and var_smoothing
## exist (GaussianNB has no random_state), so the grid is a single combination
gnb_paramgrid = {
    'priors':      [None],
    'var_smoothing': [1e-09]
}

## perceptron
per_paramgrid = {
    'penalty':   [None, 'l2'],
    'alpha':     [0.0001, 0.001, 0.01, 0.1, 1.0],
    'l1_ratio':  [0.15, 0],
    'random_state':      [42]
}

## putting them all into a dictionary together :)
## keys are the same labels used in model_dict
paramgrid_dict = {"random_forest": rf_paramgrid,
                  "grad_boost": gb_paramgrid,
                  "decision_tree": dt_paramgrid,
                  "svc": svc_paramgrid,
                  "ridge_class": rc_paramgrid,
                  "logreg": lr_paramgrid,
                  "knn": knn_paramgrid,
                  "gaussian": gnb_paramgrid,
                  "perceptron": per_paramgrid}

**Attempting this giant for loop** \
hopefully I don't kill my computer

In [33]:
## (model label, mean accuracy over folds) for every selected model
av_correct_list = []
## model label -> per-mouse predictions joined to the metadata
comb_meta_dict = {}

for model_label, model_func in filt_dict.items():
    ## every other selected model votes alongside the one being tuned
    ## (labels compared exactly, and the outer loop variables are left untouched)
    estimator_list = [(other_label, other_model)
                      for other_label, other_model in filt_dict.items()
                      if other_label != model_label]

    ## using the model name to pull the correct hyperparameters out of the dictionary
    wanted_params = paramgrid_dict[model_label]

    print(model_func)

    ## now we have all the parts that we need so we can actually run this function!!
    predict_results = kfold_model_predict(x_dataframe=family_deltas_x,
                                          y_dataframe=family_deltas_y,
                                          k_fold=kf,
                                          wanted_model=model_func,
                                          paramgrid=wanted_params,
                                          wanted_model_name=model_label,
                                          add_estimator_list=estimator_list)

    ## pulling model results
    ## average (over folds) fraction of test mice whose y-value was predicted correctly
    model_acc = predict_results["model_mean_acc"]
    final_model_acc = pd.Series(model_acc).mean()

    av_correct_list.append((model_label, final_model_acc))

    ## joining y-pred key and values to the metadata file
    ## the fold tables keep the row labels of family_deltas_x, so they are stacked
    ## without resetting the index and the real mouse_id of each row is looked up
    ## in the table x was built from (row position is not assumed to equal mouse_id)
    ## assumes family_deltas_x is row-aligned with family_deltas_df, i.e. the
    ## metadata holds one row per mouse_id - TODO confirm
    y_pred = pd.concat(list(predict_results["model_y_pred"].values()))
    y_pred["mouse_id"] = family_deltas_df.loc[y_pred.index, "mouse_id"].to_numpy()
    comb_y_pred = y_pred.merge(meta, how='left', on=["mouse_id"])
    comb_y_pred["model"] = model_label

    comb_meta_dict.update({model_label: comb_y_pred})
    

DecisionTreeClassifier()
Fold 0:
Training dataset index: [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [0 1 2 3 4 5 6 7 8 9]
Fold 1:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [10 11 12 13 14 15 16 17 18 19]
Fold 2:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [20 21 22 23 24 25 26 27 28 29]
Fold 3:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [30 31 32 33 34 35 36 37 38 39]
Fold 4:
Training dataset index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 

KeyError: 'mouse_id'

In [36]:
## scratch re-run of the last steps of the loop above, on the most recent y_pred
## the mouse_id of each row is looked up in the deltas table by row label rather
## than assuming that the row number is the mouse_id
y_pred["mouse_id"] = family_deltas_df.loc[y_pred.index, "mouse_id"].to_numpy()
comb_y_pred = y_pred.merge(meta, how='left', on=["mouse_id"])

## assigned number -> original mouse name
comb_y_pred["mouse_id"] = comb_y_pred["mouse_id"].map(inverse_mouseID_dict)

comb_y_pred

Unnamed: 0.1,key,final_y_pred,model_correct,fold,mouse_id,Unnamed: 0,diet,vendor,high_fiber,chow_highFiber,bloom_status
0,0,1,0,f0,CDD02.Tc.HFHF.4,33,1,0,1,1,0
1,0,0,1,f0,CDD02.CR.HFHF.1,5,1,1,1,1,0
2,1,1,1,f0,CDD02.CR.LFLF.3,22,4,1,0,0,1
3,1,1,1,f0,CDD02.CR.HFLF.1,10,2,1,0,0,1
4,0,1,0,f0,CDD02.CR.Chow.3,2,0,1,0,1,0
5,0,1,0,f0,CDD02.Tc.Chow.5,29,0,0,0,1,0
6,1,1,1,f0,CDD02.Tc.Chow.1,25,0,0,0,1,1
7,0,1,0,f0,CDD02.Tc.LFHF.1,40,3,0,1,1,0
8,1,0,0,f0,CDD02.CR.HFLF.4,13,2,1,0,0,1
9,1,1,1,f0,CDD02.Tc.HFLF.5,39,2,0,0,0,1


In [18]:
## one row per model: its label and the mean accuracy collected in av_correct_list
selected_model_results = pd.DataFrame(av_correct_list,
                                      columns=["model", "overall_score"])

selected_model_results

Unnamed: 0,model,overall_score
0,decision_tree,0.74
1,grad_boost,0.78
2,random_forest,0.74


In [19]:
## stacking every model's prediction + metadata table into one long table
comb_meta_yPred = pd.concat(comb_meta_dict, ignore_index=True)

## translating the numeric codes back to their original labels
## NOTE(review): values that are missing from a dictionary become NaN after map;
## confirm each dictionary covers every code in its column
comb_meta_yPred["mouse_id"] = comb_meta_yPred["mouse_id"].map(inverse_mouseID_dict)
comb_meta_yPred["diet"] = comb_meta_yPred["diet"].map(inverse_diet_dict)
comb_meta_yPred["vendor"] = comb_meta_yPred["vendor"].map(inverse_vendor_dict)
comb_meta_yPred["bloom_status"] = comb_meta_yPred["bloom_status"].map(inverse_bloom_dict)

comb_meta_yPred

Unnamed: 0.1,key,final_y_pred,model_correct,fold,Unnamed: 0,mouse_id,diet,vendor,high_fiber,chow_highFiber,bloom_status,model
0,0,0,1,f0,0,CDD02.CR.Chow.1,Chow,charles_river,0,1,False,decision_tree
1,1,1,1,f0,1,CDD02.CR.Chow.2,Chow,charles_river,0,1,False,decision_tree
2,1,1,1,f0,2,CDD02.CR.Chow.3,Chow,charles_river,0,1,False,decision_tree
3,0,0,1,f0,3,CDD02.CR.Chow.4,Chow,charles_river,0,1,True,decision_tree
4,1,1,1,f0,4,CDD02.CR.Chow.5,Chow,charles_river,0,1,False,decision_tree
...,...,...,...,...,...,...,...,...,...,...,...,...
145,1,1,1,f4,45,CDD02.Tc.LFLF.1,LF/LF,taconic,0,0,True,random_forest
146,0,0,1,f4,46,CDD02.Tc.LFLF.2,LF/LF,taconic,0,0,True,random_forest
147,1,1,1,f4,47,CDD02.Tc.LFLF.3,LF/LF,taconic,0,0,True,random_forest
148,0,0,1,f4,48,CDD02.Tc.LFLF.4,LF/LF,taconic,0,0,True,random_forest


**Saving my outputs**

In [35]:
## writing the three result tables as tab-separated files
## index=False is not passed, so each file also gets the dataframe index as its first column
## how often each model cleared the score threshold across folds
top_models.to_csv("../outputs/bloom_best_performing_models.tsv", sep="\t")
## mean accuracy of every selected model
selected_model_results.to_csv("../outputs/bloom_model_predict_scores.tsv", sep="\t")
## per-mouse predictions joined to the metadata
comb_meta_yPred.to_csv("../outputs/bloom_comb_meta_yPred.tsv", sep="\t")

**TO DO NEXT:**
- put boruta shap in and run it with the selected model - output the selected features (microbes) for each fold in the model!!!
- I can't do this bc boruta shap sucks

**current issue:**
- when I try to run boruta shap with a different model other than the default specified (random forest) it gives me weird and vague errors that I literally can't figure out how to resolve 
- I want to substitute the model I'm using for prediction in place of the default model used in boruta shap 
- I don't think boruta shap supports any non-tree based classifiers so the only ones that would work are random forest and decision tree
    - update:
        - gradient boosting classifier works!!
        - random forest works but only if you use the built in one for the package (and classification=False)
        - decision tree DOES NOT work
        - if you try to separately specify decision tree or random forest you get an error
            - "IndexError: list index out of range"