# **Boruta Shap Feature Analysis**
**what I want to learn:**
- **which features (taxonomic families) are important to blooms at day 3?**

**k fold cross validation documentation:**
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

**Needed libraries**

In [1]:
import pandas as pd
import numpy as np
from BorutaShap import BorutaShap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)

**Functions**

In [6]:
## generate x and y dataframes for machine learning and/or boruta shap/kfold cross validation
def make_xy_tables(meta_df,
                   otu_df,
                   merge_on,
                   y_col):
    """Build the feature table (x) and the target column (y) for one outcome.

    The outcome column is taken from the metadata and left-joined onto the
    feature table, so the outputs have one row per row of ``otu_df`` (as long
    as ``merge_on`` is unique in ``meta_df``). Rows of ``otu_df`` without a
    metadata match get a missing value in y.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Metadata table; must contain the ``merge_on`` and ``y_col`` columns.
    otu_df : pandas.DataFrame
        Feature table; must contain the ``merge_on`` column. Every other
        column is kept as a feature.
    merge_on : str
        Name of the sample identifier column shared by both tables.
    y_col : str
        Name of the metadata column that is to be predicted.

    Returns
    -------
    dict
        ``{"x_dataframe": <DataFrame of features>,
        "y_dataframe": <Series holding y_col>}``, both sharing the index of
        the merged table.
    """
    ## a list (not a tuple) is the unambiguous way to select several columns
    mini_meta = meta_df.loc[:, [merge_on, y_col]]
    comb_df = otu_df.merge(mini_meta, how="left", on=[merge_on])

    ## x - the side that has the data I want the model to use to predict y
    ## the id column is only a join key, so it is dropped directly; it used to be
    ## cast to float right before being dropped, which did nothing useful and
    ## raised on non-numeric ids
    x_df = comb_df.drop(columns=[merge_on, y_col])

    ## y - what is to be predicted
    y_df = comb_df[y_col]

    return {"x_dataframe": x_df,
            "y_dataframe": y_df}


## run kfold cross validation on x and y datasets and then identify important features via boruta shap
def kfold_boruta_shap(k_fold,
                      feature_selector,
                      x_dataframe,
                      y_dataframe,
                      trial_num,
                      run_name):
    """Fit the feature selector on the training rows of every fold.

    For each fold the selector is fitted on the training rows only, its
    accepted features are collected, and its per-fold results are written via
    ``feature_selector.results_to_csv``. The held-out rows of each fold are
    only reported in the printed output; nothing is evaluated on them here.

    Parameters
    ----------
    k_fold : object
        Cross validator exposing ``split(X, y)`` that yields
        ``(train_index, test_index)`` pairs of row positions
        (e.g. ``sklearn.model_selection.KFold``).
    feature_selector : object
        Selector exposing ``fit(...)``, an ``accepted`` list and
        ``results_to_csv(filename=...)`` (e.g. ``BorutaShap``).
    x_dataframe : pandas.DataFrame
        Feature table.
    y_dataframe : pandas.Series or pandas.DataFrame
        Target values, row-aligned with ``x_dataframe``.
    trial_num : int
        Passed to the selector as ``n_trials``.
    run_name : str
        Prefix for the per-fold result files and for the key of the output.

    Returns
    -------
    dict
        ``{f"{run_name}_train": [accepted features of fold 0, fold 1, ...]}``;
        a feature appears once for every fold in which it was accepted.
    """
    train_list = []
    for i, (train_index, test_index) in enumerate(k_fold.split(x_dataframe, y_dataframe)):
        print(f"Fold {i}:")
        print(f"Training dataset index: {train_index}")
        print(f"Testing dataset index: {test_index}")

        ## the splitter hands back row POSITIONS, so select with .iloc;
        ## .filter(items=...) matches index LABELS and silently returns the wrong
        ## (or no) rows whenever the index is not the default 0..n-1 range
        x_train = x_dataframe.iloc[train_index]
        y_train = y_dataframe.iloc[train_index]

        ## running boruta shap on training data
        feature_selector.fit(X=x_train,
                             y=y_train,
                             n_trials=trial_num,
                             random_state=0,
                             sample=False,
                             verbose=True)

        ## keep one entry per fold so occurrences can be counted afterwards
        train_list.extend(feature_selector.accepted)
        feature_selector.results_to_csv(filename=f"{run_name}_f{i}_bs_results")

    return {f"{run_name}_train": train_list}


## to make the output table of the kfold cross validated boruta shap results
def create_occurence_table(input_list, n_folds=5):
    """Count how often each feature was accepted across the folds.

    Parameters
    ----------
    input_list : list
        Accepted features pooled over all folds (one entry per fold in which
        a feature was accepted). May be empty.
    n_folds : int, optional
        Number of folds the list was pooled over; used as the denominator of
        ``av_occurences``. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``num_occurences`` (integer count) and
        ``av_occurences`` (count divided by ``n_folds``), sorted by count in
        descending order. Ties keep the sorted order of the feature values.

    Raises
    ------
    ValueError
        If ``n_folds`` is not positive.
    """
    if n_folds <= 0:
        raise ValueError("n_folds must be a positive number")

    ## building the frame column by column keeps the counts numeric instead of
    ## the object dtype produced by transposing a (features, counts) pair
    features, counts = np.unique(input_list, return_counts=True)
    wanted_df = pd.DataFrame({"feature": features,
                              "num_occurences": counts})

    ## a stable sort makes the order of tied features deterministic
    wanted_df = wanted_df.sort_values(by="num_occurences",
                                      ascending=False,
                                      kind="stable")
    wanted_df["av_occurences"] = wanted_df["num_occurences"] / n_folds
    return wanted_df

**File paths**

In [3]:
## input tables (tab-separated)
family_deltas_fp = "../data/family_deltas_wide.tsv"
meta_fp = "../data/ml_approved_metadata.tsv"
meta_keys_fp = "../data/meta_dict_keys.tsv"

## common variables for functions
bs_sort_by = 'num_occurences'
bs_cols = ['family', bs_sort_by]

## cross validator: 5 folds with the default KFold settings
kf = KFold(n_splits=5)

## boruta shap selectors
## NOTE(review): this selector is built without a model and with classification=False,
## while the targets used below (bloom_status, diet) look categorical - confirm this is intended
random_forest_bs = BorutaShap(classification=False,
                              importance_measure='shap')

grad_boost = GradientBoostingClassifier(n_estimators=100)
grad_boost_bs = BorutaShap(model=grad_boost,
                           classification=True,
                           importance_measure='shap')

## selectors to loop over, keyed by the label written to the output tables
## (grad_boost_bs is defined above but not included here)
borutaShap_dict = {"random_forest": random_forest_bs}

**Reading in files and data wrangling**

In [7]:
## every input is a tab-separated table, so read them all the same way
family_deltas_df, meta, meta_keys = (
    pd.read_csv(table_fp, sep='\t')
    for table_fp in (family_deltas_fp, meta_fp, meta_keys_fp)
)

In [8]:
## lookup tables between mouse ids and their assigned numbers (built from meta_keys):
## mouse_id -> assigned_num, and the reverse assigned_num -> mouse_id
mouse_id_dict = {
    mouse: num
    for mouse, num in zip(meta_keys["mouse_id"], meta_keys["assigned_num"])
}
inverse_mouseID_dict = {
    num: mouse
    for mouse, num in zip(meta_keys["mouse_id"], meta_keys["assigned_num"])
}

In [6]:
## (currently disabled) converting mouse ids in the deltas table to the same numbers as the metadata
##family_deltas_df["mouse_id"] = family_deltas_df["mouse_id"].map(mouse_id_dict)

## Bloom Status

In [9]:
## generating the x (family deltas) and y (bloom_status) tables for the bloom analysis
bloom_xy = make_xy_tables(meta,
                          family_deltas_df,
                          merge_on="mouse_id",
                          y_col="bloom_status")

bloom_x_df, bloom_y_df = bloom_xy["x_dataframe"], bloom_xy["y_dataframe"]

In [10]:
## run the kfold boruta shap analysis once per selector in borutaShap_dict
## NOTE(review): run_name is "bloom" for every selector, so the per-fold result
## files would share names if more than one selector were added to the dict
bloom_bs_acc_train = {}
for label, boruta_shap in borutaShap_dict.items():
    bloom_results = kfold_boruta_shap(run_name="bloom",
                                      k_fold=kf,
                                      feature_selector=boruta_shap,
                                      x_dataframe=bloom_x_df,
                                      y_dataframe=bloom_y_df,
                                      trial_num=100)

    ## occurence table of the features accepted on the training folds,
    ## tagged with the selector that produced them
    bloom_train_list = bloom_results["bloom_train"]
    bs_bloom_train_df = create_occurence_table(input_list=bloom_train_list).assign(bs_model=label)

    bloom_bs_acc_train[f"{label}_accepted"] = bs_bloom_train_df


## stack the per-selector tables into one frame with a fresh 0..n-1 index
bloom_train_features = pd.concat(list(bloom_bs_acc_train.values()), ignore_index=True)

Fold 0:
Training dataset index: [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46]
Testing dataset index: [0 1 2 3 4 5 6 7 8 9]


  0%|          | 0/100 [00:00<?, ?it/s]

5 attributes confirmed important: [' f__UCG-010', ' f__Erysipelotrichaceae', ' f__Peptostreptococcaceae', ' f__Akkermansiaceae', ' f__RF39']
46 attributes confirmed unimportant: [' f__Lactobacillaceae', ' f__Sphingomonadaceae', ' f__Morganellaceae', ' f__Moorellaceae', ' f__Spirosomaceae', ' f__Oscillospiraceae', ' f__Rikenellaceae', ' f__Bacteroidaceae', ' f__Saccharimonadaceae', ' f__Marinifilaceae', ' f__Bacillaceae', ' f__uncultured', ' f__Anaerofustaceae', ' f__Clostridia_vadinBB60_group', ' f__Eggerthellaceae', ' f__Deferribacteraceae', ' f__Clostridiaceae', ' f__Prevotellaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Desulfovibrionaceae', ' f__Clostridia_UCG-014', ' f__Xanthomonadaceae', ' f__Erysipelatoclostridiaceae', ' f__Moraxellaceae', ' f__Hungateiclostridiaceae', ' f__Peptococcaceae', ' f__AKAU3644', ' f__Enterobacteriaceae', ' f__Sutterellaceae', ' f__Ruminococcaceae', ' f__Christensenellaceae', ' f__Weeksellaceae', ' f__Bifidobacteriaceae', ' f__Beggiatoaceae

  0%|          | 0/100 [00:00<?, ?it/s]

5 attributes confirmed important: [' f__Deferribacteraceae', ' f__Erysipelotrichaceae', ' f__Peptostreptococcaceae', ' f__Akkermansiaceae', ' f__Enterococcaceae']
44 attributes confirmed unimportant: [' f__Lactobacillaceae', ' f__Sphingomonadaceae', ' f__Morganellaceae', ' f__Moorellaceae', ' f__Spirosomaceae', ' f__Oscillospiraceae', ' f__Rikenellaceae', ' f__Bacteroidaceae', ' f__Saccharimonadaceae', ' f__Marinifilaceae', ' f__Bacillaceae', ' f__uncultured', ' f__Anaerofustaceae', ' f__Clostridia_vadinBB60_group', ' f__Eggerthellaceae', ' f__Clostridiaceae', ' f__Prevotellaceae', ' f__Desulfovibrionaceae', ' f__Clostridia_UCG-014', ' f__Xanthomonadaceae', ' f__Erysipelatoclostridiaceae', ' f__Moraxellaceae', ' f__Hungateiclostridiaceae', ' f__Peptococcaceae', ' f__UCG-010', ' f__AKAU3644', ' f__Sutterellaceae', ' f__Ruminococcaceae', ' f__Christensenellaceae', ' f__Weeksellaceae', ' f__Bifidobacteriaceae', ' f__Beggiatoaceae', ' f__Alicyclobacillaceae', ' f__Lachnospiraceae', ' f__St

  0%|          | 0/100 [00:00<?, ?it/s]

7 attributes confirmed important: [' f__Peptococcaceae', ' f__Erysipelotrichaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Desulfovibrionaceae', ' f__Peptostreptococcaceae', ' f__Akkermansiaceae', ' f__Enterococcaceae']
43 attributes confirmed unimportant: [' f__Lactobacillaceae', ' f__Sphingomonadaceae', ' f__Morganellaceae', ' f__Moorellaceae', ' f__Spirosomaceae', ' f__Rikenellaceae', ' f__Oscillospiraceae', ' f__Bacteroidaceae', ' f__Saccharimonadaceae', ' f__Marinifilaceae', ' f__Bacillaceae', ' f__uncultured', ' f__Anaerofustaceae', ' f__Clostridia_vadinBB60_group', ' f__Eggerthellaceae', ' f__Deferribacteraceae', ' f__Clostridiaceae', ' f__Prevotellaceae', ' f__Clostridia_UCG-014', ' f__Xanthomonadaceae', ' f__Erysipelatoclostridiaceae', ' f__Moraxellaceae', ' f__Hungateiclostridiaceae', ' f__UCG-010', ' f__AKAU3644', ' f__Sutterellaceae', ' f__Ruminococcaceae', ' f__Christensenellaceae', ' f__Weeksellaceae', ' f__Bifidobacteriaceae', ' f__Beggiatoaceae', ' f__Alicycl

  0%|          | 0/100 [00:00<?, ?it/s]

5 attributes confirmed important: [' f__Erysipelotrichaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Peptostreptococcaceae', ' f__Akkermansiaceae', ' f__Enterococcaceae']
46 attributes confirmed unimportant: [' f__Lactobacillaceae', ' f__Sphingomonadaceae', ' f__Morganellaceae', ' f__Moorellaceae', ' f__Spirosomaceae', ' f__Oscillospiraceae', ' f__Rikenellaceae', ' f__Bacteroidaceae', ' f__Saccharimonadaceae', ' f__Marinifilaceae', ' f__Bacillaceae', ' f__uncultured', ' f__Anaerofustaceae', ' f__Clostridia_vadinBB60_group', ' f__Eggerthellaceae', ' f__Deferribacteraceae', ' f__Clostridiaceae', ' f__Prevotellaceae', ' f__Desulfovibrionaceae', ' f__Clostridia_UCG-014', ' f__Xanthomonadaceae', ' f__Erysipelatoclostridiaceae', ' f__Moraxellaceae', ' f__Hungateiclostridiaceae', ' f__Peptococcaceae', ' f__UCG-010', ' f__AKAU3644', ' f__Enterobacteriaceae', ' f__Sutterellaceae', ' f__Ruminococcaceae', ' f__Christensenellaceae', ' f__Weeksellaceae', ' f__Bifidobacteriaceae', ' f__Be

  0%|          | 0/100 [00:00<?, ?it/s]

4 attributes confirmed important: [' f__Enterococcaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Akkermansiaceae', ' f__Erysipelotrichaceae']
44 attributes confirmed unimportant: [' f__Lactobacillaceae', ' f__Sphingomonadaceae', ' f__Morganellaceae', ' f__Moorellaceae', ' f__Spirosomaceae', ' f__Oscillospiraceae', ' f__Rikenellaceae', ' f__Bacteroidaceae', ' f__Saccharimonadaceae', ' f__Marinifilaceae', ' f__Bacillaceae', ' f__uncultured', ' f__Anaerofustaceae', ' f__Clostridia_vadinBB60_group', ' f__Eggerthellaceae', ' f__Deferribacteraceae', ' f__Clostridiaceae', ' f__Prevotellaceae', ' f__Desulfovibrionaceae', ' f__Clostridia_UCG-014', ' f__Xanthomonadaceae', ' f__Erysipelatoclostridiaceae', ' f__Moraxellaceae', ' f__Hungateiclostridiaceae', ' f__Peptococcaceae', ' f__UCG-010', ' f__AKAU3644', ' f__Sutterellaceae', ' f__Ruminococcaceae', ' f__Christensenellaceae', ' f__Weeksellaceae', ' f__Bifidobacteriaceae', ' f__Beggiatoaceae', ' f__Alicyclobacillaceae', ' f__Lachnospi

- f__RF39: unclassified enterococcaceae bacterium
- f__UCG-010: I have no idea. After looking it up on the SILVA database website (the database I used to classify the taxonomy for this data in QIIME), its placement is uncertain (incertae sedis), but there are a bunch of uncultured clostridiales/clostridiaceae bacteria under it (I wouldn't rely on that too much though, because there are 973 unclassified bacteria under UCG-010)

In [11]:
## display the combined occurence table of boruta shap accepted features for bloom status
bloom_train_features

Unnamed: 0,feature,num_occurences,av_occurences,bs_model
0,f__Akkermansiaceae,5,1.0,random_forest
1,f__Erysipelotrichaceae,5,1.0,random_forest
2,f__Enterococcaceae,4,0.8,random_forest
3,f__Peptostreptococcaceae,4,0.8,random_forest
4,f__[Eubacterium]_coprostanoligenes_group,3,0.6,random_forest
5,f__Deferribacteraceae,1,0.2,random_forest
6,f__Desulfovibrionaceae,1,0.2,random_forest
7,f__Peptococcaceae,1,0.2,random_forest
8,f__RF39,1,0.2,random_forest
9,f__UCG-010,1,0.2,random_forest


## Diet

In [14]:
## x (family deltas) and y (diet) tables for the kfold and boruta shap analysis
diet_xy = make_xy_tables(meta,
                         family_deltas_df,
                         merge_on="mouse_id",
                         y_col="diet")

diet_x_df, diet_y_df = diet_xy["x_dataframe"], diet_xy["y_dataframe"]

In [17]:
## diet is not a binary outcome, so only the random forest selector is run here
## (I couldn't use both the random forest and gradient boosting selectors on it
## because a non-binary target is not supported)
diet_results = kfold_boruta_shap(run_name='diet',
                                 k_fold=kf,
                                 feature_selector=random_forest_bs,
                                 x_dataframe=diet_x_df,
                                 y_dataframe=diet_y_df,
                                 trial_num=100)

Fold 0:
Training dataset index: [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]
Testing dataset index: [0 1 2 3 4 5 6 7 8 9]


  0%|          | 0/100 [00:00<?, ?it/s]

0 attributes confirmed important: []
52 attributes confirmed unimportant: [' f__Monoglobaceae', ' f__Weeksellaceae', ' f__Beggiatoaceae', ' f__RF39', ' f__Xanthomonadaceae', ' f__Streptococcaceae', ' f__Gastranaerophilales', ' f__Tannerellaceae', ' f__Erysipelatoclostridiaceae', ' f__Bacteroidaceae', ' f__Oscillospiraceae', ' f__Erysipelotrichaceae', ' f__Peptococcaceae', ' f__Spirosomaceae', ' f__Bifidobacteriaceae', ' f__Moorellaceae', ' f__Oscillospirales', ' f__Butyricicoccaceae', ' f__Clostridiaceae', ' f__Moraxellaceae', ' f__Christensenellaceae', ' f__Deferribacteraceae', ' f__Saccharimonadaceae', ' f__Prevotellaceae', ' f__Lactobacillaceae', ' f__Alicyclobacillaceae', ' f__AKAU3644', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Sutterellaceae', ' f__Peptostreptococcaceae', ' f__Anaerovoracaceae', ' f__Anaerofustaceae', ' f__Enterobacteriaceae', ' f__Bacillaceae', ' f__Enterococcaceae', ' f__Clostridia_vadinBB60_group', ' f__Hungateiclostridiaceae', ' f__Ruminococcaceae', '

  0%|          | 0/100 [00:00<?, ?it/s]

3 attributes confirmed important: [' f__Rikenellaceae', ' f__Butyricicoccaceae', ' f__Erysipelatoclostridiaceae']
49 attributes confirmed unimportant: [' f__Monoglobaceae', ' f__Weeksellaceae', ' f__Beggiatoaceae', ' f__RF39', ' f__Xanthomonadaceae', ' f__Streptococcaceae', ' f__Gastranaerophilales', ' f__Tannerellaceae', ' f__Bacteroidaceae', ' f__Oscillospiraceae', ' f__Erysipelotrichaceae', ' f__Peptococcaceae', ' f__Spirosomaceae', ' f__Bifidobacteriaceae', ' f__Moorellaceae', ' f__Oscillospirales', ' f__Clostridiaceae', ' f__Moraxellaceae', ' f__Christensenellaceae', ' f__Deferribacteraceae', ' f__Saccharimonadaceae', ' f__Prevotellaceae', ' f__Lactobacillaceae', ' f__Alicyclobacillaceae', ' f__AKAU3644', ' f__Peptostreptococcaceae', ' f__Sutterellaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Anaerovoracaceae', ' f__Anaerofustaceae', ' f__Enterobacteriaceae', ' f__Bacillaceae', ' f__Enterococcaceae', ' f__Clostridia_vadinBB60_group', ' f__Hungateiclostridiaceae', ' f__

  0%|          | 0/100 [00:00<?, ?it/s]

0 attributes confirmed important: []
52 attributes confirmed unimportant: [' f__Monoglobaceae', ' f__Weeksellaceae', ' f__Beggiatoaceae', ' f__RF39', ' f__Xanthomonadaceae', ' f__Streptococcaceae', ' f__Gastranaerophilales', ' f__Tannerellaceae', ' f__Erysipelatoclostridiaceae', ' f__Bacteroidaceae', ' f__Oscillospiraceae', ' f__Erysipelotrichaceae', ' f__Peptococcaceae', ' f__Spirosomaceae', ' f__Bifidobacteriaceae', ' f__Butyricicoccaceae', ' f__Moorellaceae', ' f__Oscillospirales', ' f__Clostridiaceae', ' f__Moraxellaceae', ' f__Christensenellaceae', ' f__Deferribacteraceae', ' f__Saccharimonadaceae', ' f__Prevotellaceae', ' f__Lactobacillaceae', ' f__Alicyclobacillaceae', ' f__AKAU3644', ' f__Peptostreptococcaceae', ' f__Sutterellaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Anaerovoracaceae', ' f__Anaerofustaceae', ' f__Enterobacteriaceae', ' f__Bacillaceae', ' f__Enterococcaceae', ' f__Clostridia_vadinBB60_group', ' f__Hungateiclostridiaceae', ' f__Ruminococcaceae', '

  0%|          | 0/100 [00:00<?, ?it/s]

0 attributes confirmed important: []
52 attributes confirmed unimportant: [' f__Monoglobaceae', ' f__Weeksellaceae', ' f__Beggiatoaceae', ' f__RF39', ' f__Xanthomonadaceae', ' f__Streptococcaceae', ' f__Gastranaerophilales', ' f__Tannerellaceae', ' f__Erysipelatoclostridiaceae', ' f__Bacteroidaceae', ' f__Oscillospiraceae', ' f__Erysipelotrichaceae', ' f__Peptococcaceae', ' f__Spirosomaceae', ' f__Bifidobacteriaceae', ' f__Butyricicoccaceae', ' f__Moorellaceae', ' f__Oscillospirales', ' f__Clostridiaceae', ' f__Moraxellaceae', ' f__Christensenellaceae', ' f__Deferribacteraceae', ' f__Saccharimonadaceae', ' f__Prevotellaceae', ' f__Lactobacillaceae', ' f__Alicyclobacillaceae', ' f__AKAU3644', ' f__Peptostreptococcaceae', ' f__Sutterellaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Anaerovoracaceae', ' f__Anaerofustaceae', ' f__Enterobacteriaceae', ' f__Bacillaceae', ' f__Enterococcaceae', ' f__Clostridia_vadinBB60_group', ' f__Hungateiclostridiaceae', ' f__Ruminococcaceae', '

  0%|          | 0/100 [00:00<?, ?it/s]

0 attributes confirmed important: []
52 attributes confirmed unimportant: [' f__Monoglobaceae', ' f__Weeksellaceae', ' f__Beggiatoaceae', ' f__RF39', ' f__Xanthomonadaceae', ' f__Streptococcaceae', ' f__Gastranaerophilales', ' f__Tannerellaceae', ' f__Erysipelatoclostridiaceae', ' f__Bacteroidaceae', ' f__Oscillospiraceae', ' f__Erysipelotrichaceae', ' f__Peptococcaceae', ' f__Spirosomaceae', ' f__Bifidobacteriaceae', ' f__Butyricicoccaceae', ' f__Moorellaceae', ' f__Oscillospirales', ' f__Clostridiaceae', ' f__Moraxellaceae', ' f__Christensenellaceae', ' f__Deferribacteraceae', ' f__Saccharimonadaceae', ' f__Prevotellaceae', ' f__Lactobacillaceae', ' f__Alicyclobacillaceae', ' f__AKAU3644', ' f__Peptostreptococcaceae', ' f__Sutterellaceae', ' f__[Eubacterium]_coprostanoligenes_group', ' f__Anaerovoracaceae', ' f__Anaerofustaceae', ' f__Enterobacteriaceae', ' f__Bacillaceae', ' f__Enterococcaceae', ' f__Clostridia_vadinBB60_group', ' f__Hungateiclostridiaceae', ' f__Ruminococcaceae', '

In [18]:
## occurence table of the features boruta shap accepted on the diet training folds
diet_train_list = diet_results["diet_train"]
bs_diet_train_df = create_occurence_table(diet_train_list)

bs_diet_train_df

Unnamed: 0,feature,num_occurences,av_occurences
0,f__Butyricicoccaceae,1,0.2
1,f__Erysipelatoclostridiaceae,1,0.2
2,f__Rikenellaceae,1,0.2


**Saving my outputs**

In [16]:
## write each accepted-feature table out as a tab-separated file (bloom status, then diet)
for out_fp, out_df in (
    ("../data/outputs/borutaShap_bloom_train.tsv", bloom_train_features),
    ("../data/outputs/borutaShap_diet_train.tsv", bs_diet_train_df),
):
    out_df.to_csv(out_fp, sep="\t")