# Feature selection

## Imports

In [2]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from xgboost import XGBClassifier
import pandas as pd
import catboost as cb
import numpy as np
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint, uniform
from tqdm.notebook import tqdm
import pickle
from datetime import datetime
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

## Load data

In [3]:
# Load genus relative abundance data as features
# X = pd.read_csv("https://raw.githubusercontent.com/per6x/CSE3000/master/kraken_taxonomy/genus_relative_abundance.csv", sep=";")
# Load species relative abundance data as features
X = pd.read_csv("https://raw.githubusercontent.com/per6x/CSE3000/master/kraken_taxonomy/species_relative_abundance.csv", sep=";")
# Load labels 
y = pd.read_csv("https://raw.githubusercontent.com/per6x/CSE3000/master/labels.csv", sep=";")
y = y.set_index("Sample", drop=True)["Class"]
X = X.set_index("Sample", drop=True)
print(X.shape)
assert X.shape[0] == y.shape[0]

(212, 4630)


## Split the data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.25, random_state=42) 

full_pool = cb.Pool(data=X, label=y)

train_pool = cb.Pool(data=X_train, label=y_train)
eval_pool = cb.Pool(data=X_eval, label=y_eval)
test_pool = cb.Pool(data=X_test, label=y_test)

## Define K-Fold, models and param distributions

In [6]:
# K-Fold
skf = StratifiedKFold(n_splits=10, shuffle=True)
# models
models = {
  'CatBoost': cb.CatBoostClassifier(thread_count=-1, verbose=False, random_state=42),
  'XGBoost': XGBClassifier(n_jobs=-1, random_state=42),
  'RF': RandomForestClassifier(warm_start=True, n_jobs=-1, random_state=42),
  'AdaBoost': AdaBoostClassifier(random_state=42)
}

## Select Features (top 100)

### CatBoost

In [19]:
catboost_top_100 = models['CatBoost'].select_features(
    train_pool,
    eval_set=eval_pool,
    features_for_select=list(range(train_pool.num_col())),
    num_features_to_select=100,
    steps=1,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Verbose',
)

### XGBoost

In [10]:
xgboost_top_100 = SelectFromModel(estimator=models['XGBoost'], max_features=100, threshold=-np.inf)
xgboost_top_100.fit(X_train, y_train)

In [21]:
models['XGBoost'].fit(X_train.iloc[:,xgboost_top_100.get_support()], y_train)

In [26]:
f1_score(models['XGBoost'].predict(X_test.iloc[:,xgboost_top_100.get_support()]), y_test)

NameError: name 'f1_score' is not defined

In [None]:
accuracy_score(y_test, )

### AdaBoost

In [None]:
adaboost_top_100 = SelectFromModel(estimator=models['AdaBoost'], max_features=100, threshold=-np.inf)
adaboost_top_100.fit(X_train, y_train)

### RF

In [None]:
rf_top_100 = SelectFromModel(estimator=models['RF'], max_features=100, threshold=-np.inf)
rf_top_100.fit(X_train, y_train)