In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, StratifiedShuffleSplit, GridSearchCV
# Importing our modules
from pipeline.selection_pipeline import SelectionPipeline
from selection_methods.lasso_method import LassoMethod
from selection_methods.alasso_method import AlassoMethod
from selection_methods.elasticnet_method import ElasticNetMethod
from selection_methods.mlrrf_method import MLRRFMethod
from selection_methods.relieff_method import ReliefFMethod
from selection_methods.svmrfe_method import SVMRFEMethod
from selection_methods.boruta_method import BorutaMethod

In [3]:
# Settings
# Seed for stochastic steps; it is passed to StratifiedShuffleSplit in the
# data-loading cell so the train/test split is the same on every run.
random_state = 42

In [9]:
# Load and Pre-Process Data
# Read the discovery cohort; the first spreadsheet column becomes the index.
discovery_set = pd.read_excel('data/discovery_set.xlsx', index_col=0)
# Binary label: 0 where type == 'N', 1 for every other type.
# Vectorized comparison replaces the former row-wise apply(axis=1); values are the same.
discovery_set['state'] = discovery_set['type'].ne('N').astype('int64')
plasma_df = discovery_set.copy()

# Make a stratified test set (test_size=0.33, stratified on the original 'type' labels)
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=random_state)
# n_splits=1, so the split generator yields exactly one (train, test) index pair.
train_index, test_index = next(splitter.split(plasma_df, plasma_df['type']))
train_set, test_set = plasma_df.iloc[train_index], plasma_df.iloc[test_index]

Xtrain_stratified = train_set
Xtest_stratified = test_set

ytrain_stratified = Xtrain_stratified['state']
ytest_stratified = Xtest_stratified['state']

# Label / metadata columns removed before the frames are used as feature matrices.
# NOTE(review): both 'Batch' and 'batch' are listed; DataFrame.drop raises KeyError
# if any listed label is missing -- confirm both columns exist in the spreadsheet.
non_feature_columns = ['state', 'type', 'Batch', 'batch', 'sample_id']

# Feature matrices as plain NumPy arrays (column names are not carried along).
Xtrain = Xtrain_stratified.drop(columns=non_feature_columns).to_numpy()
Xtest = Xtest_stratified.drop(columns=non_feature_columns).to_numpy()

# Targets as float32 NumPy arrays.
ytrain = ytrain_stratified.to_numpy(dtype=np.float32)
ytest = ytest_stratified.to_numpy(dtype=np.float32)

In [None]:
# Feature Selection

# Metabolites to be excluded as features because of poor peak shapes in mass spectrometry.
# NOTE(review): this list is not referenced anywhere else in this cell, and Xtrain is
# passed to the pipeline unchanged below -- confirm the exclusion is applied in a later step.
exclude_mets = [
    'Glycerate-2P_Glycerate-3P_neg-006',
    'Citraconic acid_neg-025',
    'Pyridoxine_pos-137',
    'Argininosuccinic acid_pos-039',
]

pipeline = SelectionPipeline()

# Feature selection methods to register, in order; each is constructed with n_features=15.
method_specs = [
    (LassoMethod, {'n_features': 15, 'alpha': 1.0, 'max_iter': 1000}),
    (BorutaMethod, {'n_features': 15}),
    (ReliefFMethod, {'n_features': 15, 'n_neighbors': 100}),
]
# Build and register one method at a time (same construct-then-add order as listing them by hand).
for method_cls, method_kwargs in method_specs:
    pipeline.add_method(method_cls(**method_kwargs))

# Run the pipeline on the training split and collect the metabolites output by each method.
metas_dict = pipeline.apply(Xtrain, ytrain)
