In [None]:
# Import Libraries
import numpy as np
import pandas as pd
from functools import reduce
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# Import modules
from utils.filter_data import filter_data
from utils.extract_metas import extract_metas

from pipeline.selection_pipeline import SelectionPipeline
from selection_methods.lasso_method import LassoMethod
from selection_methods.alasso_method import AlassoMethod
from selection_methods.elasticnet_method import ElasticNetMethod
from selection_methods.mlrrf_method import MLRRFMethod
from selection_methods.relieff_method import ReliefFMethod
from selection_methods.svmrfe_method import SVMRFEMethod
from selection_methods.boruta_method import BorutaMethod

In [None]:
# Base metabolite panel (a set of metabolite column names).
BASE_PANEL = {
    'Succinate_neg-079',
    'Uridine_neg-088',
    'S-Adenosyl-methionine_pos-139',
    'N-Acetyl-D-glucosamine 6-phosphate_neg-061',
    'Serotonin_pos-142',
    'Pyroglutamic acid_neg-072',
    'Neopterin_pos-117',
    'Lactic acid_neg-055',
    '2-Aminooctanoic acid_pos-006',
    'NMN_pos-162',
}

# Hyper-parameters shared by several selectors, named once so they stay in sync.
N_FEATURES = 15
PENALTY_KWARGS = dict(alpha=0.005, max_iter=1000)

# Selectors in the order they are registered on the pipeline.
selection_methods = [
    LassoMethod(n_features=N_FEATURES, **PENALTY_KWARGS),
    AlassoMethod(n_features=N_FEATURES, **PENALTY_KWARGS),
    ElasticNetMethod(n_features=N_FEATURES, l1_ratio=0.5, **PENALTY_KWARGS),
    MLRRFMethod(n_features=N_FEATURES, random_state=42),
    SVMRFEMethod(n_features=N_FEATURES),
    BorutaMethod(n_features=N_FEATURES),
    ReliefFMethod(n_features=N_FEATURES, n_neighbors=100),
]

# Create the feature selection pipeline and register every selector on it.
pipeline = SelectionPipeline()
for selection_method in selection_methods:
    pipeline.add_method(selection_method)

In [None]:
# Load data, preprocess data, and make different batches
N_ROUNDS = 10000   # number of resampling rounds; the round index is also the split seed
TEST_SIZE = 0.33   # fraction of every batch held out in each round
# Non-feature columns removed before the matrix is handed to the pipeline.
NON_FEATURE_COLS = ['Batch', 'type', 'batch', 'state', 'sample_id']

discovery_set = pd.read_excel('data/discovery_set.xlsx', index_col=0)
# Binary target: 0 when type is 'N', 1 for every other value (vectorised, no row-wise apply).
discovery_set['state'] = (discovery_set['type'] != 'N').astype(int)
labels = discovery_set['type']

plasma_df = discovery_set.copy()
# One frame per batch, re-indexed from 0 so positional split indices line up with the rows.
batch1 = plasma_df[plasma_df['batch'] == 'batch1'].reset_index(drop=True)
batch2 = plasma_df[plasma_df['batch'] == 'batch2'].reset_index(drop=True)
batch3 = plasma_df[plasma_df['batch'] == 'batch3'].reset_index(drop=True)

# These metabolites are excluded as features due to poor peak shapes in mass spectrometry.
exclude_mets = ['Glycerate-2P_Glycerate-3P_neg-006', 'Citraconic acid_neg-025',
                'Pyridoxine_pos-137', 'Argininosuccinic acid_pos-039']


def split_batch(batch_df, random_state, test_size=TEST_SIZE):
    """Split one batch into train and test rows, stratified on its 'type' column.

    Parameters
    ----------
    batch_df : pandas.DataFrame
        Rows of a single batch; must contain a 'type' column.
    random_state : int
        Seed passed to StratifiedShuffleSplit.
    test_size : float
        Fraction of rows placed in the test part.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train rows and the test rows of ``batch_df``.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair of positions.
    train_index, test_index = next(splitter.split(batch_df, batch_df['type']))
    return batch_df.iloc[train_index], batch_df.iloc[test_index]


def feature_frame(samples):
    """Return ``samples`` without the non-feature columns and the excluded metabolites."""
    features = samples.drop(columns=NON_FEATURE_COLS)
    # exclude_mets used to be declared but never applied; drop them here.
    # errors='ignore' keeps this a no-op when the sheet no longer contains them.
    return features.drop(columns=exclude_mets, errors='ignore')


for i in range(N_ROUNDS):
    random_state = i

    # Split every batch with the same seed, then pool the parts across batches.
    batch_splits = [split_batch(batch, random_state) for batch in (batch1, batch2, batch3)]
    train_set = pd.concat([train_part for train_part, _ in batch_splits], axis=0).reset_index(drop=True)
    test_set = pd.concat([test_part for _, test_part in batch_splits], axis=0).reset_index(drop=True)

    # Class labels
    ytrain_stratified = train_set['state']
    ytest_stratified = test_set['state']
    ytrain = ytrain_stratified.to_numpy(dtype=np.float32)
    ytest = ytest_stratified.to_numpy(dtype=np.float32)

    # Feature matrices with the unneeded columns dropped
    Xtrain_stratified = feature_frame(train_set)
    Xtest_stratified = feature_frame(test_set)

    # Tell the pipeline which metabolite name belongs to each feature column.
    metas = list(Xtrain_stratified.columns)
    pipeline.metas = metas

    Xtrain = Xtrain_stratified.values
    Xtest = Xtest_stratified.values

    # Apply Feature Selection Pipeline on Data
    pipeline.apply(Xtrain, ytrain)

{'LassoMethod': {'Taurine_neg-080', 'Arginine_pos-038', 'Fumaric acid_neg-036', "S-(5'-Adenosyl)-L-homocysteine_neg-075", '7-Methylguanosine_pos-028', 'N-Acetylputrescine_pos-114', 'Uridine_neg-088', '2-Aminooctanoic acid_pos-006', 'Acetylcarnitine_pos-029', 'S-Adenosyl-methionine_pos-139', 'Succinate_neg-079', 'Citrate_neg-026', 'Hypoxanthine_pos-096', '4-Hydroxyphenylpyruvic acid_neg-013'}}


In [None]:

# Get the output metabolites from each method
final_metas = extract_metas(pipeline.method_metas, BASE_PANEL)

# Filter our datasets using extracted metabolites.
# The test set is read from its own workbook: it was previously loaded from
# 'data/Xtrain.xlsx' as well, which made Xtest identical to Xtrain.
# NOTE(review): assumes the test workbook is stored as 'data/Xtest.xlsx' — confirm the path.
Xtrain = filter_data('data/Xtrain.xlsx', columns=final_metas)
Xtest = filter_data('data/Xtest.xlsx', columns=final_metas)


Unnamed: 0,NMN_pos-162,Arginine_pos-038,S-(5'-Adenosyl)-L-homocysteine_neg-075,7-Methylguanosine_pos-028,Acetylcarnitine_pos-029,Neopterin_pos-117,Serotonin_pos-142,Hypoxanthine_pos-096,N-Acetyl-D-glucosamine 6-phosphate_neg-061,4-Hydroxyphenylpyruvic acid_neg-013,Taurine_neg-080,Lactic acid_neg-055,Fumaric acid_neg-036,N-Acetylputrescine_pos-114,Uridine_neg-088,Pyroglutamic acid_neg-072,2-Aminooctanoic acid_pos-006,S-Adenosyl-methionine_pos-139,Citrate_neg-026,Succinate_neg-079
Batch01_49N,0.000008,0.002939,0.000012,0.000054,0.042355,0.000058,0.000035,0.002818,0.000105,0.000020,0.003560,0.023980,0.000120,0.000252,0.000678,0.000238,0.000052,0.000606,0.004283,0.000857
Batch01_41N,0.000003,0.002816,0.000010,0.000019,0.039183,0.000042,0.000079,0.000107,0.000077,0.000020,0.002165,0.024642,0.000158,0.000231,0.000598,0.000282,0.000070,0.000842,0.028489,0.000702
Batch01_97N,0.000002,0.001968,0.000010,0.000018,0.056677,0.000043,0.000038,0.000127,0.000126,0.000004,0.003681,0.024888,0.000169,0.000185,0.000495,0.000433,0.000135,0.000702,0.004381,0.000677
Batch01_113N,0.000016,0.003053,0.000010,0.000044,0.044983,0.000038,0.000183,0.001197,0.000084,0.000032,0.004947,0.023912,0.000144,0.000137,0.000475,0.000284,0.000128,0.000635,0.013414,0.000645
Batch01_65N,0.000015,0.003202,0.000008,0.000039,0.071797,0.000030,0.000023,0.001766,0.000119,0.000024,0.004134,0.021813,0.000181,0.000160,0.000460,0.000245,0.000060,0.000396,0.018188,0.000818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Batch19_408N,0.000007,0.004769,0.000013,0.000077,0.062483,0.000054,0.000020,0.002067,0.000142,0.000159,0.004095,0.017484,0.000269,0.000241,0.000672,0.000179,0.000070,0.000691,0.010708,0.000817
Batch19_424P,0.000056,0.006179,0.000008,0.000092,0.042481,0.000076,0.001182,0.002056,0.000214,0.000221,0.004063,0.012390,0.000174,0.000289,0.000414,0.000116,0.000074,0.001557,0.016428,0.001164
Batch19_454P,0.000058,0.006823,0.000004,0.000102,0.057233,0.000048,0.000671,0.004615,0.000194,0.000166,0.004582,0.011409,0.000290,0.000313,0.000395,0.000138,0.000042,0.001059,0.015091,0.000687
Batch19_434P,0.000017,0.009083,0.000006,0.000094,0.050957,0.000063,0.000309,0.001565,0.000132,0.000186,0.002799,0.008891,0.000148,0.000195,0.000379,0.000116,0.000069,0.001095,0.019204,0.000611
