# CREATE INPUT FILES FOR DIABLO

These files will be used as input for DIABLO in R.

In [None]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

In [None]:
# Root folder (relative to the notebook) that every input file is read from
# and under which the DIABLO_INPUT output files are written.
data_source = "../data/"

## RNASEQ

In [None]:
# Full path of the Excel workbook with the GSE98923 sample metadata.
metadata_file = os.path.join(
    data_source,
    'GSE98923_metadata.xlsx',
)

In [None]:
# Read the 'NO_REPLICATES' sheet of the metadata workbook; the first column
# becomes the row index. The bare name on the last line displays the table.
metadata = pd.read_excel(
    metadata_file,
    sheet_name='NO_REPLICATES',
    index_col=0,
)
metadata

In [None]:
# Load the expression table (first CSV column used as index) and transpose it
# so that the CSV's columns become the rows; the rows are what the
# cross-validation split below partitions.
# The file name suggests log-TPM values without replicates -- confirm.
data_all_genes = pd.read_csv(
    os.path.join(data_source, 'GREAT_LOG_TPM_GSE98923_NOREPS.csv'),
    index_col=0,
).T
data_all_genes.shape

In [None]:
# Class label of every sample: the 'state' column of the metadata table.
# It is used both to stratify the folds and as the target for f_classif.
y_state = metadata.loc[:, 'state']

In [None]:
# Per-fold bookkeeping filled by the RNA-seq loop and reused by the other
# omics layers: positional train rows, positional test rows, training labels.
trains_index, tests_index, ys_train = [], [], []

In [None]:
# Build the RNA-seq inputs for DIABLO: for each of 5 stratified CV folds,
# write the scaled train/test feature matrices and the train/test labels.
#
# Every transformer is fitted on the training rows of the fold only and then
# applied unchanged to the test rows:
#   1. VarianceThreshold(0.1) removes low-variance genes;
#   2. SelectKBest(f_classif) keeps the (at most) 500 genes with the highest
#      ANOVA F-score against the training labels;
#   3. StandardScaler standardises the remaining genes.
# The positional fold indices and the training labels are stored in
# trains_index / tests_index / ys_train so later cells can reuse the splits.

diablo_dir = os.path.join(data_source, 'DIABLO_INPUT')
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(diablo_dir, exist_ok=True)

# Empty the bookkeeping lists in place: without this, re-running the cell
# appends a second copy of every fold and the later loops iterate twice.
trains_index.clear()
tests_index.clear()
ys_train.clear()

skf = StratifiedKFold(n_splits=5)
for i, (train_index, test_index) in enumerate(skf.split(data_all_genes, y_state)):

    X_train = data_all_genes.iloc[train_index, :]
    X_test = data_all_genes.iloc[test_index, :]

    y_train = y_state.iloc[train_index]
    y_test = y_state.iloc[test_index]

    ys_train.append(y_train)
    trains_index.append(train_index)
    tests_index.append(test_index)

    # 1. Variance filter (fitted on the training rows).
    vt = VarianceThreshold(0.1).fit(X_train)
    vt_columns = X_train.columns[vt.get_support(indices=True)]
    X_train_filtered = pd.DataFrame(vt.transform(X_train), index=X_train.index, columns=vt_columns)
    X_test_filtered = pd.DataFrame(vt.transform(X_test), index=X_test.index, columns=vt_columns)

    # 2. Univariate selection. k is capped at the number of surviving genes:
    #    a k larger than n_features is rejected (or warned about, depending on
    #    the scikit-learn version) by SelectKBest.
    n_best = min(500, X_train_filtered.shape[1])
    kb2 = SelectKBest(f_classif, k=n_best).fit(X_train_filtered, y_train)
    kb_columns = X_train_filtered.columns[kb2.get_support(indices=True)]
    X_train_filtered2 = pd.DataFrame(kb2.transform(X_train_filtered), index=X_train_filtered.index, columns=kb_columns)
    X_test_filtered2 = pd.DataFrame(kb2.transform(X_test_filtered), index=X_test_filtered.index, columns=kb_columns)

    # 3. Standardisation with the training fold's statistics.
    scaler_model = StandardScaler().fit(X_train_filtered2)
    X_train_scaled_df = pd.DataFrame(scaler_model.transform(X_train_filtered2), index=X_train_filtered2.index, columns=kb_columns)
    X_test_scaled_df = pd.DataFrame(scaler_model.transform(X_test_filtered2), index=X_test_filtered2.index, columns=kb_columns)

    X_train_scaled_df.to_csv(os.path.join(diablo_dir, 'XTRAIN_RNASEQ_ALL_GENES_NOREPS_SPLIT_' + str(i) + '.csv'))
    X_test_scaled_df.to_csv(os.path.join(diablo_dir, 'XTEST_RNASEQ_ALL_GENES_NOREPS_SPLIT_' + str(i) + '.csv'))

    y_train.to_csv(os.path.join(diablo_dir, 'yTRAIN_ALL_GENES_NOREPS_SPLIT_' + str(i) + '.csv'))
    # NOTE(review): the test-label file name contains '_500_' while the
    # train-label file name does not. The names are kept as they are because
    # downstream scripts may read them -- confirm before harmonising.
    y_test.to_csv(os.path.join(diablo_dir, 'yTEST_ALL_500_GENES_NOREPS_SPLIT_' + str(i) + '.csv'))

## METABOLOMICS

In [None]:
# Read the 'CONVERSION' sheet of the metabolomics metadata workbook (first
# column as index); its 'groups' column is later used to average replicates.
metadata_noreps = pd.read_excel(
    os.path.join(data_source, 'metabolomics_metadata.xlsx'),
    index_col=0,
    sheet_name='CONVERSION',
)

In [None]:
# Load the metabolomics table (first column as index, first row as header),
# discard the 'Method' column, and transpose so the remaining columns of the
# workbook become the rows.
data_reps = pd.read_excel(
    os.path.join(data_source, 'metabolomics.xlsx'),
    header=0,
    index_col=0,
)
data_reps = data_reps.loc[:, data_reps.columns != 'Method'].transpose()
data_reps.shape

In [None]:
# Collapse replicates: average the rows of data_reps that share the same
# value in metadata_noreps['groups'] (one output row per group).
data_mets_noreps = (
    data_reps
    .groupby(metadata_noreps['groups'])
    .mean()
)

In [None]:
# Build the metabolomics inputs for DIABLO, reusing the folds stored in
# trains_index / tests_index by the RNA-seq cell.
#
# Per fold: drop metabolites that are constant in the training rows
# (VarianceThreshold(0)), standardise with the training rows' statistics,
# and write the train/test matrices to CSV.
#
# NOTE(review): the stored fold indices are positional row numbers computed on
# the RNA-seq table. Applying them here assumes data_mets_noreps has the same
# samples in the same row order; groupby(...).mean() returns its rows sorted
# by group label, so please confirm the orders really match.

diablo_dir = os.path.join(data_source, 'DIABLO_INPUT')
# DataFrame.to_csv does not create missing directories.
os.makedirs(diablo_dir, exist_ok=True)

for i, (train_rows, test_rows) in enumerate(zip(trains_index, tests_index)):

    Xtrain_mets = data_mets_noreps.iloc[train_rows, :]
    Xtest_mets = data_mets_noreps.iloc[test_rows, :]

    # Remove zero-variance metabolites (fitted on the training rows only).
    vt = VarianceThreshold(0).fit(Xtrain_mets)
    vt_columns = Xtrain_mets.columns[vt.get_support(indices=True)]
    X_train_filtered_df = pd.DataFrame(vt.transform(Xtrain_mets), index=Xtrain_mets.index, columns=vt_columns)
    X_test_filtered_df = pd.DataFrame(vt.transform(Xtest_mets), index=Xtest_mets.index, columns=vt_columns)

    # Standardise both parts with the scaler fitted on the training rows.
    scaler_model = StandardScaler().fit(X_train_filtered_df)
    X_train_scaled_df = pd.DataFrame(scaler_model.transform(X_train_filtered_df), index=X_train_filtered_df.index, columns=vt_columns)
    X_test_scaled_df = pd.DataFrame(scaler_model.transform(X_test_filtered_df), index=X_test_filtered_df.index, columns=vt_columns)

    X_train_scaled_df.to_csv(os.path.join(diablo_dir, 'XTRAIN_METABOLOMICS_NOREPS_VT_SPLIT_' + str(i) + '.csv'))
    X_test_scaled_df.to_csv(os.path.join(diablo_dir, 'XTEST_METABOLOMICS_NOREPS_VT_SPLIT_' + str(i) + '.csv'))

## FLUXOMICS

In [None]:
# Load the fluxomics table (first CSV column as index), replace missing
# values with 0, and transpose so the CSV's columns become the rows.
data_fluxes = (
    pd.read_csv(os.path.join(data_source, 'fluxomics_fc.csv'), index_col=0)
    .fillna(0)
    .transpose()
)
data_fluxes.shape

In [None]:
# Build the fluxomics inputs for DIABLO, reusing the folds (and training
# labels) stored in trains_index / tests_index / ys_train by the RNA-seq cell.
#
# Per fold, with every transformer fitted on the training rows only:
#   1. VarianceThreshold(0.1) removes low-variance reactions;
#   2. SelectKBest(f_classif) keeps the (at most) 500 reactions with the
#      highest ANOVA F-score against the training labels;
#   3. StandardScaler standardises the remaining reactions.
#
# NOTE(review): the stored fold indices are positional row numbers computed on
# the RNA-seq table, and the labels come from the RNA-seq metadata. This
# assumes data_fluxes has the same samples in the same row order -- confirm.

diablo_dir = os.path.join(data_source, 'DIABLO_INPUT')
# DataFrame.to_csv does not create missing directories.
os.makedirs(diablo_dir, exist_ok=True)

for i, (train_rows, test_rows, y_train) in enumerate(zip(trains_index, tests_index, ys_train)):

    Xtrain_fluxes = data_fluxes.iloc[train_rows, :]
    Xtest_fluxes = data_fluxes.iloc[test_rows, :]

    # 1. Variance filter (fitted on the training rows).
    vt = VarianceThreshold(0.1).fit(Xtrain_fluxes)
    vt_columns = Xtrain_fluxes.columns[vt.get_support(indices=True)]
    X_train_filtered = pd.DataFrame(vt.transform(Xtrain_fluxes), index=Xtrain_fluxes.index, columns=vt_columns)
    X_test_filtered = pd.DataFrame(vt.transform(Xtest_fluxes), index=Xtest_fluxes.index, columns=vt_columns)

    # 2. Univariate selection. k is capped at the number of surviving
    #    reactions: a k larger than n_features is rejected (or warned about,
    #    depending on the scikit-learn version) by SelectKBest.
    n_best = min(500, X_train_filtered.shape[1])
    kb2 = SelectKBest(f_classif, k=n_best).fit(X_train_filtered, y_train)
    kb_columns = X_train_filtered.columns[kb2.get_support(indices=True)]
    X_train_filtered2 = pd.DataFrame(kb2.transform(X_train_filtered), index=X_train_filtered.index, columns=kb_columns)
    X_test_filtered2 = pd.DataFrame(kb2.transform(X_test_filtered), index=X_test_filtered.index, columns=kb_columns)

    # 3. Standardisation with the training rows' statistics.
    scaler_model = StandardScaler().fit(X_train_filtered2)
    X_train_scaled_df = pd.DataFrame(scaler_model.transform(X_train_filtered2), index=X_train_filtered2.index, columns=kb_columns)
    X_test_scaled_df = pd.DataFrame(scaler_model.transform(X_test_filtered2), index=X_test_filtered2.index, columns=kb_columns)

    X_train_scaled_df.to_csv(os.path.join(diablo_dir, 'XTRAIN_FLUXOMICS_REACTIONS_SPLIT_' + str(i) + '.csv'))
    X_test_scaled_df.to_csv(os.path.join(diablo_dir, 'XTEST_FLUXOMICS_REACTIONS_SPLIT_' + str(i) + '.csv'))