In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [18]:
#To ignore warning
import warnings
warnings.filterwarnings('ignore')

## RCC metabolomics data

In [19]:
cohort = pd.read_excel('data/fullcohort.xlsx', index_col=0)

In [20]:
# Select the biomarkers for RCC in urine. 
final_features_ID = {'2-phenylacetamide', 
                     'lys-Ile/lys-leu', 
                     'Dibutylamine',
                     'hippuric acid',
                     'hippurate-mannitol',
                     '2-mercaptobenzothiazole',
                     'N-acetyl-glucosaminic acid',
                     'Groups'}

In [21]:
# numpy random seed.
np.random.seed(42)

In [25]:
# Transform dataset vis autoscaling 
df = cohort[list(final_features_ID)]

In [26]:
df.head()

Unnamed: 0,hippurate-mannitol,Dibutylamine,hippuric acid,2-mercaptobenzothiazole,N-acetyl-glucosaminic acid,2-phenylacetamide,Groups,lys-Ile/lys-leu
0,344514.838847,123738.863361,7276.20605,3942.561187,107672.229359,2818504.0,RCC,522552.8
1,566946.267693,117934.731942,4049.040213,36871.411846,94714.545602,1781136.0,RCC,1692409.0
2,507622.708725,352899.235704,1191.185002,15746.465206,80282.421425,952056.1,RCC,840617.1
3,590259.167426,351289.507895,1384.50771,41132.333747,140790.012284,682468.9,RCC,1085230.0
4,322294.000442,165126.186787,5505.012787,17861.908096,256558.919952,2297574.0,RCC,796809.5


In [27]:
df['Groups'] = df['Groups'].map({'Control': 0, 'RCC': 1})

In [28]:
df.head()

Unnamed: 0,hippurate-mannitol,Dibutylamine,hippuric acid,2-mercaptobenzothiazole,N-acetyl-glucosaminic acid,2-phenylacetamide,Groups,lys-Ile/lys-leu
0,344514.838847,123738.863361,7276.20605,3942.561187,107672.229359,2818504.0,1,522552.8
1,566946.267693,117934.731942,4049.040213,36871.411846,94714.545602,1781136.0,1,1692409.0
2,507622.708725,352899.235704,1191.185002,15746.465206,80282.421425,952056.1,1,840617.1
3,590259.167426,351289.507895,1384.50771,41132.333747,140790.012284,682468.9,1,1085230.0
4,322294.000442,165126.186787,5505.012787,17861.908096,256558.919952,2297574.0,1,796809.5


In [29]:
# Separate target and features
target = "Groups" # Replace with your target column name.
X = df.drop(target, axis=1)
y = df[target]

In [30]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42) 
# Adjust test_size and random_state as needed

In [31]:
# Combine the features and target for train and test dataframes
RCCtrain_df = pd.concat([X_train, y_train], axis=1)
RCCtest_df = pd.concat([X_test, y_test], axis=1)

In [33]:
RCCtest_df.head()

Unnamed: 0,hippurate-mannitol,Dibutylamine,hippuric acid,2-mercaptobenzothiazole,N-acetyl-glucosaminic acid,2-phenylacetamide,lys-Ile/lys-leu,Groups
224,2379022.0,103829.313655,16120.784491,5183.678824,68247.343844,5252678.0,1112001.0,0
6,619055.1,80520.905159,18428.529659,10722.213325,50565.826913,3180798.0,368181.4,1
28,1112162.0,407921.911864,1440.743047,122186.252174,94474.316727,1384680.0,2822.377,1
198,2697248.0,66899.646344,63881.438382,1656.367242,105111.851103,4960717.0,3560.525,0
87,448867.1,162471.909775,17500.447111,770.756689,1831.944831,4988279.0,578.4373,0


In [35]:
# Save the train and test dataframes as csv files

#RCCtrain_df.to_excel("RCC_train_dataframe.xlsx", index=False)
#RCCtest_df.to_excel("RCC_test_dataframe.xlsx", index=False)

## Ovarian metabolomics data