This notebook walks through the complete training and testing process for Subchallenge 1 of the [DREAM Birth challenge](https://www.synapse.org/#!Synapse:syn18380862/wiki/590485).

In [1]:
import numpy as np
import pandas as pd
import os.path

# The data directory lives next to the notebook's working directory:
# abspath('') resolves to the current working directory and dirname() steps up
# to its parent. os.path.join avoids a doubled separator when that parent is
# the filesystem root.
PATH_FILES = os.path.join(os.path.dirname(os.path.abspath('')), 'data')

# Sample annotation table; the first CSV column (the sample ID) becomes the index.
anno = pd.read_csv(os.path.join(PATH_FILES, 'anoSC1_v11_nokey.csv'), delimiter = ',', index_col = 0)
anno.head()

Unnamed: 0_level_0,GA,Batch,Set,Train,Platform
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tarca_001_P1A01,11.0,1,PRB_HTA,1,HTA20
Tarca_013_P1B01,15.3,1,PRB_HTA,1,HTA20
Tarca_025_P1C01,21.7,1,PRB_HTA,1,HTA20
Tarca_037_P1D01,26.7,1,PRB_HTA,1,HTA20
Tarca_049_P1E01,31.3,1,PRB_HTA,1,HTA20


In [2]:
# Load the expression matrix and flip it so that each row is one sample
# and each column is one feature (e.g. `1_at`).
expression_csv = PATH_FILES + '/HTA20_RMA.csv'
HTA20_RMA = pd.read_csv(expression_csv, sep = ',', index_col = 0).T
HTA20_RMA.head()

Unnamed: 0,1_at,10_at,100_at,1000_at,10000_at,100009613_at,100009676_at,10001_at,10002_at,10003_at,...,AFFX-BkGr-GC24_at,AFFX-BkGr-GC25_at,AFFX-r2-Bs-dap-5_st,AFFX-r2-Bs-lys-5_st,AFFX-r2-Bs-phe-5_st,AFFX-r2-Bs-thr-5_st,AFFX-r2-Ec-bioB-5_at,AFFX-r2-Ec-bioC-5_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-5_at
Tarca_001_P1A01,6.062215,3.796484,5.849338,3.567779,6.166815,4.443027,5.836522,6.330018,4.922339,2.689344,...,8.786114,11.587051,8.342203,4.520028,5.636615,6.709797,8.972873,10.440245,12.101476,13.695705
Tarca_003_P1A03,6.125023,3.805305,6.191562,3.452524,5.678373,4.773199,6.143398,5.601745,4.711765,2.77101,...,9.594813,12.13809,9.010691,5.148384,6.723139,6.153199,9.376194,10.845176,12.370891,13.635522
Tarca_004_P1A04,5.875502,3.450245,6.550525,3.316134,6.185059,4.393488,5.898364,6.137984,4.628124,2.556756,...,9.294845,12.049271,8.555541,4.4419,6.016953,9.590764,8.843612,10.493416,12.295786,13.616688
Tarca_005_P1A05,6.126131,3.628411,6.421877,3.432451,5.633757,4.623783,6.019792,5.787502,4.796283,2.613415,...,9.694992,12.311885,9.164106,4.529299,6.990176,5.437926,9.191471,10.879879,12.249936,13.524328
Tarca_006_P1A06,6.146466,3.446812,6.260962,3.477162,5.313198,4.422651,6.407699,5.830437,4.726488,2.631878,...,9.600712,12.173934,8.105479,4.989477,6.247265,7.550088,9.247768,10.754316,12.245458,13.509353


In [3]:
# Sync the X and y data: sort both tables by sample ID, then keep only the
# samples that have a GA label.

df1 = anno.sort_index()
df2 = HTA20_RMA.sort_index()

# One mask, built from the GA column only, decides which samples are kept.
# (A bare dropna() would also discard rows with a missing value in any other
# annotation column, leaving X and y with different rows.)
has_label = df1['GA'].notna()
y = df1.loc[has_label, ['GA','Batch']]
# Select the features by sample ID rather than by position, so a difference in
# row order or row count between the two tables raises a KeyError instead of
# silently misaligning X and y.
X = df2.loc[y.index,:]


In [4]:
# Check that X and y hold the same samples in the same order.
# Index.equals reports False on a length mismatch, whereas an element-wise
# `==` comparison would raise instead of answering the question.
X.index.equals(y.index)

True

## Standard scaling
We will now standardize the features separately within each batch: every batch gets its own scaler, fitted only on the samples of that batch.

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature separately within each batch: each batch gets its
# own scaler, fitted only on that batch's samples.
XX = np.zeros(X.shape)
batch_labels = y['Batch'].to_numpy()
# Iterate over the batches actually present instead of a hard-coded list, so
# no batch can be silently left as all-zero rows in XX.
for i in np.unique(batch_labels):
    # Plain boolean NumPy mask; replaces np.bool8, an alias removed in NumPy 2.0.
    indices = batch_labels == i
    scale = StandardScaler()
    XX[indices,:] = scale.fit_transform(X.iloc[indices,:])

In [6]:
# Keep only the GA column as the regression target; Batch is dropped.
yy = y.loc[:, 'GA']

In [9]:
# import necessary ML modules
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# 10-fold cross-validation of a PCA + random-forest regression on the
# batch-standardized features.
num_of_splits = 10
kf = KFold(n_splits = num_of_splits)
# criterion is left at its default (squared error): the explicit 'mse' spelling
# was removed in scikit-learn 1.2, while the default works on every version.
forest = RandomForestRegressor(n_estimators = 1000,
                               random_state = 1, n_jobs = -1)

# Despite their names, these lists collect one RMSE value per fold
# (training folds and held-out fold respectively), not raw predictions.
y_train_pred = []
y_test_pred = []

for train_index, test_index in tqdm(kf.split(X), total = num_of_splits, unit = 'iteration'):

  # Fresh PCA per fold, keeping enough components to explain 95% of the variance.
  pca = PCA(n_components = 0.95, svd_solver = 'full')

  X_train_std, X_test_std = XX[train_index,:], XX[test_index,:]
  y_train, y_test = yy.iloc[train_index], yy.iloc[test_index]

  # Fit the projection on the training folds only, then apply it to the held-out fold.
  X_train_pca = pca.fit_transform(X_train_std)
  X_test_pca = pca.transform(X_test_std)

  forest.fit(X_train_pca,y_train)

  # RMSE = sqrt(MSE). Computed explicitly because the `squared=False` keyword
  # of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
  y_train_pred.append(np.sqrt(mean_squared_error(y_train,forest.predict(X_train_pca))))
  y_test_pred.append(np.sqrt(mean_squared_error(y_test,forest.predict(X_test_pca))))



  0%|          | 0/10 [00:04<?, ?iteration/s]


NameError: name 'y_train_pca' is not defined

In [8]:
# Average of the per-fold training RMSE values.
np.asarray(y_train_pred).mean()

7.09001644588343

In [None]:
# Average of the per-fold held-out RMSE values.
np.asarray(y_test_pred).mean()