# Imports

In [6]:
import numpy as np
import pandas as pd
from sklearn import tree

# Functions

In [7]:
def train_dtree(train_data,train_label,random_state=None):
    """Fit a decision tree regressor on the given training set.

    Parameters
    ----------
    train_data : array-like
        Training features, one row per sample (samples x timepoints).
    train_label : array-like
        Regression targets, one row per sample (samples x genes).
    random_state : int, RandomState instance or None, optional
        Seed forwarded to ``tree.DecisionTreeRegressor``. The default
        (None) keeps the previous unseeded behaviour; pass an integer to
        make repeated fits reproducible.

    Returns
    -------
    dtree : tree.DecisionTreeRegressor
        The fitted decision tree model.
    """
    # create tree model
    dtree = tree.DecisionTreeRegressor(random_state=random_state)
    dtree.fit(train_data,train_label)
    return dtree

In [8]:
# function to test decision tree predictions and calculate RMSE
# inputs: tree_model = trained decision tree model, data = numpy array of pSTAT data (samples x 2*timepoints; pSTAT3 then pSTAT1 concat.)
# label = numpy array of gene expression labels corresponding to data (samples x genes flattened)
# outputs: predict = numpy array of gene predictions (samples x genes flattened), rmse = root mean squared error
def test_and_eval(tree_model,data,label):
    # predict labels with DTree
    predict = tree_model.predict(data)

    # calculate RMSE 
    rmse = np.sqrt(np.sum((predict - label) ** 2,axis=0) / predict.shape[0])

    return predict,rmse

# Main Script

In [3]:
# small reader shared by all four files below: none of them has a header row
def _read_headerless_csv(path):
    """Read a CSV file without a header row into a DataFrame."""
    return pd.read_csv(path, header=None)

# training set (trajectories already normalized): features stay a DataFrame,
# labels are converted to a plain numpy array
train_norm_pSTAT3_data_df = _read_headerless_csv('Data/subset_training_data_pSTAT3.csv')
train_label = np.asarray(_read_headerless_csv('Data/subset_training_label_pSTAT3.csv'))

# testing set (trajectories already normalized), same layout as the training set
test_norm_pSTAT3_data_df = _read_headerless_csv('Data/subset_testing_data_pSTAT3.csv')
test_label = np.asarray(_read_headerless_csv('Data/subset_testing_label_pSTAT3.csv'))

In [None]:
# timeframes from paper (will be different than results from HMM_timeframe_ID.ipynb)
# one column per timeframe:
#   row 0 = column index where the frame starts
#   row 1 = column index where the frame ends
frame_start_idx = [0, 9, 14, 49, 50, 61]
frame_end_idx = [8, 13, 48, 49, 60, 90]
pSTAT3_frames = np.array([frame_start_idx, frame_end_idx])

In [None]:
# 10-fold CV for timeframe consolidation (training data only since it is optimization)
n_folds = 10
n_samples = train_norm_pSTAT3_data_df.shape[0]

# sample size for each partition
n_cv = n_samples // n_folds
if n_cv == 0:
    # a zero fold size would give np.arange a zero step below
    raise ValueError('need at least {} training samples for {}-fold CV, got {}'.format(n_folds, n_folds, n_samples))

# fold boundaries for exactly n_folds equal-sized partitions. When the sample
# count is not a multiple of n_folds, the trailing remainder rows are never held
# out (they only ever appear in the training split); this keeps every test index
# inside the data and every fold the same size, which the running sums below need.
cv_splits = np.arange(0,n_folds*n_cv+1,n_cv)

# test each timeframe individually
for i in range(pSTAT3_frames.shape[1]):
    print('Timeframe:',i+1)

    # columns belonging to this timeframe (frame end is inclusive, hence the +1);
    # sliced once per frame rather than once per fold
    frame_data = np.asarray(train_norm_pSTAT3_data_df.iloc[:,pSTAT3_frames[0,i]:pSTAT3_frames[1,i]+1])

    # variables to sum over CV runs
    predict_sum = 0
    rmse_sum = 0

    for j in range(len(cv_splits)-1):
        # rows held out for testing in this fold; all other rows are used for training
        test_ind = np.arange(cv_splits[j],cv_splits[j+1])

        cv_test_data = frame_data[test_ind]
        cv_test_label = train_label[test_ind,:]

        cv_train_data = np.delete(frame_data,test_ind,axis=0)
        cv_train_label = np.delete(train_label,test_ind,axis=0)

        # train and test DTree
        test_dtree = train_dtree(cv_train_data,cv_train_label)
        test_predict,test_rmse = test_and_eval(test_dtree,cv_test_data,cv_test_label)

        # add to storage variables
        # NOTE(review): each fold predicts a different set of held-out samples, so
        # predict_sum adds row k of every fold together rather than collecting one
        # prediction per training sample -- confirm this averaging is intended.
        predict_sum+=test_predict
        rmse_sum+=test_rmse

    # average over CV runs
    avg_cv_predict = predict_sum / (len(cv_splits)-1)
    avg_cv_rmse =  rmse_sum / (len(cv_splits)-1)

    # uncomment to save timeframe predictions
    # np.savetxt('avg_cv_predict' + '_frame' + str(i+1) + '.csv', avg_cv_predict, delimiter=',')
    # np.savetxt('avg_cv_rmse' + '_frame' + str(i+1) + '.txt',avg_cv_rmse)

In [10]:
# early predictions using timeframes from paper
# the early window is the first 14 columns (indices 0-13) of each trajectory
early_train_data = np.asarray(train_norm_pSTAT3_data_df.iloc[:, :14])
early_test_data = np.asarray(test_norm_pSTAT3_data_df.iloc[:, :14])

# fit on the training window, then score on the matching testing window
early_test_dtree = train_dtree(early_train_data, train_label)
early_test_predict, early_test_rmse = test_and_eval(early_test_dtree, early_test_data, test_label)

In [11]:
# late predictions using timeframes from paper
# the late window runs from column index 49 to the end of each trajectory
late_train_data = np.asarray(train_norm_pSTAT3_data_df.iloc[:, 49:])
late_test_data = np.asarray(test_norm_pSTAT3_data_df.iloc[:, 49:])

# fit on the training window, then score on the matching testing window
late_test_dtree = train_dtree(late_train_data, train_label)
late_test_predict, late_test_rmse = test_and_eval(late_test_dtree, late_test_data, test_label)