# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import tree

# Functions

In [2]:
# fit a decision tree regressor (default hyperparameters) on a training set
# inputs: train_data = training data in array (expected as samples x timepoints),
#         train_label = training labels in array (expected as samples x genes)
# outputs: the fitted DecisionTreeRegressor model
def train_dtree(train_data,train_label):
    # fit() hands back the estimator it was called on, so build and train in one expression
    return tree.DecisionTreeRegressor().fit(train_data, train_label)

In [3]:
# function to run a trained model on a data set and score its predictions with RMSE
# inputs: tree_model = trained model exposing predict(data), e.g. a fitted decision tree,
#         data = numpy array of input trajectories, one row per sample
#                (NOTE(review): the earlier comment described pSTAT3+pSTAT1 concatenated input,
#                 but the script in this file passes pSTAT1 data only - confirm intended layout),
#         label = numpy array of gene expression labels corresponding to data (samples x genes flattened)
# outputs: predict = predictions exactly as returned by tree_model.predict,
#          rmse = root mean squared error taken over samples (axis 0): one value per label
#                 column, or a scalar when both predictions and labels are 1-D
def test_and_eval(tree_model,data,label):
    # predict labels with the model; this object is returned to the caller untouched
    predict = tree_model.predict(data)

    predicted = np.asarray(predict)
    expected = np.asarray(label)

    # A single-output sklearn tree predicts shape (samples,), while a one-column label read
    # from CSV is (samples, 1). Subtracting those broadcasts to a (samples, samples) matrix
    # and silently produces a meaningless RMSE, so line the two layouts up first.
    if predicted.ndim == 1 and expected.shape == (predicted.shape[0], 1):
        predicted = predicted[:, np.newaxis]
    elif expected.ndim == 1 and predicted.shape == (expected.shape[0], 1):
        expected = expected[:, np.newaxis]

    # calculate RMSE: average the squared residuals over the sample axis, then take the root
    squared_error = (predicted - expected) ** 2
    rmse = np.sqrt(np.sum(squared_error, axis=0) / predicted.shape[0])

    return predict,rmse


# The name starts with "test_", so keep pytest from collecting this helper as a test case.
test_and_eval.__test__ = False

# Main Script

In [5]:
# training set (trajectories already normalized): features stay a DataFrame,
# labels are converted to a numpy array
train_norm_pSTAT1_data_df = pd.read_csv(
    'Data/subset_training_data_pSTAT1.csv',
    header=None,
)
train_label_df = pd.read_csv(
    'Data/subset_training_label_pSTAT1.csv',
    header=None,
)
train_label = np.asarray(train_label_df)

# testing set (trajectories already normalized), loaded the same way; per the original note
# this is only IL-6 low dose, since that is the experimental data available for validation
test_norm_pSTAT1_jak2i_data_df = pd.read_csv(
    'Data/subset_testing_data_pSTAT1.csv',
    header=None,
)
test_label_df = pd.read_csv(
    'Data/subset_testing_label_pSTAT1.csv',
    header=None,
)
test_label = np.asarray(test_label_df)

In [6]:
# fit on the normal, unperturbed training trajectories, then score on the jak2i test trajectories
train_features = np.asarray(train_norm_pSTAT1_data_df)
test_features = np.asarray(test_norm_pSTAT1_jak2i_data_df)

dtree = train_dtree(train_features, train_label)
predict, rmse = test_and_eval(dtree, test_features, test_label)