
### Classification baseline for CNNs with RandomForests - Summary
Results of baseline random forest classifiers with automatic features extraction from tsfresh.

This notebook is just a summary of the results, for feature extraction and random forest optimization check: randomForest_baseline.py.


In [1]:
import pandas as pd
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as metrics
import treeinterpreter.treeinterpreter as ti
from sklearn.model_selection import GridSearchCV
import zipfile
# Read module from another directory
import sys
sys.path.append('../')
from load_data import DataProcesser


In [2]:
# Get target vector for sklearn
data_file = '../data/ErkAkt_6GF_len240.zip'
meas_var = 'AKT'  # doesn't matter which variable is chosen
data = DataProcesser(data_file)
data.subset(sel_groups=meas_var, start_time=0, end_time=600)
classes = tuple(data.classes.iloc[:,1])
data = data.dataset

data = pd.melt(data, id_vars=['ID', 'class'], var_name='Time', value_name='Ratio_' + meas_var)
data['Time'] = data['Time'].str.replace('^{}_'.format(meas_var), '').astype('int')
data = data.sort_values(['ID', 'Time'])

dt_class = data.loc[:, ['ID', 'class']].copy()
dt_class = dt_class.drop_duplicates()
dt_class.index = dt_class['ID']
dt_class = dt_class.drop('ID', axis=1)

y_target =  dt_class['class']

del data  # Free memory


### Load features

For the bivariate case, the features are extracted indepently on ERK and AKT channels and concatenated.


In [3]:
# Load features and split for forest fitting
def load_features_split(meas_vars, y_target):
    if isinstance(meas_vars, str):
        features_archive = zipfile.ZipFile('../data/randForest_Features/randForest_Features_{}.zip'.format(meas_vars), 'r')
        features_filtered = pd.read_csv(features_archive.open('randForest_fltrFeatures.csv'),
                                    index_col='id')
    if isinstance(meas_vars, tuple):
        features_filtered = []
        for meas_var in meas_vars:
            features_archive = zipfile.ZipFile('../data/randForest_Features/randForest_Features_{}.zip'.format(meas_var), 'r')
            features_filtered.append(pd.read_csv(features_archive.open('randForest_fltrFeatures.csv'),
                                        index_col='id'))
        features_filtered = pd.merge(features_filtered[0], features_filtered[1], left_index=True, right_index=True)
        
    X_train, X_test, y_train, y_test = train_test_split(features_filtered, y_target, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test


features_dict = {meas:load_features_split(meas, y_target) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}


### Fit Models

Use decision tree and base random forest from sklearn with default parameters, except number of trees set to 100.


In [4]:
def confmat_acc_tree(xtrain, xtest, ytrain, ytest):
    model = DecisionTreeClassifier()
    model.fit(xtrain, ytrain)
    y_predict = model.predict(xtest)
    # Truth on rows; Predicted on columns
    confmat = metrics.confusion_matrix(ytest.values, y_predict, labels=[0,1,2,3,4,5,6])
    acc = metrics.accuracy_score(ytest.values, y_predict)
    return acc, confmat

def confmat_acc_frst(xtrain, xtest, ytrain, ytest):
    model = RandomForestClassifier(n_estimators=100)
    model.fit(xtrain, ytrain)
    y_predict = model.predict(xtest)
    # Truth on rows; Predicted on columns
    confmat = metrics.confusion_matrix(ytest.values, y_predict, labels=[0,1,2,3,4,5,6])
    acc = metrics.accuracy_score(ytest.values, y_predict)
    return acc, confmat

accuracies_trees   = {meas:confmat_acc_tree(*features_dict[meas]) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}
accuracies_forests = {meas:confmat_acc_frst(*features_dict[meas]) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}


### Report accuracies


In [5]:
def print_results(meas_var):
    def format_confmat(mat, class_names):
        out = pd.DataFrame(mat)
        out.index = class_names
        out.columns = class_names
        return  out
    print('Accuracy of the decision tree for {} is: {}'.format(meas_var, accuracies_trees[meas_var][0]))
    print('Accuracy of the random forest for {} is: {}'.format(meas_var, accuracies_forests[meas_var][0]))
    
    print('\n Confusion matrix: decision tree')
    print(format_confmat(accuracies_trees[meas_var][1], classes))
    
    print('\n Confusion matrix: random forest')
    print(format_confmat(accuracies_forests[meas_var][1], classes))


* ERK

In [6]:
print_results('ERK')

Accuracy of the decision tree for ERK is: 0.37604881769641496
Accuracy of the random forest for ERK is: 0.5087719298245614

 Confusion matrix: decision tree
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl            94    8   98   119          11           60            13
EGF              5  128   18     5          65           21            94
HGF             75   10  139    87          10           48            19
IGF1           101    4   94   168           5           60            12
Epiregulin      11   48   11     6         163           14            41
HeregulinB1     52   19   59    74          11          153            33
Betacellulin    11   96   21    17          44           26           141

 Confusion matrix: random forest
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           107    3   30   166           6           79            12
EGF              6  178    5     5          58            7          

* AKT

In [7]:
print_results('AKT')

Accuracy of the decision tree for AKT is: 0.43897787948131195
Accuracy of the random forest for AKT is: 0.5728451563691839

 Confusion matrix: decision tree
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           162   11  134    28          22           27            19
EGF             10  118   40     8          54           10            96
HGF             92   29  149    29          26           35            28
IGF1            27    6   31   294          12           68             6
Epiregulin      20   72   35     6          88           20            53
HeregulinB1     25   18   30    56          18          235            19
Betacellulin    25  105   35     6          57           23           105

 Confusion matrix: random forest
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           254    4   74    27          16           18            10
EGF             11  157   15     9          56           14          

* ERK-AKT

In [8]:
print_results(('ERK', 'AKT'))

Accuracy of the decision tree for ('ERK', 'AKT') is: 0.5225019069412662
Accuracy of the random forest for ('ERK', 'AKT') is: 0.6708619374523265

 Confusion matrix: decision tree
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           174   10  113    38          11           40            17
EGF             14  148   15     2          50           12            95
HGF            112   14  165    34          11           35            17
IGF1            27    2   35   328           4           46             2
Epiregulin      15   61   12     5         156            1            44
HeregulinB1     32    7   41    60           7          243            11
Betacellulin    19  104   25     6          33           13           156

 Confusion matrix: random forest
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           258    4   71    35           5           18            12
EGF              7  190   11     1          51  

## Redo the analysis but with the new replicate and trim 100 first frames (early response)

In [9]:
# Get target vector for sklearn
data_file = '../data/ErkAkt_6GF_len240_repl2_trim100.zip'
meas_var = 'AKT'  # doesn't matter which variable is chosen
data = DataProcesser(data_file)
data.subset(sel_groups=meas_var, start_time=100, end_time=600)
classes = tuple(data.classes.iloc[:,1])
data = data.dataset

data = pd.melt(data, id_vars=['ID', 'class'], var_name='Time', value_name='Ratio_' + meas_var)
data['Time'] = data['Time'].str.replace('^{}_'.format(meas_var), '').astype('int')
data = data.sort_values(['ID', 'Time'])

dt_class = data.loc[:, ['ID', 'class']].copy()
dt_class = dt_class.drop_duplicates()
dt_class.index = dt_class['ID']
dt_class = dt_class.drop('ID', axis=1)

y_target =  dt_class['class']

del data  # Free memory

### Load features

In [11]:
# Load features and split for forest fitting
def load_features_split(meas_vars, y_target):
    if isinstance(meas_vars, str):
        features_archive = zipfile.ZipFile('../data/randForest_Features/randForest_Features_{}_repl2_trim100.zip'.format(meas_vars), 'r')
        features_filtered = pd.read_csv(features_archive.open('randForest_fltrFeatures.csv'),
                                    index_col='id')
    if isinstance(meas_vars, tuple):
        features_filtered = []
        for meas_var in meas_vars:
            features_archive = zipfile.ZipFile('../data/randForest_Features/randForest_Features_{}_repl2_trim100.zip'.format(meas_var), 'r')
            features_filtered.append(pd.read_csv(features_archive.open('randForest_fltrFeatures.csv'),
                                        index_col='id'))
        features_filtered = pd.merge(features_filtered[0], features_filtered[1], left_index=True, right_index=True)
        
    X_train, X_test, y_train, y_test = train_test_split(features_filtered, y_target, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test


features_dict = {meas:load_features_split(meas, y_target) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}

### Fit model

In [12]:
accuracies_trees   = {meas:confmat_acc_tree(*features_dict[meas]) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}
accuracies_forests = {meas:confmat_acc_frst(*features_dict[meas]) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}

### Report accuracies

* ERK

In [13]:
print_results('ERK')

Accuracy of the decision tree for ERK is: 0.3314656443526983
Accuracy of the random forest for ERK is: 0.4370392214685933

 Confusion matrix: decision tree
     BTC  CTR  EGF  EPR  HGF  HRG  IGF
BTC  156   25  114   37   23   40   20
CTR   24  135   25   19  114   83  130
EGF  103   10  149   62   19   20   16
EPR   54   15   70  183   11   19   13
HGF   36  113   23   18  122  106  154
HRG   27   93   31   14   97  208   84
IGF   15  120   15   13  138  104  171

 Confusion matrix: random forest
     BTC  CTR  EGF  EPR  HGF  HRG  IGF
BTC  201   13   80   55   11   50    5
CTR   14  167    2   16  102   87  142
EGF  103   13  159   77    4   22    1
EPR   34   14   32  271    5    7    2
HGF   31  108    8   14   82  159  170
HRG   29   53   11   11   61  331   58
IGF   17   79    4   13   59  133  271


* AKT

In [14]:
print_results('AKT')

Accuracy of the decision tree for AKT is: 0.40902388675906814
Accuracy of the random forest for AKT is: 0.5252138012385726

 Confusion matrix: decision tree
     BTC  CTR  EGF  EPR  HGF  HRG  IGF
BTC  142   41  112   41   51   20    8
CTR   33  184   31   36  150   48   48
EGF  109   24  121   54   43   23    5
EPR   66   31   64  111   60   24    9
HGF   57  169   42   44  163   61   36
HRG   23   54   19   38   52  280   88
IGF    7   36   11   16   36   84  386

 Confusion matrix: random forest
     BTC  CTR  EGF  EPR  HGF  HRG  IGF
BTC  188   25   93   26   51   31    1
CTR   21  241    8   27  171   25   37
EGF  114   18  161   37   20   28    1
EPR   68   30   60  114   37   46   10
HGF   59  184   16   14  221   52   26
HRG   10   24    4    8   36  376   96
IGF    1   24    0    2    9   60  480


* ERK-AKT

In [16]:
print_results(('ERK', 'AKT'))

Accuracy of the decision tree for ('ERK', 'AKT') is: 0.4747861987614273
Accuracy of the random forest for ('ERK', 'AKT') is: 0.6054261279858448

 Confusion matrix: decision tree
     BTC  CTR  EGF  EPR  HGF  HRG  IGF
BTC  168   19  116   45   41   20    6
CTR   26  174   17   20  190   56   47
EGF  113   22  138   64   24   13    5
EPR   63   19   58  180   19   16   10
HGF   34  168   26   16  229   62   37
HRG   17   41    9   17   75  315   80
IGF   11   36    5    7   41   70  406

 Confusion matrix: random forest
     BTC  CTR  EGF  EPR  HGF  HRG  IGF
BTC  246   11   70   54   28    5    1
CTR   20  237    1   16  197   30   29
EGF  107   10  176   70   15    1    0
EPR   44   10   31  266    8    4    2
HGF   35  180    5    9  248   70   25
HRG    5   29    3   10   35  390   82
IGF    1   18    0    1   16   50  490
