
### Classification baseline for CNNs with RandomForests - Summary
Results of baseline random forest classifiers with automatic features extraction from tsfresh.

This notebook is just a summary of the results, for feature extraction and random forest optimization check: randomForest_baseline.py.


In [5]:
import pandas as pd
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as metrics
import treeinterpreter.treeinterpreter as ti
from sklearn.model_selection import GridSearchCV
import zipfile
# Read module from another directory
import sys
sys.path.append('../')
from load_data import DataProcesser


In [32]:
# Get target vector for sklearn
data_file = '../data/ErkAkt_6GF_len240.zip'
meas_var = 'AKT'  # doesn't matter which variable is chosen
data = DataProcesser(data_file)
data.subset(sel_groups=meas_var, start_time=0, end_time=600)
classes = tuple(data.classes.iloc[:,1])
data = data.dataset

data = pd.melt(data, id_vars=['ID', 'class'], var_name='Time', value_name='Ratio_' + meas_var)
data['Time'] = data['Time'].str.replace('^{}_'.format(meas_var), '').astype('int')
data = data.sort_values(['ID', 'Time'])

dt_class = data.loc[:, ['ID', 'class']].copy()
dt_class = dt_class.drop_duplicates()
dt_class.index = dt_class['ID']
dt_class = dt_class.drop('ID', axis=1)

y_target =  dt_class['class']

del data  # Free memory


### Load features

For the bivariate case, the features are extracted indepently on ERK and AKT channels and concatenated.


In [12]:
# Load features and split for forest fitting
def load_features_split(meas_vars, y_target):
    if isinstance(meas_vars, str):
        features_archive = zipfile.ZipFile('../data/randForest_Features/randForest_Features_{}.zip'.format(meas_vars), 'r')
        features_filtered = pd.read_csv(features_archive.open('randForest_fltrFeatures.csv'),
                                    index_col='id')
    if isinstance(meas_vars, tuple):
        features_filtered = []
        for meas_var in meas_vars:
            features_archive = zipfile.ZipFile('../data/randForest_Features/randForest_Features_{}.zip'.format(meas_var), 'r')
            features_filtered.append(pd.read_csv(features_archive.open('randForest_fltrFeatures.csv'),
                                        index_col='id'))
        features_filtered = pd.merge(features_filtered[0], features_filtered[1], left_index=True, right_index=True)
        
    X_train, X_test, y_train, y_test = train_test_split(features_filtered, y_target, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test


features_dict = {meas:load_features_split(meas, y_target) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}


### Fit Models

Use decision tree and base random forest from sklearn with only 10 trees.


In [34]:
def confmat_acc_tree(xtrain, xtest, ytrain, ytest):
    model = DecisionTreeClassifier()
    model.fit(xtrain, ytrain)
    y_predict = model.predict(xtest)
    # Truth on rows; Predicted on columns
    confmat = metrics.confusion_matrix(ytest.values, y_predict, labels=[0,1,2,3,4,5,6])
    acc = metrics.accuracy_score(ytest.values, y_predict)
    return acc, confmat

def confmat_acc_frst(xtrain, xtest, ytrain, ytest):
    model = RandomForestClassifier(n_estimators=10)
    model.fit(xtrain, ytrain)
    y_predict = model.predict(xtest)
    # Truth on rows; Predicted on columns
    confmat = metrics.confusion_matrix(ytest.values, y_predict, labels=[0,1,2,3,4,5,6])
    acc = metrics.accuracy_score(ytest.values, y_predict)
    return acc, confmat

accuracies_trees   = {meas:confmat_acc_tree(*features_dict[meas]) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}
accuracies_forests = {meas:confmat_acc_frst(*features_dict[meas]) for meas in ['ERK', 'AKT', ('ERK', 'AKT')]}


### Report accuracies


In [51]:
def print_results(meas_var):
    def format_confmat(mat, class_names):
        out = pd.DataFrame(mat)
        out.index = class_names
        out.columns = class_names
        return  out
    print('Accuracy of the decision tree for {} is: {}'.format(meas_var, accuracies_trees[meas_var][0]))
    print('Accuracy of the random forest for {} is: {}'.format(meas_var, accuracies_forests[meas_var][0]))
    
    print('\n Confusion matrix: decision tree')
    print(format_confmat(accuracies_trees[meas_var][1], classes))
    
    print('\n Confusion matrix: random forest')
    print(format_confmat(accuracies_forests[meas_var][1], classes))


* ERK

In [52]:
print_results('ERK')


Accuracy of the decision tree for ERK is: 0.38253241800152554
Accuracy of the random forest for ERK is: 0.4622425629290618

 Confusion matrix: decision tree
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl            98    3   93   126          11           56            16
EGF              7  128   12     3          65           29            92
HGF             76   14  127   100           7           48            16
IGF1            85    8   95   177           7           61            11
Epiregulin      12   50   12     7         158           15            40
HeregulinB1     47   22   56    68          13          167            28
Betacellulin    13   90   17    11          39           38           148

 Confusion matrix: random forest
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           125    8   49   141           6           63            11
EGF              9  163   10     5          58            7          

* AKT

In [53]:
print_results('AKT')


Accuracy of the decision tree for AKT is: 0.4401220442410374
Accuracy of the random forest for AKT is: 0.513348588863463

 Confusion matrix: decision tree
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           146   12  133    29          26           35            22
EGF             14  114   37     7          59           14            91
HGF             88   30  156    26          26           33            29
IGF1            25    9   32   303          14           59             2
Epiregulin      19   65   37     6          82           21            64
HeregulinB1     22   16   29    54          15          240            25
Betacellulin    24  107   29     6          59           18           113

 Confusion matrix: random forest
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           227   10   95    28          16           16            11
EGF             16  136   22     8          53           12            

* ERK-AKT

In [54]:
print_results(('ERK', 'AKT'))


Accuracy of the decision tree for ('ERK', 'AKT') is: 0.5312738367658276
Accuracy of the random forest for ('ERK', 'AKT') is: 0.5995423340961098

 Confusion matrix: decision tree
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           178    7  118    29          18           35            18
EGF             14  145   13     3          51            6           104
HGF            104   18  169    35          13           30            19
IGF1            29    3   28   329           5           48             2
Epiregulin      11   64   14     4         159            1            41
HeregulinB1     31   10   35    54           6          251            14
Betacellulin    19   87   24     6          46           12           162

 Confusion matrix: random forest
              Ctrl  EGF  HGF  IGF1  Epiregulin  HeregulinB1  Betacellulin
Ctrl           258    7   72    29           6           18            13
EGF             11  160   13     1          61  