# Custom LINCS random forest

In [1]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy

In [13]:
# Synthetic stand-in for the LINCS problem: 20,000 samples x 36 features,
# binary labels. Every generator option is spelled out so the dataset
# recipe is fully visible; random_state pins the draw.
X, y = datasets.make_classification(
    # size and feature composition
    n_samples=20000,
    n_features=36,
    n_informative=10,
    n_redundant=6,
    n_repeated=0,
    # class structure
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    # geometry of the generated clusters
    hypercube=True,
    shift=0.0,
    scale=1.0,
    # ordering and reproducibility
    shuffle=True,
    random_state=1,
)

In [14]:
# Sanity check: display (n_samples, n_features) of the generated feature matrix.
X.shape

(20000, 36)

We want to randomly remove data from X in order to simulate the missing data in our LINCS classification problem. Every sample has **4 metrics x 9 cell lines = 36 total features**. However, not all samples are tested in all cell lines, so we will assume that each sample has data from at least four cell lines. This minimum number of cell lines should be made a variable.

In [15]:
# First decide how many cell lines each sample was tested in.
# Seed the global NumPy RNG so the simulated missingness is reproducible
# under Restart Kernel -> Run All.
np.random.seed(1)

n_total_cells = 9   # cell lines in the full panel
min_n_cells = 4     # every sample is tested in at least this many cell lines
max_n_cells = 9     # ...and in at most this many (inclusive)

# np.random.randint draws from [low, high): the upper bound is exclusive, so
# pass max_n_cells + 1 to allow samples that were tested in every cell line.
n_cells_ = np.random.randint(min_n_cells, max_n_cells + 1, len(y))

# Number of cell lines whose features will be blanked out for each sample.
n_missing_cells_ = n_total_cells - n_cells_

In [16]:
# Remove features from each sample's missing cell lines.
#
# Features are laid out as consecutive groups of `n_metrics` columns, one
# group per cell line, so cell line i owns columns
# [n_metrics*i, n_metrics*i + n_metrics).
n_metrics = 4
n_cell_lines = X.shape[1] // n_metrics

# Mask a float copy of X so the original, complete matrix stays untouched
# (it is still needed for the baseline model further down).
X_masked = np.array(X, dtype=float, copy=True)

for index in range(len(X_masked)):
    n_missing_cells = n_missing_cells_[index]

    # randomly choose which cell lines are missing for this sample
    missing_cell_lines = np.random.choice(
        np.arange(n_cell_lines), n_missing_cells, replace=False
    )

    # convert the cell lines to the missing feature indices (all metrics of each)
    missing_feature_idx = (
        n_metrics * missing_cell_lines[:, None] + np.arange(n_metrics)
    ).ravel()

    # Blank out the feature values. Plain ndarray assignment replaces
    # DataFrame.set_value (removed in pandas 1.0), and np.nan replaces the
    # np.NaN alias (removed in NumPy 2.0).
    X_masked[index, missing_feature_idx] = np.nan

X_df = pd.DataFrame(X_masked)

In [17]:
# Preview the first rows; NaN entries appear in blocks of 4 columns (one block per missing cell line).
X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1.209149,1.19217,-1.164384,0.484403,-0.456668,1.353219,-1.084165,-0.51096,-1.074072,-0.376052,...,-2.528943,0.326389,-0.818247,1.39016,1.249547,0.66406,-0.308625,-0.47675,-1.852631,-0.192746
1,-0.724587,0.857552,-0.28239,-0.872162,-1.295439,-1.104565,1.008983,1.783967,,,...,,,-1.027753,1.048833,-1.846904,-0.272547,,,,
2,,,,,,,,,1.074256,0.497485,...,1.126106,-0.804649,-1.06479,-0.591569,0.717058,-1.076956,2.132901,0.559781,-1.215091,0.669849
3,,,,,0.123821,1.134245,-4.068682,-0.38408,1.60072,1.042394,...,-7.302279,0.459751,-0.754988,0.204575,-2.347061,1.891265,-4.264726,-1.29077,0.073044,-0.258931
4,0.847515,-3.265937,-3.784482,-0.851469,-1.244846,0.978361,-0.988202,1.456638,-0.067727,0.233124,...,0.446171,-0.059311,-0.776104,0.527975,-1.371177,0.92711,,,,


Ok, so now we have a dataset with missing values that mimic the missing data we have in our LINCS dataset. Now we have to construct our custom Random Forest implementation that elegantly handles the missing data.

In [18]:
# Raw ndarray of the masked features (NaN where a cell line is missing).
X_missing = X_df.values

# Boolean mask: True wherever a feature value is present.
X_not_missing = np.logical_not(np.isnan(X_missing))

# Each cell line contributes 4 features, so present-feature count / 4 gives
# the number of cell lines with data for each sample.
num_cells_not_missing = X_not_missing.sum(axis=1) / 4
print(num_cells_not_missing)

# Smallest number of cell lines observed for any sample.
num_cells_not_missing.min()

[8. 5. 6. ... 5. 5. 8.]


4.0

### Let's try a classic scikit-learn random forest classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a stock scikit-learn forest with 10 Gini trees, using all
# cores. It is fitted on the complete matrix X (no missing values), not on
# the masked data.
forest = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    n_jobs=-1,
    random_state=1,
)
forest.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [22]:
# Predictions of each individual tree in the fitted forest on X (one array per estimator).
[ tree.predict(X) for tree in forest.estimators_ ]

[array([1., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 1., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 1.])]

## Let's try our own custom forest implementation

In [23]:
from models import LincsRandomForestClassifier

In [26]:
# Fit the custom forest directly on the NaN-containing feature matrix.
# NOTE(review): n_cells_per_forest=4 equals min_n_cells used when simulating
# the missing data; presumably each sub-forest is trained on the features of
# 4 cell lines -- confirm against models.LincsRandomForestClassifier.
LRF = LincsRandomForestClassifier(n_cells_per_forest=4)
LRF.fit(X_missing, y)

In [28]:
# Predict labels for the same masked matrix the model was fitted on.
# NOTE(review): this calls `predict_` (trailing underscore) rather than a
# plain `predict`; confirm in models.py that this is the intended method.
predictions = LRF.predict_(X_missing)

In [31]:
# Number of samples whose predicted label equals the true label.
# NOTE: X_missing is the data LRF was fitted on, so this is a training-set
# count, not a held-out accuracy estimate.
np.count_nonzero(predictions == y)

19965