# Correlation-Based Random Forest Classification

In [2]:
import numpy as np
import pandas as pd
import sklearn

In [3]:
# load data
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv with
# index_col=0 reproduces its default of using the first CSV column as the row index.
correlation_training_data = pd.read_csv(
    'checkpoint_files/top_4_correlation_training_data.csv', index_col=0)
# preview only the first rows rather than rendering the whole frame
correlation_training_data.head(10)

Unnamed: 0,pert_id,target,label,A375,A549,MCF7,PC3
0,BRD-K21680192,TOP2A,1,0.489878,0.141628,-0.183952,-0.123435
1,BRD-K81418486,HDAC1,1,0.348987,-0.088103,0.135304,0.276418
2,BRD-K81418486,HDAC2,1,0.275635,0.235417,-0.001411,0.135818
3,BRD-K81418486,HDAC3,1,0.333418,0.168260,0.213528,0.287947
4,BRD-K81418486,HDAC6,1,0.411440,0.308649,0.195380,0.310055
5,BRD-K81418486,HDAC8,1,-0.002134,-0.074233,-0.205291,0.148544
6,BRD-K94441233,HMGCR,1,0.072370,0.058970,0.067738,0.256720
7,BRD-K09638361,CSNK1G2,1,0.125853,-0.117417,0.167081,0.067171
8,BRD-K08547377,TOP1,1,0.129659,0.114016,0.269936,0.001985
9,BRD-K92093830,TOP2A,1,0.174227,0.115415,-0.112306,-0.108745


In [101]:
# balance True and False samples
# Keep every positive row (label == 1) and down-sample the negative rows
# (label == -1) to the same count.
pos_samples = correlation_training_data[correlation_training_data.label == 1]
# random_state pins which negatives are drawn, so re-running the notebook
# trains on the same balanced set instead of a fresh random one each time.
neg_samples = correlation_training_data[correlation_training_data.label == -1].sample(
    n=pos_samples.shape[0], random_state=1)
# stack both groups row-wise into a single 2-D array (positives first)
balanced_training_data = np.vstack([pos_samples, neg_samples])

In [102]:
# reorder the rows with a fixed-seed permutation so the shuffle is repeatable
n_balanced_rows = len(balanced_training_data)
perm = np.random.RandomState(1).permutation(n_balanced_rows)
shuffled_training_data = balanced_training_data[perm]

In [103]:
# split the array column-wise: column 2 is used as the label,
# columns 3 through 6 are used as the feature matrix
y = np.asarray(shuffled_training_data[:, 2], dtype=float)
X = np.asarray(shuffled_training_data[:, 3:7], dtype=float)

In [104]:
# standardise the feature columns; the scaler is fitted on X and then applied to X
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_std = sc.transform(X)

In [109]:
# train a small, shallow random forest on the standardised features
from sklearn.ensemble import RandomForestClassifier
forest_params = dict(n_estimators=10, max_depth=2, random_state=0)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_std, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [110]:
# NOTE: scored on the same samples the forest was fitted on (in-sample accuracy)
overall_accuracy = clf.score(X_std, y)
print('Accuracy: {:.2f}'.format(overall_accuracy))

Accuracy: 0.61


In [114]:
# rebuild a labelled frame from the shuffled array and attach the model's predictions
column_names = correlation_training_data.columns
results = pd.DataFrame(shuffled_training_data, columns=column_names)
predicted_labels = clf.predict(X_std)
results['prediction'] = predicted_labels.astype(int)

In [112]:
# fraction of label == -1 rows that were also predicted as -1
false_interactions = results.loc[results['label'] == -1]
n_false_correct = int((false_interactions['prediction'] == -1).sum())
false_accuracy = n_false_correct / len(false_interactions)

# fraction of label == 1 rows that were also predicted as 1
true_interactions = results.loc[results['label'] == 1]
n_true_correct = int((true_interactions['prediction'] == 1).sum())
true_accuracy = n_true_correct / len(true_interactions)

for class_name, class_accuracy in (('False', false_accuracy), ('True', true_accuracy)):
    print('Accuracy on {} interactions: {:.2f}'.format(class_name, class_accuracy))

Accuracy on False interactions: 0.75
Accuracy on True interactions: 0.48


## What if we don't balance the data beforehand?

In [115]:
# reorder every row of the full (unbalanced) data set with a fixed-seed permutation
n_all_rows = len(correlation_training_data)
perm = np.random.RandomState(1).permutation(n_all_rows)
all_rows = correlation_training_data.values
shuffled_training_data = all_rows[perm]

In [116]:
# split the array column-wise: column 2 is used as the label,
# columns 3 through 6 are used as the feature matrix
y = np.asarray(shuffled_training_data[:, 2], dtype=float)
X = np.asarray(shuffled_training_data[:, 3:7], dtype=float)

In [117]:
# standardise the feature columns; the scaler is fitted on X and then applied to X
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_std = sc.transform(X)

In [118]:
# random forest classification
from sklearn.ensemble import RandomForestClassifier
# n_estimators is pinned to 10 to match the balanced experiment; leaving it at the
# library default makes the forest size depend on the installed scikit-learn
# version, so the two experiments would not be comparable.
clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10)
clf.fit(X_std, y)
# NOTE: scored on the same samples the forest was fitted on (in-sample accuracy)
print('Accuracy: {:.2f}'.format(clf.score(X_std, y)))

Accuracy: 0.98


In [119]:
# look at accuracy of true vs false interactions
column_names = correlation_training_data.columns
results = pd.DataFrame(shuffled_training_data, columns=column_names)
predicted_labels = clf.predict(X_std)
results['prediction'] = predicted_labels.astype(int)

# fraction of label == -1 rows that were also predicted as -1
false_interactions = results.loc[results['label'] == -1]
n_false_correct = int((false_interactions['prediction'] == -1).sum())
false_accuracy = n_false_correct / len(false_interactions)

# fraction of label == 1 rows that were also predicted as 1
true_interactions = results.loc[results['label'] == 1]
n_true_correct = int((true_interactions['prediction'] == 1).sum())
true_accuracy = n_true_correct / len(true_interactions)

for class_name, class_accuracy in (('False', false_accuracy), ('True', true_accuracy)):
    print('Accuracy on {} interactions: {:.2f}'.format(class_name, class_accuracy))

Accuracy on False interactions: 1.00
Accuracy on True interactions: 0.00


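### Very bad: the model predicts essentially every sample as a False interaction, so the 0.98 overall accuracy only reflects the class imbalance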