# Correlation-Based Random Forest Classification

In [1]:
import numpy as np
import pandas as pd
import sklearn

In [100]:
# Load the correlation training data and preview ten random rows.
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv(..., index_col=0)` keeps the same "first column is the index"
# behaviour. (from_csv also tried to parse the index as dates; that is not
# reproduced here — the index shown in the preview is integer-valued.)
correlation_training_data = pd.read_csv(
    'checkpoint_files/top_4_correlation_training_data.csv',
    index_col=0,
)
correlation_training_data.sample(n=10)

Unnamed: 0,pert_id,target,label,A375,A549,MCF7,PC3
6718,BRD-K64052750,XPNPEP3,-1,0.061478,-0.001093,-0.034774,-0.017798
6554,BRD-M64432851,DNPEP,-1,0.080946,0.076358,-0.039057,-0.02639
6651,BRD-K64052750,STAT5B,-1,-0.043271,0.040655,-0.0569,-0.057506
605,BRD-K94441233,CDK4,-1,0.166037,0.314397,-0.014367,0.394319
9243,BRD-K23363278,PDPK1,-1,0.15338,0.086145,0.232274,-0.063573
8186,BRD-K54256913,PYGL,-1,-0.131308,-0.043886,-0.074298,-0.001108
3682,BRD-K92241597,VKORC1,-1,-0.144646,-0.023281,-0.010137,0.004164
621,BRD-K94441233,LCK,-1,0.168164,0.245319,0.047586,0.280752
8473,BRD-K64890080,RRM1,-1,0.078507,0.11221,0.176226,0.124498
6809,BRD-K17953061,EPHB4,-1,0.38272,0.166355,0.3242,0.261688


In [101]:
# Balance True and False samples: keep every positive row (label == 1) and
# down-sample the negative rows (label == -1) to the same count.
pos_samples = correlation_training_data[correlation_training_data.label == 1]
# A fixed random_state makes the down-sampled negatives -- and therefore every
# metric computed further down -- reproducible after a kernel restart.
neg_samples = (
    correlation_training_data[correlation_training_data.label == -1]
    .sample(n=pos_samples.shape[0], random_state=1)
)
# np.vstack returns a plain 2-D array (column names are dropped):
# positives first, then the sampled negatives.
balanced_training_data = np.vstack([pos_samples, neg_samples])

In [102]:
# Shuffle the balanced rows with a fixed seed (1) so the row order is repeatable.
perm = np.random.RandomState(seed=1).permutation(len(balanced_training_data))
shuffled_training_data = balanced_training_data[perm, :]

In [103]:
# Split the shuffled array into labels and features, both cast to float.
# Column 2 is the label; columns 3..6 are the four numeric feature columns
# (A375, A549, MCF7, PC3 in the column order shown in the data preview).
y = shuffled_training_data[:, 2].astype(np.float64)
X = shuffled_training_data[:, 3:7].astype(np.float64)

In [104]:
# Standardize the feature columns; the scaler is fitted on the same X it transforms.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_std = sc.transform(X)

In [109]:
# Fit a small, shallow random forest (10 trees, depth <= 2) on the scaled features.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=0,  # fixed seed so the fitted forest is repeatable
)
clf.fit(X_std, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [110]:
# Mean accuracy on the same (X_std, y) the forest was fitted on, i.e. this is
# training-set accuracy, not a held-out estimate.
print('Accuracy: {:.2f}'.format(clf.score(X=X_std, y=y)))

Accuracy: 0.61


In [114]:
# Rebuild a labelled frame from the shuffled array (reusing the original column
# names) and attach the forest's integer predictions for the same rows.
results = pd.DataFrame(
    data=shuffled_training_data,
    columns=correlation_training_data.columns,
).assign(prediction=clf.predict(X_std).astype(int))

In [112]:
# Per-class accuracy: of the rows whose true label is -1 (resp. 1), the
# fraction that the forest also predicted as -1 (resp. 1).
false_interactions = results.loc[results.label == -1]
false_accuracy = (
    len(false_interactions.loc[false_interactions.prediction == -1])
    / len(false_interactions)
)

true_interactions = results.loc[results.label == 1]
true_accuracy = (
    len(true_interactions.loc[true_interactions.prediction == 1])
    / len(true_interactions)
)

print('Accuracy on False interactions: {:.2f}'.format(false_accuracy))
print('Accuracy on True interactions: {:.2f}'.format(true_accuracy))

Accuracy on False interactions: 0.75
Accuracy on True interactions: 0.48


## What if we don't balance the data beforehand?

In [115]:
# Shuffle every row of the full (unbalanced) data set with a fixed seed (1).
perm = np.random.RandomState(seed=1).permutation(len(correlation_training_data))
shuffled_training_data = correlation_training_data.values[perm, :]

In [116]:
# Split the shuffled array into labels and features, both cast to float.
# Column 2 is the label; columns 3..6 are the four numeric feature columns
# (A375, A549, MCF7, PC3 in the column order shown in the data preview).
y = shuffled_training_data[:, 2].astype(np.float64)
X = shuffled_training_data[:, 3:7].astype(np.float64)

In [117]:
# Standardize the feature columns; the scaler is fitted on the same X it transforms.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_std = sc.transform(X)

In [118]:
# Random forest classification on the full, unbalanced data.
from sklearn.ensemble import RandomForestClassifier
# n_estimators is set explicitly to 10 so this forest matches the one trained
# on the balanced data earlier in the notebook. Leaving it implicit makes the
# comparison depend on the installed scikit-learn version (the default was 10
# before 0.22 and is 100 from 0.22 onwards).
clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10)
clf.fit(X_std, y)
# Scored on the same rows the forest was fitted on, i.e. training-set accuracy.
print('Accuracy: {:.2f}'.format(clf.score(X_std, y)))

Accuracy: 0.98


In [119]:
# Look at the accuracy on true vs. false interactions separately: of the rows
# whose true label is -1 (resp. 1), the fraction also predicted as -1 (resp. 1).
results = pd.DataFrame(
    data=shuffled_training_data,
    columns=correlation_training_data.columns,
).assign(prediction=clf.predict(X_std).astype(int))

false_interactions = results.loc[results.label == -1]
false_accuracy = (
    len(false_interactions.loc[false_interactions.prediction == -1])
    / len(false_interactions)
)

true_interactions = results.loc[results.label == 1]
true_accuracy = (
    len(true_interactions.loc[true_interactions.prediction == 1])
    / len(true_interactions)
)

print('Accuracy on False interactions: {:.2f}'.format(false_accuracy))
print('Accuracy on True interactions: {:.2f}'.format(true_accuracy))

Accuracy on False interactions: 1.00
Accuracy on True interactions: 0.00


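### Very bad: without balancing, the classifier predicts the negative class for essentially every sample (accuracy on true interactions is 0.00)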