In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import sys

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC as scikit_SVC

from models import RandomForest, SVC
from metrics import ClassificationResult
from utils import get_features, get_labels, subsample_data, normalize_features, reconstruct_2d, printMatrix, binarize, zscore, get_2D, plot_prediction, change_scale

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX TITAN X (CNMeM is enabled with initial size: 2500 MB, cuDNN 5105)


In [2]:
# Contact type; it selects the dataset directory 'dist_matched_<contacts>'.
contacts = 'PP'
# Root folder holding the datasets. The default is the location this notebook
# was originally written against; set the DATASETS_3D_DIR environment variable
# to run it on another machine without editing the cell.
data_root = os.environ.get('DATASETS_3D_DIR', '/users/mtaranov/datasets_3d')
# The final '' component keeps the trailing separator `path` has always had,
# so file names can still be appended to it directly.
path = os.path.join(data_root, 'dist_matched_' + contacts, '')
# Suffix shared by every feature and label file loaded below.
split_suffix = '_thres_10.npy'

# Features live in the 'motifs/' sub-folder, labels directly under `path`.
X_train = get_features(path + 'motifs/X_train' + split_suffix)
y_train = get_labels(path + 'y_train' + split_suffix)
X_valid = get_features(path + 'motifs/X_valid' + split_suffix)
y_valid = get_labels(path + 'y_valid' + split_suffix)
X_test = get_features(path + 'motifs/X_test' + split_suffix)
y_test = get_labels(path + 'y_test' + split_suffix)

In [3]:
# Normalize the three splits with the project helper; it is called once with
# train, validation and test together and returns them in that same order.
normalized_splits = normalize_features(X_train, X_valid, X_test)
X_train_normalized, X_valid_normalized, X_test_normalized = normalized_splits

In [4]:
def _flatten_to_pairs(features):
    """Reshape `features` to 2-D: (shape[0], shape[2] * shape[3]).

    Axis 1 is not part of the target shape, so the reshape only succeeds
    when the total element count already matches (e.g. axis 1 has length 1).
    """
    n_samples = features.shape[0]
    n_columns = features.shape[2] * features.shape[3]
    return features.reshape(n_samples, n_columns)


# One flat feature row per sample, for each of the three splits.
X_train_pairs = _flatten_to_pairs(X_train_normalized)
X_valid_pairs = _flatten_to_pairs(X_valid_normalized)
X_test_pairs = _flatten_to_pairs(X_test_normalized)

# Random Forest

In [5]:
# Seed NumPy's global random state before building the forest so that a
# Restart & Run All can reproduce the scores reported below.
# NOTE(review): this assumes the project's RandomForest wrapper ends up drawing
# from NumPy's global RNG (e.g. a scikit-learn estimator left at
# random_state=None) — confirm in models.py, or pass a seed to the wrapper.
np.random.seed(0)
rf = RandomForest()

### with motifs only

In [6]:
# Motif-only experiment: keep every column from index 22 onwards.
motif_cols = slice(22, None)

rf.train(X_train_pairs[:, motif_cols], y_train)
preds_test = rf.predict(X_test_pairs[:, motif_cols])
preds_train = rf.predict(X_train_pairs[:, motif_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

  self.classifier.fit(X, y)


On Test:
Balanced Accuracy: 72.84%	 auROC: 0.810	 auPRC: 0.834	 auPRG: 0.640
Recall at 5%|10%|20% FDR: 29.6%|33.7%|65.0%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with atac only

In [7]:
# ATAC-only experiment: keep just the first two columns.
atac_cols = slice(None, 2)

rf.train(X_train_pairs[:, atac_cols], y_train)
preds_test = rf.predict(X_test_pairs[:, atac_cols])
preds_train = rf.predict(X_train_pairs[:, atac_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 62.96%	 auROC: 0.690	 auPRC: 0.725	 auPRG: 0.338
Recall at 5%|10%|20% FDR: 15.6%|18.5%|39.9%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 97.61%	 auROC: 0.995	 auPRC: 0.994	 auPRG: 0.993
Recall at 5%|10%|20% FDR: 98.6%|99.1%|99.1%	 Num Positives: 566	 Num Negatives: 566	 



### with distance (genomic locations) only

In [8]:
# Distance-only experiment: keep columns 20 and 21.
distance_cols = slice(20, 22)

rf.train(X_train_pairs[:, distance_cols], y_train)
preds_test = rf.predict(X_test_pairs[:, distance_cols])
preds_train = rf.predict(X_train_pairs[:, distance_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 73.66%	 auROC: 0.783	 auPRC: 0.796	 auPRG: 0.578
Recall at 5%|10%|20% FDR: 0.0%|25.9%|45.7%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with motifs+atac

In [9]:
# Motifs + ATAC: columns 0-1 plus the 1920*2 columns that start at index 22.
# The index list is built once and reused for every selection below.
motif_atac_cols = [0, 1] + list(range(22, 22 + 1920 * 2))

rf.train(X_train_pairs[:, motif_atac_cols], y_train)
preds_test = rf.predict(X_test_pairs[:, motif_atac_cols])
preds_train = rf.predict(X_train_pairs[:, motif_atac_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 70.78%	 auROC: 0.806	 auPRC: 0.831	 auPRG: 0.620
Recall at 5%|10%|20% FDR: 32.9%|41.6%|58.0%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with motifs+atac+distance

In [10]:
# Motifs + ATAC + distance: columns 0-1, columns 20-21, and the 1920*2
# columns that start at index 22. The index list is built once and reused.
combined_cols = [0, 1, 20, 21] + list(range(22, 22 + 1920 * 2))

rf.train(X_train_pairs[:, combined_cols], y_train)
preds_test = rf.predict(X_test_pairs[:, combined_cols])
preds_train = rf.predict(X_train_pairs[:, combined_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 71.19%	 auROC: 0.813	 auPRC: 0.844	 auPRG: 0.648
Recall at 5%|10%|20% FDR: 34.6%|45.7%|66.7%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



# SVC

In [27]:
# Project wrapper from `models`; later cells replace its `classifier`
# attribute with a scikit-learn SVC (`scikit_SVC`), which is imported in the
# notebook's top import cell together with the other dependencies.
svc = SVC()

# Linear - SVM

In [28]:
# Replace the wrapper's estimator with a linear-kernel scikit-learn SVC.
# With probability=True scikit-learn fits an additional cross-validated
# calibration step that shuffles the data; fixing random_state keeps the
# resulting probability estimates reproducible from run to run.
svc.classifier = scikit_SVC(probability=True, kernel='linear', random_state=0)

### with motifs only

In [30]:
# Sanity check: dimensions of the motif-only training matrix
# (every column from index 22 onwards).
motif_only_train = X_train_pairs[:, 22:]
motif_only_train.shape

(1132, 3840)

In [31]:
# Motif-only experiment: keep every column from index 22 onwards.
# Trains whichever estimator is currently assigned to svc.classifier.
motif_cols = slice(22, None)

svc.train(X_train_pairs[:, motif_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, motif_cols])
preds_train = svc.predict(X_train_pairs[:, motif_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 72.02%	 auROC: 0.782	 auPRC: 0.784	 auPRG: 0.583
Recall at 5%|10%|20% FDR: 10.3%|15.6%|46.9%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with atac only

In [32]:
# ATAC-only experiment: keep just the first two columns.
# Trains whichever estimator is currently assigned to svc.classifier.
atac_cols = slice(None, 2)

svc.train(X_train_pairs[:, atac_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, atac_cols])
preds_train = svc.predict(X_train_pairs[:, atac_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 57.00%	 auROC: 0.568	 auPRC: 0.609	 auPRG: 0.090
Recall at 5%|10%|20% FDR: 3.3%|3.3%|5.3%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 60.87%	 auROC: 0.604	 auPRC: 0.662	 auPRG: 0.177
Recall at 5%|10%|20% FDR: 3.7%|5.5%|20.5%	 Num Positives: 566	 Num Negatives: 566	 



### with distance (genomic locations) only

In [15]:
# Distance-only experiment: keep columns 20 and 21.
# Trains whichever estimator is currently assigned to svc.classifier.
# NOTE(review): this cell's saved execution count (15) is lower than that of
# the cell selecting the linear kernel (28); re-run in order to confirm the
# saved scores belong to the linear SVC.
distance_cols = slice(20, 22)

svc.train(X_train_pairs[:, distance_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, distance_cols])
preds_train = svc.predict(X_train_pairs[:, distance_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 58.44%	 auROC: 0.619	 auPRC: 0.571	 auPRG: 0.237
Recall at 5%|10%|20% FDR: 0.0%|0.0%|0.0%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 56.71%	 auROC: 0.591	 auPRC: 0.571	 auPRG: 0.160
Recall at 5%|10%|20% FDR: 0.0%|0.0%|0.0%	 Num Positives: 566	 Num Negatives: 566	 



### with motifs+atac

In [26]:
# Motifs + ATAC: columns 0-1 plus the 1920*2 columns that start at index 22.
# Trains whichever estimator is currently assigned to svc.classifier.
# NOTE(review): this cell's saved execution count (26) is lower than that of
# the cell selecting the linear kernel (28); re-run in order to confirm the
# saved scores belong to the linear SVC.
motif_atac_cols = [0, 1] + list(range(22, 22 + 1920 * 2))

svc.train(X_train_pairs[:, motif_atac_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, motif_atac_cols])
preds_train = svc.predict(X_train_pairs[:, motif_atac_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 72.22%	 auROC: 0.793	 auPRC: 0.795	 auPRG: 0.611
Recall at 5%|10%|20% FDR: 10.7%|21.8%|56.8%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with motifs+atac+distance

In [34]:
# Motifs + ATAC + distance: columns 0-1, columns 20-21, and the 1920*2
# columns that start at index 22.
# Trains whichever estimator is currently assigned to svc.classifier.
combined_cols = [0, 1, 20, 21] + list(range(22, 22 + 1920 * 2))

svc.train(X_train_pairs[:, combined_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, combined_cols])
preds_train = svc.predict(X_train_pairs[:, combined_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 72.84%	 auROC: 0.794	 auPRC: 0.797	 auPRG: 0.615
Recall at 5%|10%|20% FDR: 11.1%|23.5%|54.7%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 100.00%	 auROC: 1.000	 auPRC: 1.000	 auPRG: 1.000
Recall at 5%|10%|20% FDR: 100.0%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



# RBF - SVM

In [35]:
# Replace the wrapper's estimator with an RBF-kernel scikit-learn SVC.
# With probability=True scikit-learn fits an additional cross-validated
# calibration step that shuffles the data; fixing random_state keeps the
# resulting probability estimates reproducible from run to run.
svc.classifier = scikit_SVC(probability=True, kernel='rbf', random_state=0)

### with motifs only

In [36]:
# Motif-only experiment: keep every column from index 22 onwards.
# Trains whichever estimator is currently assigned to svc.classifier.
motif_cols = slice(22, None)

svc.train(X_train_pairs[:, motif_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, motif_cols])
preds_train = svc.predict(X_train_pairs[:, motif_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 71.19%	 auROC: 0.800	 auPRC: 0.816	 auPRG: 0.618
Recall at 5%|10%|20% FDR: 17.3%|36.6%|60.5%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 97.08%	 auROC: 0.998	 auPRC: 0.998	 auPRG: 0.997
Recall at 5%|10%|20% FDR: 99.1%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with atac only

In [37]:
# ATAC-only experiment: keep just the first two columns.
# Trains whichever estimator is currently assigned to svc.classifier.
atac_cols = slice(None, 2)

svc.train(X_train_pairs[:, atac_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, atac_cols])
preds_train = svc.predict(X_train_pairs[:, atac_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 61.73%	 auROC: 0.656	 auPRC: 0.658	 auPRG: 0.287
Recall at 5%|10%|20% FDR: 0.4%|0.4%|0.4%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 68.46%	 auROC: 0.727	 auPRC: 0.734	 auPRG: 0.462
Recall at 5%|10%|20% FDR: 1.2%|2.1%|45.1%	 Num Positives: 566	 Num Negatives: 566	 



### with distance (genomic locations) only

In [38]:
# Distance-only experiment: keep columns 20 and 21.
# Trains whichever estimator is currently assigned to svc.classifier.
distance_cols = slice(20, 22)

svc.train(X_train_pairs[:, distance_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, distance_cols])
preds_train = svc.predict(X_train_pairs[:, distance_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 58.64%	 auROC: 0.635	 auPRC: 0.591	 auPRG: 0.259
Recall at 5%|10%|20% FDR: 0.0%|0.0%|0.0%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 57.24%	 auROC: 0.597	 auPRC: 0.558	 auPRG: 0.178
Recall at 5%|10%|20% FDR: 0.2%|0.2%|0.2%	 Num Positives: 566	 Num Negatives: 566	 



### with motifs+atac

In [39]:
# Motifs + ATAC: columns 0-1 plus the 1920*2 columns that start at index 22.
# Trains whichever estimator is currently assigned to svc.classifier.
motif_atac_cols = [0, 1] + list(range(22, 22 + 1920 * 2))

svc.train(X_train_pairs[:, motif_atac_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, motif_atac_cols])
preds_train = svc.predict(X_train_pairs[:, motif_atac_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 72.43%	 auROC: 0.804	 auPRC: 0.823	 auPRG: 0.627
Recall at 5%|10%|20% FDR: 19.3%|39.9%|60.1%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 97.35%	 auROC: 0.998	 auPRC: 0.998	 auPRG: 0.998
Recall at 5%|10%|20% FDR: 99.5%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 



### with motifs+atac+distance

In [40]:
# Motifs + ATAC + distance: columns 0-1, columns 20-21, and the 1920*2
# columns that start at index 22.
# Trains whichever estimator is currently assigned to svc.classifier.
combined_cols = [0, 1, 20, 21] + list(range(22, 22 + 1920 * 2))

svc.train(X_train_pairs[:, combined_cols], y_train)
preds_test = svc.predict(X_test_pairs[:, combined_cols])
preds_train = svc.predict(X_train_pairs[:, combined_cols])

# Held-out scores first, then the scores on the data the model was fit on.
test_summary = ClassificationResult(y_test, preds_test)
print('On Test:\n{}\n'.format(test_summary))
train_summary = ClassificationResult(y_train, preds_train)
print('On Train:\n{}\n'.format(train_summary))

On Test:
Balanced Accuracy: 72.43%	 auROC: 0.804	 auPRC: 0.823	 auPRG: 0.628
Recall at 5%|10%|20% FDR: 20.2%|39.9%|60.1%	 Num Positives: 243	 Num Negatives: 243	 

On Train:
Balanced Accuracy: 97.44%	 auROC: 0.998	 auPRC: 0.998	 auPRG: 0.998
Recall at 5%|10%|20% FDR: 99.5%|100.0%|100.0%	 Num Positives: 566	 Num Negatives: 566	 

