In [12]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
sys.path.append('/users/mtaranov/genome3D/')
from models_3d import Genome3D_SVM_RBF
from metrics import ClassificationResult
from utils import get_features, get_labels, subsample_data, normalize_features, reconstruct_2d, printMatrix, binarize, zscore, get_2D, plot_prediction, change_scale 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
contacts='pe'
path='/users/mtaranov/datasets_3d/by_chr_dist_matched_'+contacts+'/'
X_train = get_features(path+'d0_X_train_thres_10.npy')
y_train = get_labels(path+'d0_y_train_thres_10.npy')
X_valid = get_features(path+'d0_X_valid_thres_10.npy')
y_valid = get_labels(path+'d0_y_valid_thres_10.npy')
X_test = get_features(path+'d0_X_test_thres_10.npy')
y_test = get_labels(path+'d0_y_test_thres_10.npy')

In [3]:
X_train_normalized, X_valid_normalized, X_test_normalized = normalize_features(X_train, X_valid, X_test)

In [4]:
X_train_pairs = X_train_normalized.reshape(X_train_normalized.shape[0],X_train_normalized.shape[2]*X_train_normalized.shape[3])
X_valid_pairs = X_valid_normalized.reshape(X_valid_normalized.shape[0],X_valid_normalized.shape[2]*X_valid_normalized.shape[3])
X_test_pairs = X_test_normalized.reshape(X_test_normalized.shape[0],X_test_normalized.shape[2]*X_test_normalized.shape[3])

In [5]:
X_train_pairs.shape

(6710, 20)

# SVM-RBF

### w/o genomic locations

# Estimating hyper-parameter using held out validation set

In [6]:
X_train_valid_pairs = np.concatenate((X_train_pairs, X_valid_pairs), axis=0)
y_train_valid = np.concatenate((y_train, y_valid), axis=0)

In [7]:
# test_fold to 0 for all samples that are part of the validation set, and to -1 for all other samples.
valid_index=[-1 for i in range(X_train_pairs.shape[0])]+[0 for i in range(X_valid_pairs.shape[0])]

In [8]:
param_grid = {'gamma': [1e-3, 1e-4, 0.005, 0.05, 0.5],'C': [1, 10, 100]}

In [9]:
best_param={}

In [10]:
svm = Genome3D_SVM_RBF(best_param)

In [13]:
best_param = svm.train_cross_val(X_train_valid_pairs[:,:18], [i for i in y_train_valid[:,0]], valid_index, param_grid)

In [14]:
best_param

{'C': 1, 'gamma': 0.05}

# SVM-RBF Predictions

In [15]:
svm = Genome3D_SVM_RBF(best_param)

In [16]:
svm.train(X_train_pairs[:,:18], y_train)
preds_test = svm.predict(X_test_pairs[:,:18])
preds_train = svm.predict(X_train_pairs[:,:18])
print ('On Test:\n{}\n'.format(ClassificationResult(y_test, preds_test)))
print ('On Train:\n{}\n'.format(ClassificationResult(y_train, preds_train)))

  y = column_or_1d(y, warn=True)


On Test:
Balanced Accuracy: 69.64%	 auROC: 0.773	 auPRC: 0.749	 auPRG: 0.552
Positve Accuracy: 70.45%	 Negative Accuracy: 68.83%
Recall at 5%|10%|20% FDR: 2.4%|2.6%|16.4%	 Num Positives: 494	 Num Negatives: 494	 

On Train:
Balanced Accuracy: 77.78%	 auROC: 0.856	 auPRC: 0.859	 auPRG: 0.744
Positve Accuracy: 78.57%	 Negative Accuracy: 76.99%
Recall at 5%|10%|20% FDR: 24.4%|53.0%|73.8%	 Num Positives: 3355	 Num Negatives: 3355	 



# SVM-RBF with default hyper-parameters

In [18]:
svm.train(X_train_pairs[:,:18], y_train)
preds_test = svm.predict(X_test_pairs[:,:18])
preds_train = svm.predict(X_train_pairs[:,:18])
print ('On Test:\n{}\n'.format(ClassificationResult(y_test, preds_test)))
print ('On Train:\n{}\n'.format(ClassificationResult(y_train, preds_train)))

On Test:
Balanced Accuracy: 69.64%	 auROC: 0.773	 auPRC: 0.749	 auPRG: 0.552
Positve Accuracy: 70.45%	 Negative Accuracy: 68.83%
Recall at 5%|10%|20% FDR: 2.4%|2.6%|16.4%	 Num Positives: 494	 Num Negatives: 494	 

On Train:
Balanced Accuracy: 77.75%	 auROC: 0.856	 auPRC: 0.859	 auPRG: 0.744
Positve Accuracy: 78.57%	 Negative Accuracy: 76.93%
Recall at 5%|10%|20% FDR: 24.4%|53.0%|73.8%	 Num Positives: 3355	 Num Negatives: 3355	 

