In [4]:
import numpy as np
from sklearn import linear_model
from sklearn.svm import SVC
import GO_utils
import utils

## Specify File Paths

In [6]:
# Base directories that the input file paths below are built from.
DATA_DIR = '../data/'            # directory holding the GO / mapping / sample files
RPKM_DIR = '../../CS341_Data/'   # directory holding the RPKM expression file

# If the gene2go file doesn't exist, run gene2go = download_ncbi_associations() first.
gene2go_file_path = DATA_DIR + 'gene2go.txt'
rpkm_file_path = RPKM_DIR + 'transcript_rpkm_in_go_nonzero_exp.txt'
gene_count_file_path = DATA_DIR + 'supp_GO_term_gene_counts.txt'
biomart_file_path = DATA_DIR + 'biomart_ensembl_to_entrez.txt'
sample_tissue_path = DATA_DIR + 'sampleID_tissue.txt'
obo_file_path = DATA_DIR + 'go-basic.obo'

In [None]:
## Get Gene Annotations for all GO terms in Supplementary File (include the GO descendant terms)

In [7]:
# Fetch the GO terms (with their descendant terms) and hand them straight to
# GO_utils.sort_go_terms. Judging by the printed output the result is ordered by
# decreasing gene count -- TODO confirm against GO_utils.
GO_terms = GO_utils.sort_go_terms(
    GO_utils.get_go_terms_descendants(
        biomart_file_path,
        gene2go_file_path,
        gene_count_file_path,
        obo_file_path,
        ev_codes=None
    )
)

# Show the first ten terms with the number of genes attached to each.
print('Top GO terms')
for go_term in GO_terms[:10]:
    print('%s   %s' % (go_term.id, len(go_term.genes)))

# The rest of the notebook models the first term in the sorted list.
term = GO_terms[0]
ensembl_ids = term.genes

16439 GO terms associated with human NCBI Entrez GeneIDs


load obo file ../data/go-basic.obo


../data/go-basic.obo: format-version(1.2) data-version(releases/2016-04-27)


46518 nodes imported


Top GO terms
GO:0007166   2018
GO:0007186   1128
GO:0051960   741
GO:0050767   659
GO:0007167   652
GO:0045664   549
GO:0007169   483
GO:0019221   449
GO:0051962   432
GO:0002694   425


## 1st Pass Through Dataset: Obtain positive training examples

In [10]:
# Number of features per gene handed to GO_utils; it matches the width of the
# feature matrix printed below.
NUM_FEATURES = 8555

# First pass over the RPKM file: collect the rows for the genes annotated with `term`.
(gene_features,
 positive_example_rows,
 gene_ids_ordered,
 num_transcripts) = GO_utils.get_positive_examples(rpkm_file_path, ensembl_ids, NUM_FEATURES)

print('After pass 1 (inserting positive examples), gene feature matrix has dimension:  %s'
      % (gene_features.shape,))

# Use a balanced data set: one negative example for every positive one.
num_positive_examples = len(positive_example_rows)
num_negative_examples = num_positive_examples
num_examples = num_positive_examples + num_negative_examples
print('num pos:  %s' % num_positive_examples)
print('num neg:  %s' % num_negative_examples)

After pass 1 (inserting positive examples), gene feature matrix has dimension:  (1931, 8555)
num pos:  1931
num neg:  1931


## 2nd Pass through dataset: Obtain an equal number of negative training examples

In [11]:
# Ask utils.rand_sample_exclude for `num_negative_examples` row indices out of
# range(num_transcripts), passing the positive rows as the `exclude` set.
neg_rows = utils.rand_sample_exclude(range(num_transcripts), num_negative_examples,
                                     exclude=positive_example_rows)

# Second pass over the RPKM file: read the feature rows for the sampled indices.
gene_features_neg, gene_ids_ordered_neg = \
    GO_utils.get_negative_examples(rpkm_file_path, neg_rows, NUM_FEATURES)

# Rebuild the combined data from the positive block only (the first
# `num_positive_examples` entries) rather than appending onto whatever is
# currently stored. This keeps the cell idempotent: re-running it replaces the
# negatives instead of stacking another batch, which would leave the feature
# matrix out of step with the label vector built in the next cell.
# NOTE(review): assumes gene_ids_ordered holds one id per positive row after pass 1.
gene_features = np.append(gene_features[:num_positive_examples], gene_features_neg, axis=0)
gene_ids_ordered = gene_ids_ordered[:num_positive_examples] + list(gene_ids_ordered_neg)

print('After pass 2 (inserting negative examples), gene feature matrix has dimension:  %s'
      % (gene_features.shape,))

After pass 2 (inserting negative examples), gene feature matrix has dimension:  (3862, 8555)


## Add Binary Labels to the Data and split into train/test

In [13]:
# One label per row of gene_features: 1 for each positive example (inserted first),
# then 0 for each negative example (appended afterwards).
labels = [1] * num_positive_examples + [0] * num_negative_examples

# Split features, labels and gene ids into train/test with train_set_size=0.7.
train, test = utils.split_data(gene_features, labels, gene_ids_ordered, train_set_size=0.7)
print('Obtained training & testing data')

num examples:  3862
Dimensionality of training set:  (2704, 8555)
Dimensionality of test set:  (1158, 8555)
Obtained training & testing data


## Evaluate Various Models

## Logistic Regression With 10-Fold CV, L2 Norm

In [15]:
num_folds = 10        # number of cross-validation folds
loss_function = 'l2'  # regularization penalty ('l1' or 'l2'), passed as `penalty` below

# Cross-validated logistic regression, fitted on the training split.
logreg_cv_L2 = linear_model.LogisticRegressionCV(penalty=loss_function, cv=num_folds)
logreg_cv_L2.fit(train.gene_features, train.labels)

# Score the fitted model on the held-out test split.
pred_lr_cv_L2 = logreg_cv_L2.predict(test.gene_features)
utils.print_prediction_results(
    'Cross-Validated Logistic Regression',
    test.labels,
    pred_lr_cv_L2,
    other_info='Norm: %s, # of Folds: %s' % (loss_function, num_folds)
)

--------------------
Cross-Validated Logistic Regression
--------------------
Root Mean Square Error:  0.607239001981
ROC AUC Score:  0.631123970887
False positive rate:  0.145077720207
False negative rate:  0.22366148532
Norm: l2, # of Folds: 10


## Logistic Regression With 3-Fold CV, L1 Norm

In [None]:
num_folds = 3         # number of folds to use for cross-validation
loss_function = 'l1'  # regularization penalty ('l1' or 'l2'), passed as `penalty` below

# The L1 penalty is fitted with the liblinear solver, as selected explicitly here.
logreg_cv_L1 = linear_model.LogisticRegressionCV(cv=num_folds, penalty=loss_function, solver='liblinear')
logreg_cv_L1.fit(train.gene_features, train.labels)
pred_lr_cv_L1 = logreg_cv_L1.predict(test.gene_features)

# print_prediction_results lives in the `utils` module; the notebook only does
# `import utils`, so the call must be qualified (the bare name raised NameError).
utils.print_prediction_results('Cross-Validated Logistic Regression', test.labels, pred_lr_cv_L1,
                               other_info='Norm: ' + loss_function + ', # of Folds: ' + str(num_folds))

## SVM

In [None]:
# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18, and the old
# sklearn.grid_search module was removed in 0.20. Prefer the new location and
# fall back to the old one so the cell runs on either side of the move.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# No separate dev split is made: GridSearchCV cross-validates each candidate on
# the training set.
svc = SVC(kernel='rbf')
Cs = np.logspace(-6, -1, 10)  # 10 candidate C values, log-spaced from 1e-6 to 1e-1
clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs), n_jobs=-1)  # n_jobs=-1: use all cores
clf.fit(train.gene_features, train.labels)

print('Best score:  %s' % clf.best_score_)
best_C = clf.best_estimator_.C
print('Best C:  %s' % best_C)

In [None]:
# Train an RBF-kernel SVM with the C value chosen by the grid search
# (fit() returns the estimator itself, so construction and fitting can be chained).
clf = SVC(C=best_C, kernel='rbf').fit(train.gene_features, train.labels)

# Predict on the held-out test split and report the results.
pred_svm = clf.predict(test.gene_features)
utils.print_prediction_results('SVM', test.labels, pred_svm)