<div class="alert alert-block alert-info">
__Name__: pangenome_indepth<br/>
__Description__: Identify top pangenome region (presence/absence) used in classification for PATRIC AMR data. Identify potential reasons for misclassification<br/>
__Author__: Matthew Whiteside matthew dot whiteside at canada dot ca<br/>
__Date__: Oct 10, 2017<br/>
__TODO__:<br/>
</div>

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import os

In [2]:
os.chdir('../pangenome')

In [3]:
import config
import utils
import classify

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# Load pangenome regions
pg, genome_list, locus_list = utils.read_panseq(config.PANSEQ['pangenome_file'])

In [6]:
# Load amr resistance phenotypes
amr,amr_list = utils.read_amr(config.PHENOTYPE['amr_file'], genome_list)

In [7]:
# Load pangenome loci annotations
annot = utils.read_annot(config.ANNOTATION['blast_file'])
annot

Unnamed: 0,LocusID,Accession,Description
0,3013261977000,gi|378982922|ref|YP_005246077.1|,DNA transfer protein [Salmonella enterica subs...
1,3013261978000,gi|526222303|ref|YP_008266337.1|,hypothetical protein SE451236_16640 [Salmonell...
2,3013261979000,gi|16765561|ref|NP_461176.1|,hypothetical protein STM2233 [Salmonella enter...
3,3013261980000,gi|353563111|gb|EHC29555.1|,phosphotransferase of PTS system [Salmonella e...
4,3013261981000,gi|487649446|ref|WP_001746646.1|,"galactonate dehydratase, partial [Salmonella e..."
5,3013261982000,gi|447178632|ref|WP_001255888.1|,integrase [Gammaproteobacteria]
6,3013261983000,gi|194738380|ref|YP_002115115.1|,tail protein [Salmonella enterica subsp. enter...
7,3013261984000,gi|487580929|ref|WP_001722937.1|,"mutagenesis protein, partial [Salmonella enter..."
8,3013261985000,gi|487517437|ref|WP_001707741.1|,"putative transposase, partial [Salmonella ente..."
9,3013261986000,gi|261888791|ref|YP_003264480.1|,replication protein [Salmonella enterica subsp...


In [8]:
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics

In [9]:
# Examine ampicillin
d = np.argwhere(amr_list == 'ampicillin').item(0)
validrows = ~np.isnan(amr[:,d])
validrows
X = pg[validrows,:]
y = amr[validrows,d]

In [182]:
# Ridge Regression Classifier
clf = RidgeClassifierCV(alphas=[54], cv=5)
clf.fit(X,y)
clf.alpha_

54

In [94]:
# Compute predicted values
clf2 = RidgeClassifier(alpha=54,solver="sag")
predicted = cross_val_predict(clf2, X, y, cv=5, n_jobs=4)

In [95]:
print(metrics.classification_report(y, predicted))

             precision    recall  f1-score   support

        0.0       0.86      0.50      0.63        64
        1.0       0.90      0.98      0.94       278

avg / total       0.89      0.89      0.88       342



In [287]:
fp = genome_list[(y != predicted) & (y == 0)]
fn = genome_list[(y != predicted) & (y == 1)]
print(metrics.confusion_matrix(y, predicted))

[[ 32  32]
 [  5 273]]


In [277]:
tops = np.argsort(np.absolute(clf.coef_[0]))[::-1]
for i in tops[:40]:
    hits = annot[annot.LocusID == locus_list[i]]
    print('{}\t{}\t{}\t{}'.format(locus_list[i],coeff[i],hits['Accession'].values[0],hits['Description'].values[0]))

18079571516000	0.12090173845392979	gi|553902307|ref|WP_023139374.1|	LysR family transcriptional regulator [Salmonella enterica]
21092833417000	0.12090173845392979	gi|169797577|ref|YP_001715370.1|	tetracycline repressor protein class G [Acinetobacter baumannii AYE]
19586202464000	0.12090173845392979	gi|487908194|ref|WP_001981660.1|	MFS transporter [Vibrio cholerae]
16572940561000	0.12083437099506932	gi|90265358|emb|CAJ77040.1|	Transposase [Acinetobacter baumannii]
28625988430000	0.12056397798091725	gi|550903217|ref|YP_008672309.1|	chloramphenicol and florfenicol resistance protein [Salmonella enterica subsp. enterica serovar Typhimurium str. DT104]
15066309608000	0.10814703104933744	gi|487909260|ref|WP_001982726.1|	hypothetical protein [Vibrio cholerae]
19586202470000	-0.08784118663281325	gi|485701964|ref|WP_001335071.1|	hypothetical protein, partial [Escherichia coli]
25612726264000	0.08398364519228638	gi|446675187|ref|WP_000752533.1|	hypothetical protein [Salmonella enterica]
21092833

In [294]:
fp.tolist()

['85569_dot_102',
 '85569_dot_110',
 '85569_dot_125',
 '85569_dot_128',
 '85569_dot_132',
 '85569_dot_141',
 '85569_dot_144',
 '85569_dot_167',
 '85569_dot_210',
 '85569_dot_218',
 '85569_dot_223',
 '85569_dot_225',
 '85569_dot_233',
 '85569_dot_240',
 '85569_dot_258',
 '85569_dot_259',
 '85569_dot_273',
 '85569_dot_283',
 '85569_dot_285',
 '85569_dot_307',
 '85569_dot_314',
 '85569_dot_320',
 '85569_dot_323',
 '85569_dot_333',
 '85569_dot_343',
 '85569_dot_35',
 '85569_dot_354',
 '85569_dot_355',
 '85569_dot_38',
 '85569_dot_40',
 '85569_dot_66',
 '85569_dot_90']

In [293]:
fn.tolist()

['85569_dot_105',
 '85569_dot_11',
 '85569_dot_345',
 '85569_dot_44',
 '85569_dot_86']

In [291]:
fp.shape

(32,)

In [10]:
# Ensemble Classifiers
# RandomForest
rfc = RandomForestClassifier(n_jobs=-1, max_features= 'sqrt' , n_estimators=50, oob_score = True) 
param_grid = { 
    'n_estimators': [50, 200, 700, 1000],
    'max_features': ['auto', 'sqrt', 'log2']
}

cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

In [309]:
import numpy as np
np.random.mtrand

AttributeError: module 'numpy.random' has no attribute 'mtrand'

In [11]:
cv_rfc.fit(X,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=True, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 200, 700, 1000], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [12]:
cv_rfc.best_params_

{'max_features': 'auto', 'n_estimators': 50}