In [1]:
import numpy as np
import pandas as pd
print pd.__version__

0.19.1


In [38]:
from sklearn.model_selection import train_test_split

data_dir = "data/pdaid/"
disease_groups = ["bacterial", "parasitic", "viral", "fungal"]

def data_cleanup(group, group_name):
    group["disease"] = group_name
    group['disease_id'] = disease_groups.index(group_name)
    group.columns = [
        'type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn', 'protein',
        'protein_id', 'organism', 'oid', 'comments', 'disease', 'disease_id'
    ]
    # Remove entries with '+' notation (note: looking into this, e.g. "PLNISLGDVVLY + DEAM(N3)")
    group = group[group.epitope_sequence.str.find('+') == -1]
    group = group[["epitope_sequence", "disease", "disease_id"]]
    #group = group[group.epitope_sequence.str.find("Z") == -1]
    group = group[group.duplicated(subset=["epitope_sequence"], keep=False) == False]
    return group

bacterial = data_cleanup(pd.DataFrame.from_csv(data_dir + "bacterial.csv"), "bacterial")
fungal = data_cleanup(pd.DataFrame.from_csv(data_dir + "fungal.csv"), "fungal")
parasitic = data_cleanup(pd.DataFrame.from_csv(data_dir + "parasitic.csv"), "parasitic")
viral = data_cleanup(pd.DataFrame.from_csv(data_dir + "viral.csv"), "viral")
  
print "Input -- Bacterial: %d, Fungal: %d, Parasitic: %d, Viral: %d" % \
    (bacterial.shape[0], fungal.shape[0], parasitic.shape[0], viral.shape[0])

# Consider putting fungal back in?
all_samples = bacterial\
    .append(parasitic, ignore_index=True)\
    .append(viral, ignore_index= True)

train, test = train_test_split(all_samples, test_size = 0.25)

print "%d training and %d test samples" % (train.shape[0], test.shape[0])

Input -- Bacterial: 3468, Fungal: 39, Parasitic: 3431, Viral: 7937
11127 training and 3709 test samples


In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

#parameters = {'vect__ngram_range': [(1,2), (1,3), (1,9), (1,12)],
#              'tfidf__use_idf': (True, False),
#              'tfidf__smooth_idf': (True, False),
#              'tfidf__sublinear_tf': (True, False),
#              'tfidf__norm': ("l1", "l2", None),
#              'clf__alpha': (1e-1, 1e-2),
#              'clf__fit_prior': (True, False)}

clf = Pipeline([('vect', CountVectorizer(analyzer='char', lowercase=True, ngram_range=(1, 9))),
                ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=False)),
                ('clf', MultinomialNB(alpha=0.01, fit_prior=True))
])

#gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)

clf.fit(train.epitope_sequence, train.disease_id)
predicted = clf.predict(test.epitope_sequence)

print classification_report(test.disease_id, predicted, target_names=disease_groups)

cm = confusion_matrix(test.disease_id, predicted)
print "                                   ACTUAL"
print "                             bact  paras  viral"
for (row, line) in enumerate(cm):
    print "predicted %16s %s" % (disease_groups[row], " ".join(["%6s" % (i) for i in line]))

             precision    recall  f1-score   support

  bacterial       0.80      0.80      0.80       859
  parasitic       0.95      0.86      0.90       848
      viral       0.89      0.92      0.91      2002

avg / total       0.88      0.88      0.88      3709

                                   ACTUAL
                             bact  paras  viral
predicted        bacterial    687     20    152
predicted        parasitic     40    729     79
predicted            viral    136     18   1848


In [55]:
#print gs_clf.best_score_
#for param_name in sorted(parameters.keys()):
#     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))