In [1]:
import numpy as np
import pandas as pd
print pd.__version__

0.19.1


In [2]:
from sklearn.model_selection import train_test_split

# All data for this notebook can be found in data_dir
data_dir = "data/pdaid/"

disease_groups = ["bacterial", "parasitic", "viral"]

def data_prep(group_name):
    df = pd.DataFrame.from_csv(data_dir + group_name + ".csv")
    df["disease_group"] = group_name
    df['disease_id'] = disease_groups.index(group_name)
    df.columns = [
        'type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn', 'protein',
        'protein_id', 'organism', 'oid', 'comments', 'disease_group', 'disease_id'
    ]
    # Remove entries with '+' notation (note: looking into this, e.g. "PLNISLGDVVLY + DEAM(N3)")
    df = df[df.epitope_sequence.str.find('+') == -1]
    df = df[["epitope_sequence", "disease_group", "disease_id", "organism"]]
    
    #group = group[group.epitope_sequence.str.find("Z") == -1]
    df = df[df.duplicated(subset=["epitope_sequence"], keep=False) == False]
    return df

disease_dfs = []
for disease_group in disease_groups:
    disease_dfs.append(data_prep(disease_group))
  
print "Raw CSV files contains:"
group_describe = lambda name, df: "%16s: %3d unique organisms in %4d samples" % \
                            (name, df.organism.unique().shape[0], df.shape[0])
for (name, df) in zip(disease_groups, disease_dfs):
    print group_describe(name, df)
    
all_samples_df = disease_dfs[0]
for df in disease_dfs[1:]:
    all_samples_df = all_samples_df.append(df, ignore_index=True)

train, test = train_test_split(all_samples_df, test_size = 0.25)

print "\nCreated %d training and %d test samples" % (train.shape[0], test.shape[0])

Raw CSV files contains:
       bacterial:  79 unique organisms in 3468 samples
       parasitic:  61 unique organisms in 3431 samples
           viral: 263 unique organisms in 7937 samples

Created 11127 training and 3709 test samples


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

#parameters = {'vect__ngram_range': [(1,3), (1,9), (1,12)],
#              'tfidf__use_idf': (True, False),
#              'tfidf__smooth_idf': (True, False),
#              'tfidf__sublinear_tf': (True, False),
#              'tfidf__norm': ("l1", "l2", None),
#              'clf__alpha': (1e-1, 1e-2),
#              'clf__fit_prior': (True, False)}

clf = Pipeline([('vect', CountVectorizer(analyzer='char', lowercase=True, ngram_range=(1, 9))),
                ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=False)),
                ('clf', MultinomialNB(alpha=0.01, fit_prior=True))
])

#gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)

print "Time to fit the model (%d samples)" % (train.shape[0])
%time clf.fit(train.epitope_sequence, train.disease_id)
print "\nTime to predict the values of test set (%d samples)" % (test.shape[0])
%time predicted = clf.predict(test.epitope_sequence)
print
print
print classification_report(test.disease_id, predicted, target_names=disease_groups)

cm = confusion_matrix(test.disease_id, predicted)
print "\n\nConfusion Matrix: rows = predictions, columns = actual\n"
row_format ="{:>15}" * (len(disease_groups)+1)
print row_format.format("", *disease_groups)
for disease, row in zip(disease_groups, cm):
    print row_format.format(disease, *row)

Time to fit the model (11127 samples)
CPU times: user 3.02 s, sys: 181 ms, total: 3.2 s
Wall time: 3.15 s

Time to predict the values of test set (3709 samples)
CPU times: user 513 ms, sys: 98.9 ms, total: 612 ms
Wall time: 546 ms


             precision    recall  f1-score   support

  bacterial       0.80      0.81      0.81       857
  parasitic       0.96      0.86      0.91       877
      viral       0.89      0.93      0.91      1975

avg / total       0.89      0.89      0.89      3709



Confusion Matrix: rows = predictions, columns = actual

                     bacterial      parasitic          viral
      bacterial            696             15            146
      parasitic             48            758             71
          viral            121             17           1837
