In [318]:
import csv
import os

import numpy as np
import pandas as pd
import pyprind
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18 kept GridSearchCV in grid_search
    from sklearn.grid_search import GridSearchCV

In [319]:
np.random.seed(0)

In [320]:
df = pd.read_csv('/Users/ramo/Dropbox/Stats/multiple-mets/data/training/BWH_training7.3.16/2cat-Table 1.csv')
df.head(2)

Unnamed: 0,DeIDed Number,Report,Number of Lesions,Group
0,1,Type: MR Brain w AND w/oCont Date/Time: 07...,1,1
1,2,Type: MR Brain w AND w/oCont Date/Time: 06...,1,1


In [321]:
# Select only the relevant columns
df=df[['Report', 'Group']]

In [322]:
# Reassign groups. 0: single met, 1: multiple mets
df['Group']=df['Group'].replace(1, 0)
df['Group']=df['Group'].replace(2,1)

In [323]:
df.head()

Unnamed: 0,Report,Group
0,Type: MR Brain w AND w/oCont Date/Time: 07...,0
1,Type: MR Brain w AND w/oCont Date/Time: 06...,0
2,Type: MR Brain w AND w/oCont Date/Time: 06...,0
3,Type: MR Brain w/Contrast Date/Time: 06/1...,0
4,Type: MR Brain w AND w/oCont Date/Time: 06...,0


## Cleaning text data

In [324]:
import re

In [325]:
# Remove punctuation, the header/footer, and the attending attestation
# TODO: keep decimals
# If a report matches none of the known syntax variations, mark it as an error so it can be dropped
def preprocessor(text):
    """Extract the lower-cased, punctuation-free body of a radiology report.

    The report is lower-cased and every run of non-word characters is
    collapsed to a single space. The body is then located by trying, in
    order, the section markers below; the first one that matches wins.
    The attending attestation sentence is removed from the extracted body.

    Parameters
    ----------
    text : str
        Raw report text.

    Returns
    -------
    str
        The cleaned report body, or the sentinel ``'ERROR'`` when ``text``
        is not a string (e.g. a missing value) or none of the known
        section markers is found.
    """
    # Missing cells arrive as NaN (a float); flag them instead of crashing
    # on ``.lower()`` so they are dropped together with unparsable reports.
    if not isinstance(text, str):
        return 'ERROR'

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    normalized = re.sub(r'[\W]+', ' ', text.lower())

    # Tried in order; each captures everything between a start marker and
    # the last occurrence of an end marker (the capture is greedy).
    body_patterns = (
        r'findings (.*) (?=this report was)',
        r'findings (.*) (?=radiologists signatures)',
        r'comparison (.*) (?=radiologists signatures)',
    )
    attestation = ('i the teaching physician have reviewed the images '
                   'and agree with the report as written')

    for pattern in body_patterns:
        matched_text = re.search(pattern, normalized)
        if matched_text is not None:
            return matched_text.group(1).replace(attestation, '')
    return 'ERROR'

In [326]:
df['clean_report'] = df['Report'].apply(preprocessor)

In [327]:
original_rows = df.shape[0]

In [328]:
errors = df.query("clean_report == ['ERROR']")

In [329]:
print('Number of errors found: '+str(errors.shape[0])+'. Rows with errors will be dropped')

Number of errors found: 3. Rows with errors will be dropped


In [330]:
df = df[df['clean_report'] != 'ERROR']

In [331]:
final_rows = df.shape[0]

In [332]:
print('Original rows: '+str(original_rows)+'\nFinal rows: '+str(final_rows))

Original rows: 214
Final rows: 211


In [333]:
df.to_csv('train_dataOA7.19.16.csv', columns=['clean_report', 'Group'])

In [334]:
del df['Report']

## Tokenizing documents, stemming, stop words

In [335]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [336]:
from nltk.stem.porter import PorterStemmer

In [337]:
porter = PorterStemmer()

In [338]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token.

    Stemming is delegated to the module-level ``porter`` stemmer.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [339]:
# TODO: Try porter2 and lancaster stemming

In [340]:
from nltk.corpus import stopwords

In [341]:
stop = stopwords.words('english')

In [342]:
stop_modified = [w for w in stop if w not in ['no', 'not']]

## Training a logistic regression model

In [343]:
# # Old manual split, non-random. Replace w/ random sampling from pandas method
# x_train = df.loc[:170, 'clean_report'].values
# y_train = df.loc[:170, 'Group'].values
# x_test = df.loc[170:, 'clean_report'].values
# y_test = df.loc[170:, 'Group'].values

In [344]:
train_df = df.sample(frac=0.8, random_state = 0)
test_df = df.drop(train_df.index)

In [345]:
x_train = train_df.loc[:, 'clean_report'].values
y_train = train_df.loc[:, 'Group'].values
x_test = test_df.loc[:, 'clean_report'].values
y_test = test_df.loc[:, 'Group'].values

In [346]:
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)

In [347]:
param_grid = [{'vect__ngram_range': [(1,1), (1,2)],
              'vect__stop_words': [stop, stop_modified, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1,1), (1,2)],
              'vect__stop_words': [stop, stop_modified, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
              'vect__use_idf': [False],
              'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
             ]

In [348]:
lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', LogisticRegression(random_state=0))])

In [None]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring='accuracy',
                          cv=5, verbose=1,
                          n_jobs=-1)

In [None]:
gs_lr_tfidf.fit(x_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   12.5s


In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

In [None]:
print('CV Accuracy: %.3f'% gs_lr_tfidf.best_score_)

In [None]:
clf = gs_lr_tfidf.best_estimator_

In [None]:
print('Test Accuracy: %.3f' % clf.score(x_test, y_test))

### Classification Benchmarks

In [None]:
import pylab as pl
from sklearn.metrics import confusion_matrix, classification_report
from time import time
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def benchmark(clf, name, x_test=x_test, y_test=y_test):
    """Print timing, a classification report and a confusion matrix for ``clf``.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict``.
    name : str
        Human-readable label printed alongside the report.
    x_test, y_test : array-like, optional
        Evaluation samples and their true labels. Both defaults are bound
        to the notebook-level ``x_test`` / ``y_test`` at the moment this
        cell is run, so re-run the cell after changing the split. Pass
        both together when evaluating on another set, so predictions are
        never scored against labels from a different split.

    Returns
    -------
    None
        Results are printed only.
    """
    print('Predicting the outcomes of the testing set')
    t0 = time()
    pred = clf.predict(x_test)
    print('Done in %fs' % (time() - t0))

    # ``name`` used to be accepted but never shown.
    print('Classification report on test set for classifier: %s' % name)
    print(clf)
    print()
    print(classification_report(y_test, pred))

    cm = confusion_matrix(y_test, pred)
    print("Confusion Matrix:")
    print(cm)

In [None]:
benchmark(clf, 'Log Regression CLF')

In [None]:
def plot_confusion_matrix(clf, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw the confusion matrix of ``clf`` on the notebook-level test split.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict``.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``imshow``.

    Notes
    -----
    Reads the notebook-level ``x_test`` and ``y_test``. Draws on the
    current matplotlib figure; nothing is returned.
    """
    pred = clf.predict(x_test)

    # One shared, sorted label set for both axes. Taking the x labels from
    # ``pred`` and the y labels from ``y_test`` separately mislabels (or
    # breaks) the ticks whenever a class is absent from one of them.
    labels = np.union1d(y_test, pred)
    cm = confusion_matrix(y_test, pred, labels=labels)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plt.figure()

In [None]:
plot_confusion_matrix(clf)

## Train using automatic model/hyperparameters selection

In [39]:
#Models
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import linear_model, svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

from numpy.random import RandomState

In [314]:
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
import sys
def concatenate(d1, d2):
    """Return a copy of ``d1`` updated with ``d2``; neither input is modified.

    Keys present in both take their value from ``d2``.
    """
    merged = d1.copy()
    merged.update(d2)
    return merged
seed = 0

In [41]:
sys.path.append('/Users/oarnaout/Dropbox/Stats/multiple-mets/')
import sklearnextensions as sklx
import printers

In [42]:
# classifiers and parameters to consider
# Vectorizer grid shared by every classifier below except naive Bayes.
# NOTE(review): 'vect__binary' is searched here, which presumably overrides
# the `binary=` flag given to CountVectorizer in the search loop - confirm.
feature_parameters  = {
                'vect__binary':(False, True),
               'vect__ngram_range': ((1,1),(1,2),(1,3)),
               'vect__analyzer' : ('word', 'char_wb')}

# NOTE(review): not referenced anywhere in the visible notebook; the
# 'naive_bayes' entry below spells out an identical dict inline.
nb_feature_parameters  = {'vect__ngram_range': ((1,1),(1,2),(1,3)),
               'vect__analyzer' : ('word', 'char_wb')}
# Readability flags for the tuples below ("spare" is a typo for "sparse").
use_spare_array = True
use_binary_features = True
# Maps a display name to a 4-tuple, unpacked by the grid-search loop as:
#   [0] unfitted estimator
#   [1] True  -> feed the vectorizer's sparse output straight to the estimator
#       False -> insert sklx.SparseToArray() between vectorizer and estimator
#   [2] value passed as CountVectorizer(binary=...)
#   [3] parameter grid for the 'vect' and 'clf' pipeline steps
# The `1/x` comprehensions yield C = 100, 10, ~3.33, 1, ~0.33, 0.1.
classifiers = ({
    'logistic_regression':(linear_model.LogisticRegression(),
                           use_spare_array,
                           not use_binary_features,
                           concatenate(feature_parameters, {'clf__C': [1/x for x in [0.01, 0.1, 0.3, 1.0, 3.0, 10.0]]})),
    'svm_linear':(svm.LinearSVC(tol=1e-6),
                  use_spare_array,
                  not use_binary_features,
                  concatenate(feature_parameters, {'clf__C': [1/x for x in [0.01, 0.1, 0.3, 1.0, 3.0, 10.0]]})),
    'svm_gaussian':(svm.SVC(tol=1e-6, kernel='rbf'),
                    use_spare_array,
                    not use_binary_features,
                    concatenate(feature_parameters, {'clf__gamma': [.01, .03, 0.1],
                                             'clf__C': [1/x for x in [0.01, 0.1, 0.3, 1.0, 3.0, 10.0]]})),
    # The two tree-based models are the only entries that request dense input.
    'decision_tree':(tree.DecisionTreeClassifier(criterion='entropy', random_state=RandomState(seed)),
                     not use_spare_array,
                     not use_binary_features,
                     concatenate(feature_parameters,{'clf__max_depth': [2, 3, 4, 5, 6, 7 , 8, 9, 10, 15, 20]})),
    'random_forest':(RandomForestClassifier(criterion='entropy', random_state=RandomState(seed)),
                     not use_spare_array,
                     not use_binary_features,
                     concatenate(feature_parameters,{'clf__max_depth': [2, 3, 4, 5],
                                                     'clf__n_estimators': [5, 25, 50, 100, 150, 200]})),
    # Only entry that asks for binary features; its grid has no 'vect__binary'
    # and no 'clf__' parameters.
    'naive_bayes':(BernoulliNB(alpha=1.0, binarize=None, fit_prior=True, class_prior=None),
                   use_spare_array,
                   use_binary_features,
                   {'vect__ngram_range':((1,1),(1,2),(1,3)),
                    'vect__analyzer':('word', 'char_wb')})
})

In [43]:
out_file = 'text.txt'

In [315]:
# Grid-search every candidate in `classifiers` and keep the pipeline with
# the highest cross-validated score.
best_accuracy = 0   # best cross-validated score seen so far
final_clf = None    # fitted pipeline that achieved `best_accuracy`
for key, value in classifiers.items():
    clf = value[0] #classifier
    usa = value[1] #use sparse array
    ubf = value[2] #use binary (for NB)
    parameters = value[3]
    vectorizer = CountVectorizer(input='content', decode_error='ignore', preprocessor=None, binary=ubf)
    # Estimators flagged as not accepting sparse input get a densifying
    # step between the vectorizer and the classifier.
    steps = [('vect', vectorizer)]
    if not usa:
        steps.append(('sa', sklx.SparseToArray()))
    steps.append(('clf', clf))
    pipeline = Pipeline(steps=steps)
    # NOTE(review): sklx / printers are project helpers not shown here;
    # `gs` is assumed to behave like a fitted GridSearchCV - confirm.
    gs = sklx.grid_analysis(pipeline, parameters, x_train, y_train)
    printers.print_grid_search_results(gs,key,out_file, x_test, y_test)
    # Compare like with like: the running best must stay a CV score.
    # Overwriting it with the test-set accuracy (as before) compared CV
    # scores against a test score and let the test set steer selection.
    if gs.best_score_>best_accuracy:
        final_clf = gs.best_estimator_
        best_accuracy = gs.best_score_

Performing grid search...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [None]:
best_accuracy

# Web App

### Out of core learning (for partial fit hashing vect)

In [None]:
import numpy as np
import re
from nltk.stem.porter import PorterStemmer

In [None]:
def tokenizer_porter(text):
    """Normalise ``text`` and return its Porter-stemmed word tokens.

    The text is lower-cased, every run of non-word characters is replaced
    by a single space, and the result is split on whitespace before
    stemming. Stemming uses the module-level ``porter`` stemmer, which
    must already exist when this function is called.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    text = re.sub(r'[\W]+', ' ', text.lower())
    # Split into words first; iterating the string itself would stem one
    # character at a time.
    tokenized = [porter.stem(word) for word in text.split()]
    return tokenized

In [None]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a training CSV.

    Parameters
    ----------
    path : str
        CSV file with a header row whose last two columns are the cleaned
        report text and the integer class label.

    Yields
    ------
    tuple of (str, int)
        Report text and its label, one pair per data row.
    """
    # newline='' is what the csv module expects for files it parses.
    with open(path, 'r', newline='') as handle:
        reader = csv.reader(handle)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration inside the generator.
        next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue  # blank or malformed line
            # Take the columns from the right so the width of the leading
            # index column (1 digit, 2 digits, absent) does not matter.
            yield row[-2], int(row[-1])

In [None]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label)
        Consumed with ``next``; advanced by at most ``size`` items.
    size : int
        Maximum number of documents to return.

    Returns
    -------
    (list, list) or (None, None)
        Texts and labels of the batch. When the stream runs out mid-batch,
        the documents already collected are returned as a shorter batch
        rather than discarded. ``(None, None)`` is returned only when the
        stream was already exhausted and nothing was collected.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    if exhausted and not docs:
        return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

In [None]:
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer_porter)

In [None]:
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

In [None]:
doc_stream = stream_docs(path='./train_dataOA7.19.16.csv')

In [None]:
import pyprind

In [None]:
pbar = pyprind.ProgBar(1)
classes = np.array([0,1])

In [None]:
for _ in range (1):
    x_train, y_train = get_minibatch(doc_stream, size=170)
    if not x_train:
        break
    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes=classes)
    pbar.update()

In [None]:
len(y_train)

In [None]:
x_test, y_test = get_minibatch(doc_stream, size=40)

In [None]:
len(y_test)

In [None]:
x_test = vect.transform(x_test)

In [None]:
print('Accuracy: %.3f' % clf.score(x_test, y_test))

In [None]:
predicted = clf.predict(x_test)

In [None]:
cm = confusion_matrix(y_test, predicted)
print(cm)

In [None]:
clf = clf.partial_fit(x_test, y_test)

## Serializing fitted scikit-learn estimators

In [None]:
import pickle
import os

In [None]:
dest = os.path.join('reportclassifier', 'pkl_objects')

In [None]:
if not os.path.exists(dest):
    os.makedirs(dest)

In [None]:
# Write the stop-word list; the context manager closes (and flushes) the
# file, which a bare open() passed to pickle.dump never did explicitly.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stopwords_file:
    pickle.dump(stop, stopwords_file, protocol=2)

In [None]:
# Write the fitted classifier; the context manager closes (and flushes)
# the file, which a bare open() passed to pickle.dump never did explicitly.
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as classifier_file:
    pickle.dump(clf, classifier_file, protocol=2)

## Tests (ignore