In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
#from sklearn.model_selection import train_test_split
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from numpy.random import RandomState

#Models
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import linear_model, svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

In [2]:
path = 'data'
dataset = load_files(path)

In [3]:
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                         dataset.target,
                                                         test_size=0.20)

In [4]:
# Build a vectorizer/classifier pipeline

In [5]:
text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB())
    ])

In [6]:
# Build a grid search to find best parameters

In [7]:
parameters = {'vect__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
             'tfidf__use_idf': (True, False),
             'clf__alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [8]:
_ = gs_clf.fit(docs_train, y_train)

In [9]:
# Get the best parameters

In [10]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key = lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: False
vect__ngram_range: (1, 2)


In [11]:
# Generate predictions

In [12]:
y_predicted = gs_clf.predict(docs_test)

In [13]:
# Model Metrics

In [14]:
print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

             precision    recall  f1-score   support

          1       1.00      0.38      0.55        21
          2       0.63      1.00      0.77        22

avg / total       0.81      0.70      0.66        43



In [15]:
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

[[ 8 13]
 [ 0 22]]


## Use advanced options (Masino et al.) 

In [16]:
# Static parameters and setup
import sys
seed = 5824565
def concatenate(d1,d2):
    d = d1.copy()
    d.update(d2)
    return d

In [17]:
sys.path.append('/Users/oarnaout/Dropbox/Stats/multiple-mets/')
import sklearnextensions as sklx
import printers

In [18]:
from nltk.corpus import stopwords
#custom preprocessor to keep some stop words
english_stopwords = filter(lambda w: w not in ['no', 'not', 'under'],
                           stopwords.words('english'))

In [19]:
import re
def text_preprocessor(report_text):
    # 1. Remove all punctuation, whitespaces
    letters_only = re.sub('[^a-zA-Z0-9]', ' ', report_text) 
    
    # 2. All lower caps and split to words
    words = letters_only.lower().split()
    
    # 3. Convert stopwords list to set for efficiency
    stops = set(english_stopwords)
    
    # 4. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    
    # 5. Join words back together
    return(' '.join(meaningful_words))

In [20]:
# classifiers and parameters to consider
feature_parameters  = {
                'vect__binary':(False, True),
               'vect__ngram_range': ((1,1),(1,2),(1,3)),
               'vect__analyzer' : ('word', 'char_wb')}
nb_feature_parameters  = {'vect__ngram_range': ((1,1),(1,2),(1,3)),
               'vect__analyzer' : ('word', 'char_wb')}
use_spare_array = True
use_binary_features = True
classifiers = ({
    'logistic_regression':(linear_model.LogisticRegression(),
                           use_spare_array,
                           not use_binary_features,
                           concatenate(feature_parameters, {'clf__C': [1/x for x in [0.01, 0.1, 0.3, 1.0, 3.0, 10.0]]})),
    'svm_linear':(svm.LinearSVC(tol=1e-6),
                  use_spare_array,
                  not use_binary_features,
                  concatenate(feature_parameters, {'clf__C': [1/x for x in [0.01, 0.1, 0.3, 1.0, 3.0, 10.0]]})),
    'svm_gaussian':(svm.SVC(tol=1e-6, kernel='rbf'),
                    use_spare_array,
                    not use_binary_features,
                    concatenate(feature_parameters, {'clf__gamma': [.01, .03, 0.1],
                                             'clf__C': [1/x for x in [0.01, 0.1, 0.3, 1.0, 3.0, 10.0]]})),
    'decision_tree':(tree.DecisionTreeClassifier(criterion='entropy', random_state=RandomState(seed)),
                     not use_spare_array,
                     not use_binary_features,
                     concatenate(feature_parameters,{'clf__max_depth': [2, 3, 4, 5, 6, 7 , 8, 9, 10, 15, 20]})),
    'random_forest':(RandomForestClassifier(criterion='entropy', random_state=RandomState(seed)),
                     not use_spare_array,
                     not use_binary_features,
                     concatenate(feature_parameters,{'clf__max_depth': [2, 3, 4, 5],
                                                     'clf__n_estimators': [5, 25, 50, 100, 150, 200]})),
    'naive_bayes':(BernoulliNB(alpha=1.0, binarize=None, fit_prior=True, class_prior=None),
                   use_spare_array,
                   use_binary_features,
                   {'vect__ngram_range':((1,1),(1,2),(1,3)),
                    'vect__analyzer':('word', 'char_wb')})
})

In [21]:
out_file = 'text.txt'

In [22]:
for key, value in classifiers.items():
    clf = value[0] #classifier
    usa = value[1] #use sparse array
    ubf = value[2] #use binary (for NB)
    parameters = value[3]
    vectorizer = CountVectorizer(input='content', decode_error='ignore', preprocessor=text_preprocessor, binary=ubf)
    pipeline = (Pipeline(steps=[('vect', vectorizer),('clf',clf)]) if usa
                    else Pipeline(steps=[('vect', vectorizer),('sa',sklx.SparseToArray()),('clf',clf)]))
    gs = sklx.grid_analysis(pipeline, parameters, docs_train, y_train)
    printers.print_grid_search_results(gs,key,out_file, docs_test, y_test)

Performing grid search...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   10.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   53.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done 1080 out of 1080 | elapsed:  4.9min finished


Grid search complete in 13.912677 sec

------------------------------ svm_gaussian Grid Search Results ------------------------------
Best score: 0.778
Best parameter set:
	clf: SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1e-06, verbose=False)
	clf__C: 100.0
	clf__cache_size: 200
	clf__class_weight: None
	clf__coef0: 0.0
	clf__decision_function_shape: None
	clf__degree: 3
	clf__gamma: 0.01
	clf__kernel: 'rbf'
	clf__max_iter: -1
	clf__probability: False
	clf__random_state: None
	clf__shrinking: True
	clf__tol: 1e-06
	clf__verbose: False
	steps: [('vect', CountVectorizer(analyzer='word', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function text_preprocessor

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   12.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   53.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  2.0min
[Parallel(n_jobs=2)]: Done 660 out of 660 | elapsed:  3.0min finished


Grid search complete in 8.551398 sec

------------------------------ decision_tree Grid Search Results ------------------------------
Best score: 0.725
Best parameter set:
	clf: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False,
            random_state=<mtrand.RandomState object at 0x10d634a90>,
            splitter='best')
	clf__class_weight: None
	clf__criterion: 'entropy'
	clf__max_depth: 6
	clf__max_features: None
	clf__max_leaf_nodes: None
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2
	clf__min_weight_fraction_leaf: 0.0
	clf__presort: False
	clf__random_state: <mtrand.RandomState object at 0x10d634a90>
	clf__splitter: 'best'
	sa: <sklearnextensions.SparseToArray instance at 0x10d6b10e0>
	steps: [('vect', CountVectorizer(analyzer='word', binary=True, decode_error='ignore',
        dtype=<ty

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   50.1s
[Parallel(n_jobs=2)]: Done 360 out of 360 | elapsed:  1.6min finished


Grid search complete in 5.537658 sec

------------------------------ logistic_regression Grid Search Results ------------------------------
Best score: 0.830
Best parameter set:
	clf: LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
	clf__C: 10.0
	clf__class_weight: None
	clf__dual: False
	clf__fit_intercept: True
	clf__intercept_scaling: 1
	clf__max_iter: 100
	clf__multi_class: 'ovr'
	clf__n_jobs: 1
	clf__penalty: 'l2'
	clf__random_state: None
	clf__solver: 'liblinear'
	clf__tol: 0.0001
	clf__verbose: 0
	clf__warm_start: False
	steps: [('vect', CountVectorizer(analyzer='char_wb', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1

[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   17.0s
[Parallel(n_jobs=2)]: Done 218 tasks      | elapsed:   53.9s
[Parallel(n_jobs=2)]: Done 360 out of 360 | elapsed:  1.6min finished


Grid search complete in 5.205982 sec

------------------------------ svm_linear Grid Search Results ------------------------------
Best score: 0.819
Best parameter set:
	clf: LinearSVC(C=100.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=1e-06,
     verbose=0)
	clf__C: 100.0
	clf__class_weight: None
	clf__dual: True
	clf__fit_intercept: True
	clf__intercept_scaling: 1
	clf__loss: 'squared_hinge'
	clf__max_iter: 1000
	clf__multi_class: 'ovr'
	clf__penalty: 'l2'
	clf__random_state: None
	clf__tol: 1e-06
	clf__verbose: 0
	steps: [('vect', CountVectorizer(analyzer='char_wb', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3),
        preprocessor=<function text_preprocessor at 0x10d609320>,
        stop_words=None, 

[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:    7.7s finished


Grid search complete in 0.455788 sec

------------------------------ naive_bayes Grid Search Results ------------------------------
Best score: 0.719
Best parameter set:
	clf: BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
	clf__alpha: 1.0
	clf__binarize: None
	clf__class_prior: None
	clf__fit_prior: True
	steps: [('vect', CountVectorizer(analyzer='word', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function text_preprocessor at 0x10d609320>,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None,
        vocabulary=None)), ('clf', BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True))]
	vect: CountVectorizer(analyzer='word', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.7s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   56.8s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  4.6min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done 1440 out of 1440 | elapsed:  8.9min finished


Grid search complete in 21.049066 sec

------------------------------ random_forest Grid Search Results ------------------------------
Best score: 0.766
Best parameter set:
	clf: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False,
            random_state=<mtrand.RandomState object at 0x10d5c0fd0>,
            verbose=0, warm_start=False)
	clf__bootstrap: True
	clf__class_weight: None
	clf__criterion: 'entropy'
	clf__max_depth: 4
	clf__max_features: 'auto'
	clf__max_leaf_nodes: None
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2
	clf__min_weight_fraction_leaf: 0.0
	clf__n_estimators: 50
	clf__n_jobs: 1
	clf__oob_score: False
	clf__random_state: <mtrand.RandomState object at 0x10d5c0fd0>
	clf__verbose: 0
	clf__warm_start: False
	sa: <sklear

### Extract the best parameters for the winning model

In [24]:
best_parameters = gs.best_estimator_.get_params()

In [48]:
best_parameters

{'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=4, max_features='auto', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
             oob_score=False,
             random_state=<mtrand.RandomState object at 0x10d5c0fd0>,
             verbose=0, warm_start=False),
 'clf__bootstrap': True,
 'clf__class_weight': None,
 'clf__criterion': 'entropy',
 'clf__max_depth': 4,
 'clf__max_features': 'auto',
 'clf__max_leaf_nodes': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__n_estimators': 50,
 'clf__n_jobs': 1,
 'clf__oob_score': False,
 'clf__random_state': <mtrand.RandomState at 0x10d5c0fd0>,
 'clf__verbose': 0,
 'clf__warm_start': False,
 'sa': <sklearnextensions.SparseToArray instance at 0x10beb6998>,
 'steps': [('vect',
   CountVectorizer(analyzer='char_wb', binary=False,

In [26]:
parameters = concatenate(feature_parameters,{'clf__max_depth': [2, 3, 4, 5],
                                                     'clf__n_estimators': [5, 25, 50, 100, 150, 200]})

In [29]:
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__max_depth: 4
	clf__n_estimators: 50
	vect__analyzer: 'char_wb'
	vect__binary: False
	vect__ngram_range: (1, 3)


### Run predict on runtime data

In [67]:
import pandas as pd

In [70]:
rundata = pd.read_excel('Jun21_2016_OneSurgeryPreOpMRI.xlsx')

In [73]:
rundata.shape[0]

1004

In [78]:
rundata['Predicted']=gs.predict(rundata['Report'])

In [83]:
rundata['Predicted']=rundata['Predicted'].replace(0, 'single').replace(1, 'multiple')

In [85]:
rundata.to_excel('runtime_data_catagorized_OA7.11.16.xlsx')