In [1]:
import sys
sys.path.append('..')

import gxd_htLearningLib as ht

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import mglearn

from IPython.display import display

from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics # confusion_matrix?

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

%matplotlib 
%matplotlib inline

Data directory: /Users/jak/work/GXD_ht/Data
Using matplotlib backend: MacOSX


In [2]:
# Load experiments

dataset=ht.getTrainingSet()
print dataset.filenames[:4]

['/Users/jak/work/GXD_ht/Data/yes/E-GEOD-27836'
 '/Users/jak/work/GXD_ht/Data/no/E-GEOD-28228'
 '/Users/jak/work/GXD_ht/Data/no/E-GEOD-31822'
 '/Users/jak/work/GXD_ht/Data/no/E-GEOD-44359']


In [3]:
# Get Experiment IDs from file names

exp_IDs = ht.getExpIDs(dataset.filenames)
print exp_IDs[:4]

['E-GEOD-27836', 'E-GEOD-28228', 'E-GEOD-31822', 'E-GEOD-44359']


In [4]:
# Split into training and test sets

exp_IDs_train, exp_IDs_test, \
docs_train,    docs_test, \
y_train,       y_test      = train_test_split(exp_IDs, dataset.data, dataset.target,
                                               test_size=0.25, random_state=10)

print "Total Number of Experiments: %d" % len(dataset.filenames)
print "Yes count: %d" % (dataset.target.tolist()).count(1)
print "No count:  %d" % (dataset.target.tolist()).count(0)
print
print "Number of Training Experiments: %d" % len(y_train)
print "Yes count: %d" % (y_train.tolist()).count(1)
print "No count:  %d" % (y_train.tolist()).count(0)
print 
print "Number of Test Experiments: %d" % len(y_test)
print "Yes count: %d" % (y_test.tolist()).count(1)
print "No count:  %d" % (y_test.tolist()).count(0)

Total Number of Experiments: 1661
Yes count: 563
No count:  1098

Number of Training Experiments: 1245
Yes count: 423
No count:  822

Number of Test Experiments: 416
Yes count: 140
No count:  276


# TFIDF VECTORIZER w/ beta=4

In [5]:
# Run GridSearch on various parameters

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(
                     strip_accents='unicode', decode_error='replace',
                     token_pattern=u'(?u)\\b([a-z_]\w+)\\b', stop_words="english") ),
    ('scaler'    , StandardScaler(copy=True, with_mean=False, with_std=True) ),
    ('classifier', SGDClassifier(verbose=2, eta0=1, class_weight='balanced') ),
    ])
parameters={ 'vectorizer__ngram_range':[(1,1),(1,2),(1,3)],
             'vectorizer__min_df':[2],
             'vectorizer__max_df':[.98],
             'classifier__eta0':[.01,.1],
             'classifier__alpha':[.001,.01],
             'classifier__loss':[ 'log' ], #'hinge', 'log','modified_huber'],
             'classifier__penalty':['l1'], # ,'elasticnet'
             'classifier__n_iter':[5],
             'classifier__learning_rate':['invscaling'], # 'constant'
            }
BETA=4
scorer = ht.makeFscorer(beta=BETA)  # used by GridSearchCV() to rate the pipeline options

gs = GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=-1, verbose=1)
gs.fit( docs_train, y_train )


Fitting 3 folds for each of 12 candidates, totalling 36 fits
-- Epoch 1
Norm: 0.92, NNZs: 4374, Bias: -0.000136, T: 830, Avg. loss: 0.533953
-- Epoch 1
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 0.97, NNZs: 4360, Bias: -0.000208, T: 1660, Avg. loss: 0.372158
Norm: 1.05, NNZs: 4429, Bias: -0.000138, T: 830, Avg. loss: 0.561962
Total training time: 0.03 seconds.
Total training time: 0.04 seconds.
-- Epoch 3
-- Epoch 2
Norm: 1.01, NNZs: 4327, Bias: -0.000232, T: 2490, Avg. loss: 0.302564
Norm: 1.10, NNZs: 4379, Bias: -0.000211, T: 1660, Avg. loss: 0.388392
Total training time: 0.06 seconds.
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 1.13, NNZs: 4352, Bias: -0.000238, T: 2490, Avg. loss: 0.313622
Total training time: 0.07 seconds.
-- Epoch 4
Norm: 1.16, NNZs: 4321, Bias: -0.000253, T: 3320, Avg. loss: 0.270544
-- Epoch 4
Norm: 1.04, NNZs: 4284, Bias: -0.000250, T: 3320, Avg. loss: 0.262843
-- Epoch 1
Total training time: 0.08 seconds.
Norm: 1.04, NNZs: 4305, Bias: -0.0000

-- Epoch 3
-- Epoch 4
Norm: 15.42, NNZs: 8180, Bias: -0.000764, T: 2490, Avg. loss: 1.408797
Norm: 17.43, NNZs: 8161, Bias: -0.001353, T: 3320, Avg. loss: 1.179162
Total training time: 0.05 seconds.
Total training time: 0.12 seconds.
-- Epoch 4
-- Epoch 5
Norm: 15.41, NNZs: 8083, Bias: -0.000796, T: 3320, Avg. loss: 1.086071
Norm: 17.42, NNZs: 8098, Bias: -0.001373, T: 4150, Avg. loss: 0.970160
Total training time: 0.13 seconds.
Total training time: 0.07 seconds.
-- Epoch 5
Norm: 15.41, NNZs: 7964, Bias: -0.000824, T: 4150, Avg. loss: 0.880248
Total training time: 0.09 seconds.
-- Epoch 1
Norm: 0.80, NNZs: 3101, Bias: -0.000147, T: 830, Avg. loss: 0.568583
Total training time: 0.02 seconds.
Norm: 0.87, NNZs: 2860, Bias: -0.000224, T: 1660, Avg. loss: 0.451505
-- Epoch 2
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 0.92, NNZs: 2661, Bias: -0.000271, T: 2490, Avg. loss: 0.398577
Total training time: 0.07 seconds.
-- Epoch 4
Norm: 0.96, NNZs: 2526, Bias: -0.000305, T: 3320, Avg. lo

Total training time: 0.04 seconds.
Total training time: 0.14 seconds.
-- Epoch 5
-- Epoch 3
Norm: 1.81, NNZs: 6173, Bias: -0.000344, T: 4150, Avg. loss: 0.289804
Norm: 1.54, NNZs: 6626, Bias: -0.000311, T: 2490, Avg. loss: 0.329948
Total training time: 0.16 seconds.
Total training time: 0.07 seconds.
-- Epoch 4
Norm: 1.56, NNZs: 6241, Bias: -0.000349, T: 3320, Avg. loss: 0.292691
Total training time: 0.10 seconds.
-- Epoch 5
Norm: 1.58, NNZs: 5808, Bias: -0.000379, T: 4150, Avg. loss: 0.270034
Total training time: 0.10 seconds.
-- Epoch 1
Norm: 1.64, NNZs: 9109, Bias: -0.000154, T: 830, Avg. loss: 0.674257
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 1.67, NNZs: 7344, Bias: -0.000236, T: 1660, Avg. loss: 0.432499
Total training time: 0.04 seconds.
-- Epoch 3
Norm: 1.70, NNZs: 6686, Bias: -0.000280, T: 2490, Avg. loss: 0.346279
Total training time: 0.05 seconds.
-- Epoch 4
Norm: 1.72, NNZs: 6049, Bias: -0.000310, T: 3320, Avg. loss: 0.302731
Total training time: 0.07 seconds.
-- 

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    9.5s finished


-- Epoch 1
Norm: 0.99, NNZs: 3446, Bias: -0.000124, T: 1245, Avg. loss: 0.557296
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 1.06, NNZs: 3028, Bias: -0.000211, T: 2490, Avg. loss: 0.440592
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 1.11, NNZs: 2753, Bias: -0.000264, T: 3735, Avg. loss: 0.390791
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 1.16, NNZs: 2590, Bias: -0.000298, T: 4980, Avg. loss: 0.361785
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 1.19, NNZs: 2412, Bias: -0.000330, T: 6225, Avg. loss: 0.342672
Total training time: 0.01 seconds.


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error='replace',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_id...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=2, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'classifier__n_iter': [5], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'classifier__alpha': [0.001, 0.01], 'classifier__learning_rate': ['invscaling'], 'vectorizer__min_df': [2], 'classifier__eta0': [0.01, 0.1], 'classifier__loss': ['log'], 'vectorizer__max_df': [0.98], 'classifier__penalty': ['l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(fbeta_score, beta=4, pos_label=1), verbose=1)

In [6]:
# Get best vectorizer and classifier, print best tuning parameters

best_vect = gs.best_estimator_.named_steps["vectorizer"]
best_clf  = gs.best_estimator_

print ("Best Pipeline Parameters")
for param_name in sorted(parameters.keys()):
    print("%s:%r"% (param_name, gs.best_params_[param_name]) )

Best Pipeline Parameters
classifier__alpha:0.01
classifier__eta0:0.01
classifier__learning_rate:'invscaling'
classifier__loss:'log'
classifier__n_iter:5
classifier__penalty:'l1'
vectorizer__max_df:0.98
vectorizer__min_df:2
vectorizer__ngram_range:(1, 1)


In [7]:
# Print out some features as a sanity check for the vectorizer

print ht.getVectorizerReport(best_vect, num_features=10)

Number of Features: 5725

First 10 features: [u'a1', u'a10', u'a2', u'aa', u'aa4', u'aad', u'abdominal', u'aberrant', u'aberrations', u'ability']

Middle 10 features: [u'lactogenic', u'lacz', u'lad', u'lag', u'lamin', u'lamina', u'laminin', u'landmarks', u'landscape', u'landscapes']

Last 10 features: [u'zero', u'zfp36', u'zinc', u'zipper', u'zn', u'zona', u'zone', u'zooepidemicus', u'zurich', u'zygotic']



In [8]:
# Print metrics on predictions for test and training sets

y_predicted_test  = best_clf.predict(docs_test)
y_predicted_train = best_clf.predict(docs_train)

print "For the Test Set"
print ht.getFormatedMetrics(y_test, y_predicted_test, BETA)

print "For the Training Set"
print ht.getFormatedMetrics(y_train, y_predicted_train, BETA)

For the Test Set
             precision    recall  f1-score   support

        yes       0.78      0.83      0.80       140
         no       0.91      0.88      0.90       276

avg / total       0.87      0.86      0.86       416

F4: 0.825

['yes', 'no']
[[116  24]
 [ 33 243]]


For the Training Set
             precision    recall  f1-score   support

        yes       0.96      0.99      0.98       423
         no       1.00      0.98      0.99       822

avg / total       0.98      0.98      0.98      1245

F4: 0.989

['yes', 'no']
[[419   4]
 [ 16 806]]




In [9]:
print falselyReported(y_test, y_test_predicted, file_names_test)

NameError: name 'falselyReported' is not defined

In [None]:
mglearn.tools.visualize_coefficients(
    trainedCLF.named_steps["classifier"].coef_, 
    feature_names, n_top_features=30
)
plt.title("SGDClassifier with TfidfVectorizer")