In [2]:
import numpy
from time import time
from sklearn.model_selection import GridSearchCV

from newsgac import config
from newsgac.genres import genre_codes
from newsgac.learners import learners, LearnerSVC, LearnerNB, LearnerXGB, LearnerGB, LearnerMLP, LearnerRF, LearnerLGBM
from newsgac.pipelines.get_sk_pipeline import get_sk_pipeline
from newsgac.pipelines.utils import report


from newsgac import database
from newsgac.data_sources import DataSource
from newsgac.pipelines import Pipeline

import nltk
# Fetch the NLTK 'punkt' tokenizer package into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/newsgac/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Show the configured number of parallel jobs; run_grid_search passes this
# value to GridSearchCV as n_jobs.
config.n_parallel_jobs

8

In [2]:
# List the display titles of all stored data sources.
[d.display_title for d in DataSource.objects.all()]

[u'NGBS Training', u'NBGS test']

In [3]:
# Take the first stored pipeline and summarise its configuration.
# `p` is reused by the cells further down.
p = Pipeline.objects.first()
print('DataSource: ' + p.data_source.display_title)
print('NLP Tool: ' + p.nlp_tool.name)
print('Classifier: ' + p.learner.name)
print('Task status: ' + str(p.task.status))

DataSource: NGBS Training
NLP Tool: Frog
Classifier: Random Forest
Task status: Status.SUCCESS


In [4]:
# Number of articles in the pipeline's data source; run_grid_search fits on
# the raw_text / label of each of these articles.
len(p.data_source.articles)

813

In [5]:
# Hyperparameter search space, keyed by learner class.
# run_grid_search only searches learners that appear as keys here, and it
# prefixes every parameter name with 'Classifier__' before handing the grid
# to GridSearchCV.
# Only 'criterion' is active right now; the commented-out entries are further
# candidate ranges that are currently disabled.
param_space = {
    LearnerRF: {
        'criterion': ['gini', 'entropy'],
#         'n_estimators': [10, 30, 50, 100, 120, 160],
#         'max_features': [None, 2, 4, 6, 8, 12, 24],
#         'max_depth': [None, 2, 4, 6, 8, 10, 12],
#         'min_samples_split': [2, 4, 8, 16, 32],
#         'min_samples_leaf': [1, 3, 5, 7, 9],          # tune
#         'max_leaf_nodes': [None, 4, 6, 8, 16, 32, 64],
#         'class_weight': [None, 'balanced'],
    }
}

In [6]:
def run_grid_search(pipeline, scoring='f1_micro', cv=5):
    """Grid-search classifier hyperparameters for ``pipeline`` and store the result.

    For every learner in ``learners`` that has an entry in ``param_space``, one
    grid is built: the learner's parameter names are prefixed with
    ``'Classifier__'`` and a freshly created classifier instance is added under
    the ``'Classifier'`` key. The grids are searched with ``GridSearchCV`` on
    the scikit-learn pipeline returned by ``pipeline.get_sk_pipeline()``, using
    the ``raw_text`` and ``label`` of the articles in ``pipeline.data_source``.

    Side effects: ``pipeline.grid_search_result`` is set to a dict with the
    keys ``'full'`` (``search.cv_results_``) and ``'best'``
    (``search.best_params_``), and progress/results are printed.

    Args:
        pipeline: object providing ``data_source.articles``,
            ``get_sk_pipeline()`` and a writable ``grid_search_result``.
        scoring: scorer name passed to ``GridSearchCV``. Defaults to
            ``'f1_micro'``, the metric this function always used; the other
            names previously listed here were ``'accuracy'``,
            ``'recall_micro'`` and ``'precision_micro'``.
        cv: number of cross-validation folds passed to ``GridSearchCV``.

    Returns:
        None.

    Raises:
        ValueError: if no learner in ``learners`` has an entry in
            ``param_space``, so there is nothing to search.
    """
    # Walk the article collection once and derive both arrays from the same
    # snapshot instead of iterating the data source twice.
    articles = list(pipeline.data_source.articles)
    texts = numpy.array([article.raw_text for article in articles])
    labels = numpy.array([article.label for article in articles])

    # Clear any earlier result up front; it is only filled in again after a
    # successful fit below.
    pipeline.grid_search_result = {}

    param_grid = []
    for learner in learners:
        if learner not in param_space:
            continue
        # 'Classifier__<name>' addresses parameter <name> of the pipeline step
        # called 'Classifier'.
        grid = {
            'Classifier__%s' % name: values
            for name, values in param_space[learner].items()
        }
        grid['Classifier'] = [learner.create().get_classifier()]
        param_grid.append(grid)

    if not param_grid:
        # Fail early with a clear message rather than inside GridSearchCV.
        raise ValueError('param_space defines no search space for any known learner')

    print(param_grid)
    skp = pipeline.get_sk_pipeline()
    # NOTE: `iid` only exists in older scikit-learn releases (deprecated in
    # 0.22, removed in 0.24); drop the argument when upgrading.
    search = GridSearchCV(skp, param_grid, iid=False, cv=cv, return_train_score=False,
                          n_jobs=config.n_parallel_jobs,
                          scoring=scoring)
    start = time()
    search.fit(texts, labels)
    # Read the clock right after fitting so the reported duration does not
    # include the time spent printing the (large) results below.
    elapsed = time() - start

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    pipeline.grid_search_result = {
        'full': search.cv_results_,
        'best': search.best_params_
    }
    print(pipeline.grid_search_result)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (elapsed, len(search.cv_results_['params'])))
    report(search.cv_results_)

In [7]:
# Run the hyperparameter search on pipeline `p`; results are printed and also
# stored on p.grid_search_result.
run_grid_search(p)

[{'Classifier': [RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=u'auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=8,
            oob_score=False, random_state=42, verbose=0, warm_start=False)], 'Classifier__criterion': ['gini', 'entropy']}]
Best parameter (CV score=0.446):
{'Classifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=u'auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=8,
            oob_score=False, random_state=42, verbose=0, warm_start=False), 'Classifier__criterion': 'gini'}
{'full': {'split4_test_score': arr