# Pipelines
Pipelines are created from the UI and can be converted to a [scikit-learn pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html). Each one represents a combination of a data source, pre-processing steps, feature extraction and classification.

In [1]:
from newsgac import database
from newsgac.pipelines.models import Pipeline
import pandas

In [2]:
# Show the fields declared on the Pipeline model (field name -> field object).
Pipeline._mongometa.fields_dict

{'_id': <pymodm.fields.ObjectIdField at 0x7f5a3d6e6ad0>,
 'created': <pymodm.fields.DateTimeField at 0x7f5a3d6e6450>,
 'data_source': <pymodm.fields.ReferenceField at 0x7f5a3d6e64d0>,
 'display_title': <pymodm.fields.CharField at 0x7f5a3d6e6410>,
 'grid_search_result': <newsgac.common.fields.ObjectField at 0x7f5a3d6e6890>,
 'learner': <pymodm.fields.EmbeddedDocumentField at 0x7f5a3d6e6710>,
 'lemmatization': <pymodm.fields.BooleanField at 0x7f5a3d6e6590>,
 'nlp_tool': <pymodm.fields.EmbeddedDocumentField at 0x7f5a3d6e6690>,
 'quote_removal': <pymodm.fields.BooleanField at 0x7f5a3d6e65d0>,
 'result': <pymodm.fields.EmbeddedDocumentField at 0x7f5a3d6e67d0>,
 'sk_pipeline': <newsgac.common.fields.ObjectField at 0x7f5a3d6e6790>,
 'sw_removal': <pymodm.fields.BooleanField at 0x7f5a3d6e6550>,
 'task': <pymodm.fields.EmbeddedDocumentField at 0x7f5a3d6e6950>,
 'task_id': <pymodm.fields.CharField at 0x7f5a3d6e6850>,
 'updated': <pymodm.fields.DateTimeField at 0x7f5a3d6e6490>,
 'user': <pymodm.f

In [3]:
# Fetch the first pipeline returned by the query and summarise its main parts.
p = Pipeline.objects.first()
# print(...) with a single parenthesised argument produces the same output on
# Python 2 and Python 3, so this cell no longer depends on the Python 2-only
# print statement.
print('DataSource: ' + p.data_source.display_title)
print('NLP Tool: ' + p.nlp_tool.name)
print('Classifier: ' + p.learner.name)
# task.status is not a string (it prints as Status.SUCCESS), hence the str().
print('Task status: ' + str(p.task.status))

DataSource: NGBS Training
NLP Tool: Frog
Classifier: Random Forest
Task status: Status.SUCCESS


## Create a scikit-learn pipeline

In [4]:
# Get the scikit-learn pipeline for `p` and display its repr.
skp = p.get_sk_pipeline()
skp

Pipeline(memory=None,
     steps=[('CleanOCR', <newsgac.nlp_tools.transformers.CleanOCR object at 0x7f5a3d6f7850>), ('FeatureExtraction', FeatureUnion(n_jobs=None,
       transformer_list=[('Basic', <newsgac.nlp_tools.transformers.ExtractBasicFeatures object at 0x7f5a3d6f7890>), ('Quote', <newsgac.nlp_tools.transformers.Extra...stimators=50, n_jobs=8,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [5]:
# List the names of the steps in the scikit-learn pipeline.
# NOTE(review): named_steps is presumably a dict-like mapping, so on Python 2
# the key order shown need not match the execution order; `skp.steps` should
# give the steps in order -- confirm.
skp.named_steps.keys()

['CleanOCR', 'FeatureExtraction', 'Classifier', 'RobustScaler']

In [31]:
# Keep a handle on the feature-extraction step (shown as a FeatureUnion in the pipeline repr).
fe = skp.named_steps['FeatureExtraction']

In [34]:
# Show the (name, transformer) pairs that the feature-extraction step combines.
fe.transformer_list

[('Basic',
  <newsgac.nlp_tools.transformers.ExtractBasicFeatures at 0x7f1b9e04e8d0>),
 ('Quote', <newsgac.nlp_tools.transformers.ExtractQuotes at 0x7f1b474a4b10>),
 ('Sentiment',
  NoneStepsPipeline(feature_names_from='SentimentFeatures', memory=None,
           steps=[('RemoveQuotes', <newsgac.nlp_tools.transformers.RemoveQuotes object at 0x7f1b474a4950>), ('SentimentFeatures', <newsgac.nlp_tools.transformers.ExtractSentimentFeatures object at 0x7f1b474a4ed0>)])),
 ('Frog', NoneStepsPipeline(feature_names_from='Frog', memory=None,
           steps=[('RemoveQuotes', <newsgac.nlp_tools.transformers.RemoveQuotes object at 0x7f1b474a4a10>), ('Frog', <newsgac.nlp_tools.models.frog.FrogFeatureExtractor object at 0x7f1b474a40d0>)]))]

In [35]:
# Look at the first (name, transformer) pair: the 'Basic' feature extractor.
fe.transformer_list[0]

('Basic',
 <newsgac.nlp_tools.transformers.ExtractBasicFeatures at 0x7f1b9e04e8d0>)

In [41]:
# TODO: remove this cell; the 'punkt' download has been moved to the Dockerfile.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/newsgac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
# Run the 'Basic' extractor on a single sample sentence and pair every feature
# name with the value computed for that sentence.
# The extractor is looked up by its name rather than by position, so the cell
# does not depend on where 'Basic' sits in transformer_list.
basic_feature_extractor = dict(fe.transformer_list)['Basic']
# transform() takes a list of texts; [0] selects the row for our only sample.
features = basic_feature_extractor.transform(['Dit is een test tekst!'])[0]
feature_names = basic_feature_extractor.get_feature_names()
# list(...) is a no-op on Python 2 (zip already returns a list) and keeps the
# displayed result a concrete list on Python 3, where zip returns an iterator.
list(zip(feature_names, features))

[('question_marks_perc', 0.0),
 ('exclamation_marks_perc', 0.16666666666666666),
 ('currency_symbols_perc', 0.0),
 ('digits_perc', 0.0),
 ('sentences', 1.0),
 ('avg_sentence_length', 6.0)]