### preprocess_select_features

In [None]:
# Project modules required by the preprocessing / feature-selection cell.
# Each entry is listed once (a duplicated 'base.pipeline.data_splitter'
# entry was dropped).
to_import = [
    'utils',
    'config_loader',
    'base.pipeline.csv_loader',
    'base.preprocessor.count_vectorizer',
    'base.preprocessor.sklearn_preprocessor',
    'base.pipeline.data_splitter',
    'base.pipeline.mlp_term_selector',
    'base.pipeline.csv_saver'
]

# NOTE(review): the names used below (utils, config_loader, csv_loader, ...)
# are never imported with a plain `import` statement, so import_modules
# presumably binds them into this module -- confirm against importer.
import importer
importer.import_modules(__name__, __file__, to_import)

import logging
import os
import json

logger = logging.getLogger('pipeline')
Config = config_loader.ConfigLoader(
    'config.json'
)

# Label mapping is read from a UTF-8 encoded JSON file in the working directory.
with open("label_mapping.json", encoding='utf-8') as f:
    label_mapping = json.load(f)

data_columns = ['TARGET','TEXT']
term_selector_limit = 4000

# Run-specific values are attached to the shared Config object as attributes.
Config.label_mapping = label_mapping
Config.data_columns = data_columns
Config.term_selector_limit = term_selector_limit

################################################################################

# Pipeline steps, each built from its keyword-argument section in Config.
# Steps suffixed 1 are used by the first pass of run(), steps suffixed 2 by
# the second pass.
data_loader1 = csv_loader.CSVLoader(**Config["csv_loader_params"])
cv1 = count_vectorizer.CV(step_name = 'cv1', **Config["cv_params"])
tfidf1 = sklearn_preprocessor.SklearnPreprocessor(step_name = 'tfidf1', **Config["tfidf_params"])
data_splitter1 = data_splitter.DataSplitter(**Config["splitter_params"])
term_selector = mlp_term_selector.MLPTermSelector(**Config["mlp_term_selector_params"])
data_loader2 = csv_loader.CSVLoader(**Config["csv_loader_params"])
cv2 = count_vectorizer.CV(step_name = 'cv2', **Config["cv_params"])
tfidf2 = sklearn_preprocessor.SklearnPreprocessor(step_name = 'tfidf2', **Config["tfidf_params"])
data_splitter2 = data_splitter.DataSplitter(**Config["splitter_params"])
# NOTE: this rebinds the name `csv_saver` from the module to the saver
# instance; the module is no longer reachable under this name afterwards.
csv_saver = csv_saver.CSVSaver(**Config["csv_saver_params"])

################################################################################

@utils.catch('PIPELINE_RUNERROR')
def run(debug = False):
    """Run the two-pass preprocessing and term-selection pipeline.

    Pass 1 loads the data, fits ``cv1`` and ``tfidf1``, splits the data and
    runs the term selector. Pass 2 starts again from an empty payload, loads
    the data, fits ``cv2`` and ``tfidf2``, splits the data and hands the
    result to the CSV saver.

    Args:
        debug: Forwarded unchanged to every pipeline step.

    Returns:
        None. The output of the last step (the CSV saver) is not returned.
    """
    # Pass 1. The term selector's return value is discarded below, so this
    # pass matters only for whatever the steps do internally.
    x = {}
    x = data_loader1(x, debug)
    x = cv1.fit_transform(x, debug)
    x = tfidf1.fit_transform(x, debug)
    x = data_splitter1(x, debug)
    x = term_selector(x, debug)

    # Pass 2. cv2 was constructed but never applied here, even though pass 1
    # runs its count vectorizer before tf-idf and the production cell calls
    # cv2.transform before tfidf2.transform. Fit it here so the second pass
    # mirrors the first.
    x = {}
    x = data_loader2(x, debug)
    x = cv2.fit_transform(x, debug)
    x = tfidf2.fit_transform(x, debug)
    x = data_splitter2(x, debug)
    csv_saver(x, debug)

In [None]:
# Execute the preprocessing pipeline defined in the previous cell with the
# default debug=False.
run()

### Train

In [None]:
# Project modules required by the training cell; handed to
# importer.import_modules below.
to_import = [
    'utils',
    'config_loader',
    'base.pipeline.csv_loader',
    'base.preprocessor.sklearn_preprocessor',
    'base.model.multi_model'
]

# NOTE(review): utils, config_loader, csv_loader and multi_model are used
# below without a plain `import` statement, so import_modules presumably
# binds them into this module -- confirm against importer.
import importer
importer.import_modules(__name__, __file__, to_import)

import logging
import os
import json

logger = logging.getLogger('pipeline')
Config = config_loader.ConfigLoader(
    'config.json'
)

# Label mapping is read from a UTF-8 encoded JSON file in the working directory.
with open("label_mapping.json", encoding='utf-8') as f:
    label_mapping = json.load(f)

model_type = 'mlp'
term_selector_limit = 4000

# Run-specific values are attached to the shared Config object as attributes.
Config.label_mapping = label_mapping
# Copies the "data_columns" item of Config onto the attribute of the same name.
Config.data_columns = Config["data_columns"]
Config.term_selector_limit = term_selector_limit
Config.model_type = model_type

################################################################################

# Steps used by run(), each built from its keyword-argument section in Config.
data_loader = csv_loader.CSVLoader(**Config["csv_loader_params"])
model = multi_model.MultiModel(**Config["model_params"])

################################################################################

@utils.catch('PIPELINE_RUNERROR')
def run(debug = False):
    """Load the data and return the outcome of the model search.

    Args:
        debug: Forwarded unchanged to the data loader and to ``model.search``.

    Returns:
        Whatever ``model.search`` returns for the loaded payload.
    """
    # The loader is always started from a fresh, empty payload.
    loaded = data_loader({}, debug)
    return model.search(loaded, debug)

### Prod

In [None]:
# Project modules required by the production (prediction) cell; handed to
# importer.import_modules below.
to_import = [
    'utils',
    'config_loader',
    'text_classification.retriever',
    'base.preprocessor.count_vectorizer',
    'base.preprocessor.sklearn_preprocessor',
    'base.model.multi_model',
    'text_classification.delivery'
]

# NOTE(review): the project names used below are never imported with a plain
# `import` statement, so import_modules presumably binds them into this
# module -- confirm against importer.
import importer
importer.import_modules(__name__, __file__, to_import)

import logging
import os
import json

logger = logging.getLogger('pipeline')
Config = config_loader.ConfigLoader(
    'config.json'
)

# Label mapping is read from a UTF-8 encoded JSON file in the working directory.
with open("label_mapping.json", encoding='utf-8') as f:
    label_mapping = json.load(f)

model_type = 'mlp'
term_selector_limit = 4000

# Run-specific values are attached to the shared Config object as attributes.
Config.label_mapping = label_mapping
# Copies the "data_columns" item of Config onto the attribute of the same name.
Config.data_columns = Config["data_columns"]
Config.term_selector_limit = term_selector_limit
Config.model_type = model_type

################################################################################

# Steps used by predict(). The 'cv2' / 'tfidf2' step names match the ones
# used by the second pass of the preprocessing cell.
# NOTE: the assignments to `retriever` and `delivery` rebind those names from
# the modules to instances; the modules are no longer reachable under these
# names afterwards.
retriever = retriever.Retriever()
cv2 = count_vectorizer.CV(step_name = 'cv2', **Config["cv_params"])
tfidf2 = sklearn_preprocessor.SklearnPreprocessor(step_name = 'tfidf2', **Config["tfidf_params"])
model = multi_model.MultiModel(**Config["model_params"])
# NOTE(review): the class taken from the delivery module is also called
# `Retriever` -- possibly a copy-paste of the line above; confirm the
# intended class name in text_classification.delivery.
delivery = delivery.Retriever()

################################################################################

@utils.catch('PIPELINE_RUNERROR')
def predict(x = None, DocId = '', debug = False):
    """Run the production chain: retrieve, vectorize, predict, deliver.

    Args:
        x: Accepted for interface compatibility; not read by this function.
        DocId: Accepted for interface compatibility; not read by this function.
        debug: Forwarded to the transform and predict steps.

    Returns:
        Whatever the delivery step returns for the predicted payload.
    """
    # NOTE(review): the caller-supplied `x` and `DocId` are never used -- the
    # retriever is always invoked with a fresh empty dict. Confirm whether
    # the inputs are meant to be placed into the payload.
    retrieved = retriever({})
    after_cv = cv2.transform(retrieved, debug)
    after_tfidf = tfidf2.transform(after_cv, debug)
    predicted = model.predict(after_tfidf, debug)
    return delivery(predicted)

In [None]:
# Example invocation of predict() with an empty message and a sample
# document id passed positionally as DocId.
example_msg = ""
predict(example_msg, 'ID012345')