# Dummy pipeline - step 5: merge and sort elements

This step merges the results of the conversions from the octal and binary files, sorts them, and then writes them to an output file.

Input formats:

        ([index]=[value])(\s[index]=[value])*
        3=A 5=T 2=r ...
               
It also logs some metrics using **MLflow tracking**.

In [None]:
# Parameters
# The docstring below declares this step's parameters together with its DVC
# inputs (:dvc-in) and outputs (:dvc-out). Keep its syntax unchanged:
# presumably it is parsed by the pipeline tooling - TODO confirm.
"""
:param str char_from_bin: path to converted data from binary input file
:param str char_from_octal: path to converted data from octal input file
:param str result_file: path to the result output file
:param str mlflow_output: mlflow tracking output directory
:dvc-in char_from_bin:  ./dummy/data/data_conv_from_bin.txt
:dvc-in char_from_octal:  ./dummy/data/data_conv_from_octal.txt
:dvc-out result_file : ./dummy/data/result.txt
:dvc-out mlflow_output: ./dummy/data/mlflow
"""
# Parameter values used only when running this Jupyter Notebook directly.
# The notebook lives in ./dummy/pipeline/notebooks, so the relative paths
# below resolve to the same ./dummy/data files declared in the docstring.
char_from_bin = '../../data/data_conv_from_bin.txt'
char_from_octal = '../../data/data_conv_from_octal.txt'
result_file = '../../data/result.txt'
mlflow_output = '../../data/mlflow'

In [None]:
# Load the whitespace-separated entries converted from the binary input file.
with open(char_from_bin) as src:
    data = src.read().split()

In [None]:
# Append the whitespace-separated entries converted from the octal input file
# to the entries already loaded.
with open(char_from_octal) as src:
    data.extend(src.read().split())

In [None]:
def _parse_entry(token):
    """Split one ``index=value`` entry into an ``(index, value)`` pair.

    Only the first '=' is treated as the separator, so a value that is itself
    '=' (entry ``5==``) is preserved instead of being dropped.
    An empty value is replaced by a single space (presumably because a space
    value is lost when the input is split on whitespace - TODO confirm).

    :param str token: one whitespace-delimited entry
    :return: the integer index and the value
    :rtype: tuple
    :raises ValueError: if the entry has no '=' or its index is not an integer
    """
    index, separator, value = token.partition('=')
    if not separator:
        raise ValueError(
            'malformed entry {!r}: expected "index=value"'.format(token))
    return int(index), value or ' '


# Order the values by their integer index; the sort is stable, so entries
# sharing an index keep their input order.
results = [value for _, value in sorted(map(_parse_entry, data),
                                        key=lambda entry: entry[0])]

In [None]:
"""
MLflow section

Not useful in this case; it is just a dummy usage to show how it works.
For selected slices of data we want to count some letter occurrences.

First, set the main output directory with mlflow.set_tracking_uri.

Then log metrics according to the parameters of each run:
    start a run: mlflow.start_run
    parameters:  mlflow.log_param
    metrics:     mlflow.log_metric
"""
from collections import Counter
import mlflow
import itertools

# All runs write their tracking data under this directory.
mlflow.set_tracking_uri(mlflow_output)

# One run per (slice size, starting index) pair: each slice starts either at
# the beginning of the results or at their midpoint.
slice_sizes = (20, 50, 70)
start_indexes = (0, len(results) // 2)

for slice_size, start_idx in itertools.product(slice_sizes, start_indexes):
    with mlflow.start_run():
        # Parameters identifying this run.
        mlflow.log_param('slice_size', slice_size)
        mlflow.log_param('start_index', start_idx)

        extract = results[start_idx:start_idx + slice_size]
        occurences = Counter(extract)

        # Metrics for this slice. A Counter yields 0 for an absent key.
        # The last metric is the total number of occurrences of the values
        # that appear at least 8 times in the slice.
        # NOTE(review): the metric name says "more than 8" while the
        # threshold is inclusive (>= 8) - confirm which one is intended.
        metrics = {
            'nb_a': occurences['a'],
            'nb_e': occurences['e'],
            'nb_letter_present_more_than_8': sum(
                count for count in occurences.values() if count >= 8),
        }
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

In [None]:
# Persist the sorted values as one newline-terminated line.
with open(result_file, 'w') as out:
    out.write(''.join(results) + '\n')

In [None]:
# Display only: echo the merged result in the notebook output.
print(*results, sep='')