# ETL of Pipeline 2 Data for Visualization

In [1]:
import pandas as pd
from joblib import dump, load

In [2]:
# Declare constants

# Every model is identified by a (directory name, file-name stem) pair.
MODEL_DIR_NAMES = ['linear_svc', 'multi_nb']
MODEL_FILE_NAMES = ['linearSVC', 'multinomialNB']
MODEL_NAMES = list(zip(MODEL_DIR_NAMES, MODEL_FILE_NAMES))

# TODO: resolve this relative to the project root (dev/) instead of climbing
# out of the notebook's directory.
VIZ_2_DATA_LOC = '../../../visualizations/part2/data'

# Path lookups, keyed by model directory name:
#   *_LOCS             -> joblib inputs read from ./data
#   TRANSFORMED_*_LOCS -> JSON outputs under VIZ_2_DATA_LOC
METRIC_LOCS = {}
TRANSFORMED_METRIC_LOCS = {}
CV_LOCS = {}
TRANSFORMED_CV_LOCS = {}

for d_name, f_name in MODEL_NAMES:
    # Metrics
    METRIC_LOCS[d_name] = f'./data/metrics/{d_name}/{f_name}_metrics.joblib'
    TRANSFORMED_METRIC_LOCS[d_name] = f'{VIZ_2_DATA_LOC}/metrics/{d_name}/{f_name}_metrics.json'
    # CV_Scores
    CV_LOCS[d_name] = f'./data/cv_scores/{d_name}/{f_name}_cv_scores.joblib'
    TRANSFORMED_CV_LOCS[d_name] = f'{VIZ_2_DATA_LOC}/cv_scores/{d_name}/{f_name}_cv_scores.json'

### `metrics/` transformation

In [3]:
# Import defined metrics
# One joblib file per model; the result is keyed by model directory name, in
# METRIC_LOCS order. joblib.load is pickle-based, so only point it at files
# produced by this project's own pipeline.
raw_metrics = dict(zip(METRIC_LOCS, map(load, METRIC_LOCS.values())))

In [4]:
# Format imported metrics
#
# Step 1: collapse each model's raw metric entries into {metric name: result}.
flattened_metrics = {}
for model_name, metrics in raw_metrics.items():
    model_out = {}
    for metric in metrics:
        # An entry may arrive wrapped in a list; only its first element is used.
        if isinstance(metric, list):
            metric = metric[0]
        if not isinstance(metric, dict):
            # NOTE(review): a non-dict entry replaces everything collected so far
            # for this model -- presumably such an entry is already a flattened
            # mapping; confirm against the code that writes the metrics files.
            model_out = metric
        else:
            model_out[metric.get('name')] = metric.get('result')
    flattened_metrics[model_name] = model_out

# Step 2: regroup the results per metric, across models.
#
# Results are matched by metric *name*, not by position: zipping the models'
# items positionally would silently file a result under the wrong metric name
# whenever two models store their metrics in a different order. Only metrics
# present for every model are kept (listed in the first model's order), so for
# models whose metrics are already aligned the output is unchanged.
per_model_metrics = list(flattened_metrics.values())
shared_metric_names = [
    metric_name
    for metric_name in (per_model_metrics[0] if per_model_metrics else ())
    if all(metric_name in model_metrics for model_metrics in per_model_metrics)
]

# One tuple per metric holding a (metric name, result) pair for each model,
# in the same model order as `raw_metrics`.
grouped_metrics = [
    tuple((metric_name, model_metrics[metric_name]) for model_metrics in per_model_metrics)
    for metric_name in shared_metric_names
]

# {metric name: (result for model 1, result for model 2, ...)}
metrics_lookup = {
    metric_name: tuple(model_metrics[metric_name] for model_metrics in per_model_metrics)
    for metric_name in shared_metric_names
}

metrics_lookup

{'CV Classification - accuracy': (array([0.95298281, 0.95500506, 0.9580172 , 0.95599393, 0.9549823 ]),
  array([0.94034378, 0.94034378, 0.94334851, 0.94638341, 0.94284269])),
 'Confusion Matrix': (array([[ 384,    0,    0],
         [   0,  295,    0],
         [   0,    0, 9208]]),
  array([[ 384,    0,    0],
         [   0,  295,    0],
         [   0,    0, 9208]])),
 'Classification Report': ({'-1': {'precision': 1.0,
    'recall': 1.0,
    'f1-score': 1.0,
    'support': 384},
   '0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 295},
   '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 9208},
   'accuracy': 1.0,
   'macro avg': {'precision': 1.0,
    'recall': 1.0,
    'f1-score': 1.0,
    'support': 9887},
   'weighted avg': {'precision': 1.0,
    'recall': 1.0,
    'f1-score': 1.0,
    'support': 9887}},
  {'-1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 384},
   '0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'supp

In [5]:
def assign_label(metric_name):
    """Pair every model name with that model's result for `metric_name`.

    Returns a tuple of ``(model_name, result)`` pairs. The pairing is
    positional: the i-th key of `raw_metrics` is matched with the i-th entry
    of ``metrics_lookup[metric_name]``.
    """
    model_names = raw_metrics.keys()
    results = metrics_lookup[metric_name]
    return tuple(zip(model_names, results))


labeled_metrics = {metric_name: assign_label(metric_name) for metric_name in metrics_lookup}
labeled_metrics

{'CV Classification - accuracy': (('linear_svc',
   array([0.95298281, 0.95500506, 0.9580172 , 0.95599393, 0.9549823 ])),
  ('multi_nb',
   array([0.94034378, 0.94034378, 0.94334851, 0.94638341, 0.94284269]))),
 'Confusion Matrix': (('linear_svc',
   array([[ 384,    0,    0],
          [   0,  295,    0],
          [   0,    0, 9208]])),
  ('multi_nb',
   array([[ 384,    0,    0],
          [   0,  295,    0],
          [   0,    0, 9208]]))),
 'Classification Report': (('linear_svc',
   {'-1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 384},
    '0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 295},
    '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 9208},
    'accuracy': 1.0,
    'macro avg': {'precision': 1.0,
     'recall': 1.0,
     'f1-score': 1.0,
     'support': 9887},
    'weighted avg': {'precision': 1.0,
     'recall': 1.0,
     'f1-score': 1.0,
     'support': 9887}}),
  ('multi_nb',
   {'-1': {'precision': 1.0, 're

### `cv_scores/` transformation

In [8]:
# Import cross validation scores
# One joblib file per model; the result is keyed by model directory name, in
# CV_LOCS order.
# NOTE(review): the recorded run of this cell ended in `KeyError: 44`, so
# presumably at least one file listed in CV_LOCS is not a readable joblib dump
# -- confirm and regenerate it before relying on `raw_cv_scores`.
raw_cv_scores = dict(zip(CV_LOCS, map(load, CV_LOCS.values())))

KeyError: 44