## Imports
Contains import statements for all the required libraries and frameworks

In [37]:
%load_ext autoreload
%autoreload 2

import os
import json
import random
import logging
import numpy as np

from copy import deepcopy
from pprint import pprint
from datetime import datetime

from utils.utils import get_library_class
from utils.generators import generate_data
from utils.styles import generate_styles
from utils.creators import create_graph
from utils.exporters import (
    export_graph_data,
    export_graph_styles,
    export_graph_image,
    LOG_LEVEL
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Define Hyperparameters
Specifies the maximum number of graphs to be generated

Additionally, the types of libraries and plots can be specified as well

In [38]:
# number of graphs to be generated
max_num_graphs = 30

# list of all potential graph types
graph_types = [
    'scatter',
    'bar',
    'line',
    'contour',
    # 'kd',
    'histogram',
    'errorbar',
    'bubble',
    'area',
    'box',
    'violin',
]

# set of libraries generating graphs
libraries = [
    'bokeh',
    'altair',
    'plotnine',
]

# exclude certain graph-library combinations
exclusions = [
    ('contour', 'bokeh'),
    ('contour', 'altair'),
    # ('contour', 'plotnine')
]

## Setup
Set up and clean the pipeline environment

Tasks such as cleaning the output folders and setting up a logger are done here

In [39]:
# directories
OUTPUT_DIR = 'output'
INPUT_DIR = 'input'
LOG_DIR = 'logs'

# empty output folder
for folder in ['data', 'styles', 'images']:
    path = '{dir}/{folder}'.format(dir=OUTPUT_DIR, folder=folder)
    for file in os.listdir(path):
        if file != '__init__.py':
            os.remove(os.path.join(path, file))

# setup logger
timestamp = datetime.now().strftime('%d_%m_%Y-%I_%M_%S')
logging.basicConfig(
    filename='{dir}/{timestamp}.log'.format(dir=LOG_DIR, timestamp=timestamp),
    encoding='utf-8',
    level=LOG_LEVEL,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S',
    force=True,
)

# setup libraries (where applicable)
for library in libraries:
    library_class = get_library_class(library)
    library_class.setup_hook()

# generate dict for each possible graph type
# (e.g. { 'scatter': {}, 'bar': {}, ... })
graphs = dict(
    zip(graph_types, [{} for _ in range(len(graph_types))])
)
pprint(graphs)

{'bar': {}, 'violin': {}}


## Data Separation
Generates a dict representing the number of graphs that need to be created for each library/graph pair

The dictionary keys represent `(library, graph)` where the value represents the number of graphs to be generated

In [40]:
def generate_occurences_dict(
    num_graphs,
    graph_types,
    libraries,
    equal_library_distribution=True,
    equal_graph_distribution=True,
):
    # dict for storing the number of graphs per library / graph to generate
    # (e.g. occurences['bokeh', 'bar'] might return a value of 5)
    occurences, library_occurences, graph_occurences = {}, [], [] 
    
    # split graph occurences (1-d array)
    split_graph_func =  split_number_evenly if equal_graph_distribution else split_number_randomly
    graph_occurences = split_graph_func(num_graphs, len(graph_types))

    # split library occurences (2-d array)
    split_library_func = split_number_evenly if (equal_library_distribution) else split_number_randomly
    for graph_index, num_graphs_per_type in enumerate(graph_occurences):
        # only distribute graph occurences to available libraries (i.e. ones that aren't excluded)
        graph_exclusions = list(filter(lambda x: x[0] == graph_types[graph_index] and x[1] in libraries, exclusions))
        library_occurence = split_library_func(num_graphs_per_type, len(libraries) - len(graph_exclusions))
        # splice an occurence of 0 at all the unavaiable library positions
        # filter is needed to avoid indexing elements that don't exist
        library_indices = sorted(map(lambda x: libraries.index(x[1]), filter(lambda x: x[1] in libraries, graph_exclusions)))
        _ = [library_occurence.insert(library_index, 0) for library_index in library_indices]
        library_occurences.append(library_occurence)

    # add values generated above to occurence dict
    for i, num_graphs_per_type in enumerate(library_occurences):
        for j, num_graphs_per_library in enumerate(num_graphs_per_type):
            graph_type, library = graph_types[i], libraries[j]
            occurences[graph_type, library] = num_graphs_per_library
    return occurences

def split_number_evenly(n, n_arrays):
    values = np.linspace(n, 0, n_arrays+1).astype(int)
    return [value - values[i+1] for i, value in enumerate(values[:-1])]

def split_number_randomly(n, n_arrays):
    number, numbers = n, []
    while (number > 0):
        random_number = round(random.random()*number)
        numbers.append(random_number)
        number -= random_number
    numbers.extend([0] * (n_arrays - len(numbers)))
    return random.sample(numbers, len(numbers))

# generate graph occurence dictionary
graph_types = list(graphs.keys())
occurences = generate_occurences_dict(max_num_graphs, graph_types, libraries)
pprint(occurences)

{('bar', 'altair'): 5,
 ('bar', 'bokeh'): 5,
 ('bar', 'plotnine'): 5,
 ('violin', 'altair'): 5,
 ('violin', 'bokeh'): 5,
 ('violin', 'plotnine'): 5}


## Data Generation
Generates the corresponding data based on graph type (i.e. `generate_bar()`)

Data is stored in a generated_graphs object (i.e. an `X` attribute in said graph object)

In [41]:
# append objects in the format { id, library, data, etc. } to each corresponding graph type
# e.g. graphs['bar']['graphs'] might equal [{ id: 1, library: 'bokeh', data: [[1, 2, 3]]}, ...]
generated_graphs = deepcopy(graphs)
graph_type_id, current_graph_type = 0, None
for (graph_type, library), num_occurences in occurences.items():
    # use a counter to track how many entries belong to the current graph type
    if (current_graph_type != graph_type):
        graph_type_id = 0
        current_graph_type = graph_type

    # generate a n-length set of data points,
    # where n is something like occurences['bar', 'bokeh']
    graphs_list = generated_graphs[graph_type].setdefault('graphs', [])
    for _ in range(num_occurences):
        data = generate_data(graph_type)
        graphs_list.append({
            'id': graph_type_id,
            'library': library,
            'data': data,
        })
        graph_type_id += 1

## Data Stylization
Generates styles for a given library-graph pair (e.g. `generate_style(library, graph)`)

Calls the `generate_bokeh_styles`, `generate_altair_styles`, and `generate_plotnine_styles` functions for each style module dynamically

The stylization code for each graph can be found in the `styles` dir respectively (i.e. `styles/bar.py`)

In [42]:
# goes through each graph and generated corresponding style and theming objects
for (graph_type, graph_object) in generated_graphs.items():
    print(graph_type, graph_object)
    for graph_content in graph_object['graphs']:
        library, num_repeats = graph_content['library'], graph_content['data'].get('num_repeats', 1)
        graph_content['styles'] = generate_styles(graph_type, library, num_repeats)
        graph_content['data'].pop('num_repeats', None)

# displays generated styles objects
pprint([[graph_content['styles'] for graph_content in graph_object['graphs']] for graph_object in generated_graphs.values()])

bar {'graphs': [{'id': 0, 'library': 'bokeh', 'data': {'X': ['A', 'B', 'C', 'D'], 'y': array([33.08723708, 22.88917665,  2.70832629,  2.20196328]), 'is_vertical': False}}, {'id': 1, 'library': 'bokeh', 'data': {'X': ['A', 'B', 'C', 'D', 'E', 'F'], 'y': array([20.38856075, 27.30729951, 27.91696666, 29.98148692, 49.06907735,
       47.0536166 ]), 'is_vertical': True}}, {'id': 2, 'library': 'bokeh', 'data': {'X': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], 'y': array([ 5.05327808,  8.17744556,  8.70850103, 10.818941  , 23.5392161 ,
       30.88002333, 36.19890976, 45.4560153 ]), 'is_vertical': True}}, {'id': 3, 'library': 'bokeh', 'data': {'X': ['A', 'B', 'C', 'D', 'E', 'F', 'G'], 'y': array([42.7262197 , 33.55786748, 27.93842037, 25.60750005, 11.82091724,
       10.79368371,  4.5513662 ]), 'is_vertical': True}}, {'id': 4, 'library': 'bokeh', 'data': {'X': ['A', 'B', 'C', 'D'], 'y': array([42.5118744 , 34.61941804, 21.43612538,  9.07175985]), 'is_vertical': False}}, {'id': 5, 'library': 'al

## Graph Creation
Uses the generated data points and styles to create the respective graphs

Additional flags can be used to specify the output file path and file type

In [43]:
# prevents inline chart display
%matplotlib agg

for (graph_type, graph_object) in generated_graphs.items():
    for graph_content in graph_object['graphs']:
        # retrieves the create_LIBRARY_graph function based on the graphs
        # library and then appends the created graph to the same object
        library = graph_content['library']
        graph = create_graph(graph_type, library, graph_content)
        
        # export content to filepath
        id = graph_content['id']
        file_name = '{graph_type}_{library}_{id}'.format(graph_type=graph_type, library=library, id=id)

        # save data, styles, and images to disk
        # export_graph_data(graph_content['data'], 'output/{path}/{file_name}.{file_type}'
        #     .format(file_name=file_name, path='data', file_type='json'))
        # export_graph_styles(graph_content['styles'], 'output/{path}/{file_name}.{file_type}'
        #     .format(file_name=file_name, path='styles', file_type='json'))
        export_graph_image(graph, library, 'output/{path}/{file_name}.{file_type}'
            .format(file_name=file_name, path='images', file_type='png'))

# resets inline chart display back to original settings
%matplotlib inline

False
True
True
True
False


AttributeError: 'list' object has no attribute 'get'

### Graph Regeneration
The ability to regenerate graphs is critical, as the pipeline above will be involved in creating input data for a Generative Adversarial Network. It is neccessary to compare the images from both the original pipeline as well as the GAN in order to assess the network's accuracy

By supplying input data to the `input/data` and `input/styles` folders, the following cell will read in the data, generate the corresponding charts, and export the data to the `input/images` folder

The inputted data and styles files are required to be in `json` format

In [None]:
# retrieve a list of contents from the specified input folders
input_folders = ['data', 'styles']
input_folder_paths = ['{dir}/{folder}'.format(dir=INPUT_DIR, folder=folder) for folder in input_folders]
input_folder_contents = [set(os.listdir(path)) for path in input_folder_paths]

# only keep filenames of the graphs with all required files
valid_input_graphs = set.intersection(*input_folder_contents)
_ = [valid_input_graphs.discard(file) for file in ['__init__.py', '__pycache__']]

# retrieve graph type and library based on filename
graphs = [graph.split('_', maxsplit=3) for graph in valid_input_graphs]
for graph_filename in valid_input_graphs:
  # extract graph data (from filename) and re-generate the expected data structure
  # uses the given filename to retrieve the input files on disk
  graph_type, library, id = graph_filename.split('_', maxsplit=3)
  graph_content = {}
  # note that the object property will be equal to the input folder name
  # i.e. files in input/data will be stored as object['data']
  for folder in input_folders:
    with open('{dir}/{folder}/{file_name}'.format(dir=INPUT_DIR, folder=folder, file_name=graph_filename)) as f:
      graph_content[folder] = json.load(f)
  # create and export graph
  graph = create_graph(graph_type, library, graph_content)
  export_graph_image(graph, library, 'input/{path}/{file_name}.{file_type}'
    .format(file_name=graph_filename, path='images', file_type='png'))

logging.shutdown()