## Imports
Contains import statements for all the required libraries and frameworks

In [1]:
import random
import numpy as np
from pprint import pprint
from importlib import import_module

from generators.utils import generate_data
from styles.utils import generate_styles
from creators.utils import create_graph, get_file_path, export_graph

## Define Hyperparameters
Specifies the maximum number of graphs to be generated

Additionally, the types of libraries and plots can be specified as well

In [2]:
# number of graphs to be generated
max_num_graphs = 35

# set of libraries generating graphs
libraries = [
    'bokeh',
    'altair',
    'plotnine',
]

# list of all potential graph types
graph_types = [
    'scatter',
    'bar',
    'line',
    'contour',
    'kd',
    'histogram',
    'errorbar',
    'bubble',
    'area',
    'box',
]

# generate dict for each possible graph type
# (e.g. { 'scatter': {}, 'bar': {}, ... })
graphs = dict(
    zip(graph_types, [{}] * len(graph_types))
)
pprint(graphs)

{'area': {},
 'bar': {},
 'box': {},
 'bubble': {},
 'contour': {},
 'errorbar': {},
 'histogram': {},
 'kd': {},
 'line': {},
 'scatter': {}}


## Data Separation
Generates a dict representing the number of graphs that need to be created for each library/graph pair

The dictionary keys represent `(library, graph)` where the value represents the number of graphs to be generated

In [3]:
def split_number_evenly(n, n_arrays):
    values = np.linspace(n, 0, n_arrays+1).astype(int)
    return [value - values[i+1] for i, value in enumerate(values[:-1])]

def split_number_randomly(n, n_arrays):
    number, numbers = n, []
    while (number > 0):
        random_number = round(random.random()*number)
        numbers.append(random_number)
        number -= random_number
    numbers.extend([0] * (n_arrays - len(numbers)))
    return random.sample(numbers, len(numbers))

def generate_occurences_dict(
    num_graphs,
    graph_types,
    libraries,
    equal_library_distribution=True,
    equal_graph_distribution=True,
):
    # dict for storing the number of graphs per library / graph to generate
    # (e.g. occurences['bokeh', 'bar'] might return a value of 5)
    occurences, library_occurences, graph_occurences = {}, [], [] 
    
    # split graph occurences (1-d array)
    split_graph_func =  split_number_evenly if equal_graph_distribution else split_number_randomly
    graph_occurences = split_graph_func(num_graphs, len(graph_types))

    # split library occurences (2-d array)
    split_library_func = split_number_evenly if (equal_library_distribution) else split_number_randomly
    library_occurences = [split_library_func(num_graphs_per_type, len(libraries)) for num_graphs_per_type in graph_occurences]

    # add values generated above to occurence dict
    for i, num_graphs_per_type in enumerate(library_occurences):
        for j, num_graphs_per_library in enumerate(num_graphs_per_type):
            graph_type, library = graph_types[i], libraries[j]
            occurences[graph_type, library] = num_graphs_per_library
    return occurences

# generate graph occurence dictionary
graph_types = list(graphs.keys())
occurences = generate_occurences_dict(100, graph_types, libraries)
pprint(occurences)

{('area', 'altair'): 3,
 ('area', 'bokeh'): 4,
 ('area', 'plotnine'): 3,
 ('bar', 'altair'): 3,
 ('bar', 'bokeh'): 4,
 ('bar', 'plotnine'): 3,
 ('box', 'altair'): 3,
 ('box', 'bokeh'): 4,
 ('box', 'plotnine'): 3,
 ('bubble', 'altair'): 3,
 ('bubble', 'bokeh'): 4,
 ('bubble', 'plotnine'): 3,
 ('contour', 'altair'): 3,
 ('contour', 'bokeh'): 4,
 ('contour', 'plotnine'): 3,
 ('errorbar', 'altair'): 3,
 ('errorbar', 'bokeh'): 4,
 ('errorbar', 'plotnine'): 3,
 ('histogram', 'altair'): 3,
 ('histogram', 'bokeh'): 4,
 ('histogram', 'plotnine'): 3,
 ('kd', 'altair'): 3,
 ('kd', 'bokeh'): 4,
 ('kd', 'plotnine'): 3,
 ('line', 'altair'): 3,
 ('line', 'bokeh'): 4,
 ('line', 'plotnine'): 3,
 ('scatter', 'altair'): 3,
 ('scatter', 'bokeh'): 4,
 ('scatter', 'plotnine'): 3}


## Data Generation
Generates the corresponding data based on graph type (i.e. `generate_bar()`)

Data is stored in a generated_graphs object (i.e. an `X` attribute in said graph object)

In [4]:
# append objects in the format { id, library, data, etc. } to each corresponding graph type
# e.g. graphs['bar']['graphs'] might equal [{ id: 1, library: 'bokeh', data: [[1, 2, 3]]}, ...]
generated_graphs = graphs.copy()
graph_type_id, current_graph_type = 0, None
for (graph_type, library) in occurences:

    # use a counter to track how many entries belong to the current graph type
    if (current_graph_type != graph_type):
        graph_type_id = 0
        current_graph_type = graph_type

    # generate a n-length set of data points,
    # where n is something like occurences['bar', 'bokeh']
    generated_graphs[graph_type].setdefault('graphs', [])
    for _ in range(occurences[graph_type, library]):
        data = generate_data(graph_type)
        generated_graphs[graph_type]['graphs'].append({
            'id': graph_type_id,
            'library': library,
            'data': data,
        })
        graph_type_id += 1

pprint(generated_graphs)

{'area': {'graphs': [{'data': (array([[ 8.62054669e-01],
       [ 9.14069736e-01],
       [-2.07204620e+00],
       [-1.28124544e+00],
       [ 2.28766556e+00],
       [ 7.11121518e-01],
       [-7.70649793e-01],
       [ 6.36733875e-01],
       [ 1.08280671e+00],
       [-1.21040186e-01],
       [ 1.95362671e-01],
       [-2.81044549e-01],
       [ 1.07864262e-01],
       [ 2.37859525e-01],
       [-1.37891962e+00],
       [-3.81476481e-01],
       [-6.50957288e-01],
       [-2.26186158e-01],
       [-2.01921174e+00],
       [ 3.00476550e+00],
       [ 8.50217619e-01],
       [-1.54100365e+00],
       [-1.38275221e-01],
       [-1.59469052e-01],
       [-1.69550361e+00],
       [ 5.90853533e-01],
       [-3.41531924e-01],
       [ 6.35984091e-01],
       [ 3.87990090e-01],
       [ 1.12103657e+00],
       [ 7.75673501e-01],
       [ 2.73891294e-01],
       [-4.60653465e-02],
       [-8.02664219e-01],
       [ 1.11911497e+00],
       [-3.42208586e-02],
       [ 6.20078000e-01],
       

## Data Stylization
Generates styles for a given library-graph pair (e.g. `generate_style(library, graph)`)

Calls the `generate_bokeh_styles`, `generate_altair_styles`, and `generate_plotnine_styles` functions for each style module dynamically

The stylization code for each graph can be found in the `styles` dir respectively (i.e. `styles/bar.py`)

In [5]:
for (graph_type, graph_object) in generated_graphs.items():
    for graph_content in graph_object['graphs']:
        # retrieves the generate_LIBRARY_style function based on the graphs
        # library and then appends the generated style to the same object
        library = graph_content['library']
        graph_content['styles'] = generate_styles(graph_type, library)

# displays generated styles objects
pprint([[graph_content['styles'] for graph_content in graph_object['graphs']] for graph_object in generated_graphs.values()])

[[{},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {}],
 [{},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {},
  {

## Graph Creation
Uses the generated data points and styles to create the respective graphs

Additional flags can be used to specify the output file path and file type

In [6]:
for (graph_type, graph_object) in generated_graphs.items():
    for graph_content in graph_object['graphs']:
        # retrieves the create_LIBRARY_graph function based on the graphs
        # library and then appends the created graph to the same object
        library = graph_content['library']
        graph_content['graph'] = create_graph(graph_content, graph_type, library)
        
        # export content to filepath
        id = graph_content['id']
        file_path = get_file_path(graph_type, library, id)
        export_graph(graph_content, library, file_path)

<hr/>

## TODO: 
**Wednesday the 24th**
- Add in graph creation code (area, bar, box, errorbar, etc.)
- Add in theming code (selects random theme from directory, can be found in legacy codebase)

**Friday the 26th**
- Add in stylization code ()
- Add README documentation

## POSSIBLE TODOS:
- Possibly batching (if not enough resources to run pipeline for large number of graphs)

<br />
<hr/>

## Define Hyperparameters
1. Define hyperparameters such as number of total graphs, what types of libraries and graphs to be used, etc.
## Data Separation
1. Split the dataset up by library / graph type
1. Allow for distribution flags (so not entirely random)
    - `library_distribution` and `graph_distribution` flags

## Data Generation
1. Generate the corresponding data (i.e. `generate_bar()`)
1. Attach data to graph object (i.e. add `X` attribute to graph object)

# Style Generation
1. Chart Stylization (i.e. each library will be in charge of their own stylization)

# Chart Generation
1. Chart Generation