## Imports
Contains import statements for all the required libraries and frameworks

In [33]:
%load_ext autoreload
%autoreload 2

import random
import numpy as np
from copy import deepcopy
from pprint import pprint
from importlib import import_module

from generators.utils import generate_data
from styles.utils import generate_styles
from creators.utils import create_graph, get_file_path, export_graph

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
# empty the output folder, keeping only the package marker file
import os

# named output_dir (not `dir`) so the builtin dir() stays usable in later cells
output_dir = 'data'
for entry_name in os.listdir(output_dir):
    entry_path = os.path.join(output_dir, entry_name)
    # os.remove raises on directories, so leave any sub-folders untouched
    if entry_name != '__init__.py' and not os.path.isdir(entry_path):
        os.remove(entry_path)

## Define Hyperparameters
Specifies the maximum number of graphs to be generated

The libraries and graph types to use can be specified as well

In [35]:
# total number of graphs to generate
max_num_graphs = 15

# libraries used to generate graphs (uncomment an entry to enable it)
libraries = [
    # 'bokeh',
    'altair',
    # 'plotnine',
]

# candidate graph types (uncomment an entry to enable it)
graph_types = [
    # 'scatter',
    # 'bar',
    # 'line',
    # 'contour',
    # 'kd',
    # 'histogram',
    # 'errorbar',
    # 'bubble',
    # 'area',
    # 'box',
    'violin',
]

# one independent, initially empty dict per enabled graph type
# (e.g. { 'scatter': {}, 'bar': {}, ... })
graphs = {graph_type: {} for graph_type in graph_types}
pprint(graphs)

{'violin': {}}


## Data Separation
Generates a dict representing the number of graphs that need to be created for each library/graph pair

The dictionary keys are `(graph_type, library)` tuples, and each value is the number of graphs to be generated for that pair

In [36]:
def split_number_evenly(n, n_arrays):
    """Split ``n`` into ``n_arrays`` integer parts of near-equal size.

    ``n_arrays + 1`` evenly spaced boundaries are laid out from ``n`` down
    to 0 and truncated to integers; each returned part is the gap between
    one boundary and the next.

    Returns a list with ``n_arrays`` entries (empty when ``n_arrays`` is 0).
    """
    boundaries = np.linspace(n, 0, n_arrays + 1).astype(int)
    # element-wise gap between every boundary and its successor
    return list(boundaries[:-1] - boundaries[1:])

def split_number_randomly(n, n_arrays):
    """Split ``n`` into exactly ``n_arrays`` non-negative integer parts at random.

    Every slot but the last receives a randomly sized share of whatever is
    still unassigned; the last slot takes the remainder, so the parts sum to
    ``n``. The parts are returned in shuffled order.

    Args:
        n: total to distribute; values <= 0 yield all-zero parts.
        n_arrays: number of parts to return.

    Returns:
        A list of ``n_arrays`` integers (empty when ``n_arrays`` <= 0).
    """
    if n_arrays <= 0:
        return []

    remaining, parts = max(n, 0), []
    # bound the number of draws by the slot count: an open-ended
    # "until nothing is left" loop can produce more parts than slots
    for _ in range(n_arrays - 1):
        share = round(random.random() * remaining)
        parts.append(share)
        remaining -= share
    # the final slot absorbs the rest so the total is preserved
    parts.append(remaining)
    # shuffle so the earliest draws do not always land in the first slots
    return random.sample(parts, len(parts))

def generate_occurences_dict(
    num_graphs,
    graph_types,
    libraries,
    equal_library_distribution=True,
    equal_graph_distribution=True,
):
    """Build the table of how many graphs to generate per graph type and library.

    ``num_graphs`` is first divided across ``graph_types``; each graph type's
    share is then divided across ``libraries``. Either division is even or
    random depending on the matching ``equal_*_distribution`` flag.

    Returns:
        A dict keyed by ``(graph_type, library)`` tuples whose values are
        counts (e.g. ``occurences['bar', 'bokeh']`` might be 5).
    """
    # pick the splitting strategy for each level
    if equal_graph_distribution:
        split_graph_func = split_number_evenly
    else:
        split_graph_func = split_number_randomly
    if equal_library_distribution:
        split_library_func = split_number_evenly
    else:
        split_library_func = split_number_randomly

    # 1-d: number of graphs for each graph type
    counts_per_type = split_graph_func(num_graphs, len(graph_types))

    # 2-d: for each graph type, number of graphs for each library
    counts_per_type_and_library = [
        split_library_func(type_total, len(libraries))
        for type_total in counts_per_type
    ]

    # flatten the 2-d table into a dict keyed by (graph_type, library)
    return {
        (graph_types[i], libraries[j]): count
        for i, counts_per_library in enumerate(counts_per_type_and_library)
        for j, count in enumerate(counts_per_library)
    }

# build the (graph_type, library) -> count table for the enabled graph types
graph_types = list(graphs)
occurences = generate_occurences_dict(
    num_graphs=max_num_graphs,
    graph_types=graph_types,
    libraries=libraries,
)
pprint(occurences)

{('violin', 'altair'): 15}


## Data Generation
Generates the data for each graph based on its graph type (via `generate_data(graph_type)`)

Data is stored in the `generated_graphs` object (i.e. under the `data` key of each graph entry)

In [37]:
# fill generated_graphs[graph_type]['graphs'] with entries shaped like
# { 'id': ..., 'library': ..., 'data': ... }
# e.g. generated_graphs['bar']['graphs'] might equal
# [{ 'id': 0, 'library': 'bokeh', 'data': ... }, ...]
generated_graphs = deepcopy(graphs)

# give every graph type its list up front, so cells that iterate over
# graph_object['graphs'] still work for a type that received no entries
for graph_object in generated_graphs.values():
    graph_object.setdefault('graphs', [])

for (graph_type, library), num_occurences in occurences.items():
    # num_occurences is something like occurences['bar', 'bokeh']
    graphs_list = generated_graphs[graph_type]['graphs']
    for _ in range(num_occurences):
        data = generate_data(graph_type)
        graphs_list.append({
            # ids count up from 0 within a graph type; deriving them from the
            # list length keeps them unique regardless of iteration order
            'id': len(graphs_list),
            'library': library,
            'data': data,
        })

## Data Stylization
Generates styles for a given graph-library pair (i.e. `generate_styles(graph_type, library)`)

Calls the `generate_bokeh_styles`, `generate_altair_styles`, and `generate_plotnine_styles` functions for each style module dynamically

The stylization code for each graph can be found in the `styles` dir respectively (i.e. `styles/bar.py`)

In [38]:
# attach a generated styles object to every graph entry
for graph_type, graph_object in generated_graphs.items():
    for entry in graph_object['graphs']:
        # styles are requested per (graph type, library) pair and stored
        # on the same entry that already holds the data
        entry['styles'] = generate_styles(graph_type, entry['library'])

# display the generated styles, one inner list per graph type
styles_per_type = [
    [entry['styles'] for entry in graph_object['graphs']]
    for graph_object in generated_graphs.values()
]
pprint(styles_per_type)

[[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}]]


## Graph Creation
Uses the generated data points and styles to create the respective graphs

Additional flags can be used to specify the output file path and file type

In [39]:
%matplotlib agg
for (graph_type, graph_object) in generated_graphs.items():
    for graph_content in graph_object['graphs']:
        # retrieves the create_LIBRARY_graph function based on the graphs
        # library and then appends the created graph to the same object
        library = graph_content['library']
        graph = create_graph(graph_type, library, graph_content)
        
        # export content to filepath
        id = graph_content['id']
        file_path = get_file_path(graph_type, library, id)
        export_graph(graph, library, file_path)
%matplotlib inline

0     102.889525
1       7.365760
2     171.973037
3     112.155286
4     349.237849
         ...    
77    264.580185
78     28.347795
79   -195.433733
80    262.639400
81    108.353268
Length: 82, dtype: float64
[-615.4495866234399, 710.7104146191549]
Exporting image data/violin_altair_0.png...


JavascriptError: Error: Unrecognized signal name: "concat_0_child_x_step"

<hr/>

## TODO: 
**Wednesday the 24th**
- Add in graph creation code (area, bar, box, errorbar, etc.)
- Add in theming code (selects random theme from directory, can be found in legacy codebase)

**Friday the 26th**
- Add in stylization code
- Add README documentation

## POSSIBLE TODOS:
- Possibly batching (if not enough resources to run pipeline for large number of graphs)

<br />
<hr/>

## Define Hyperparameters
1. Define hyperparameters such as number of total graphs, what types of libraries and graphs to be used, etc.

## Data Separation
1. Split the dataset up by library / graph type
1. Allow for distribution flags (so not entirely random)
    - `library_distribution` and `graph_distribution` flags

## Data Generation
1. Generate the corresponding data (i.e. `generate_bar()`)
1. Attach data to graph object (i.e. add `X` attribute to graph object)

## Style Generation
1. Chart Stylization (i.e. each library will be in charge of their own stylization)

## Chart Generation
1. Chart Generation

In [None]:
import pandas as pd

# scratch cell: shows a small example Series as the cell output
# NOTE(review): looks unrelated to the pipeline above — confirm it is still needed
pd.Series(data=[1, 2, 3])