In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import scipy.stats
import geopandas as gpd
%matplotlib inline
from IPython.display import Markdown
from functools import reduce
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

### Theming

In [None]:
# Flip to True to use the dark variant of the notebook theme.
USE_DARK_THEME = False

# `style` holds the inline CSS that is merged into the root container of the
# Dash layout further down; the default plotly template is switched to match.
style = (
    {
        'background-color': '#1b1b1b',  # rgb(27, 27, 27)
        'color': 'white',  # font
    }
    if USE_DARK_THEME
    else {}
)
pio.templates.default = 'plotly_dark' if USE_DARK_THEME else 'plotly'

### Loading
Table 40, broken down by region (kraj).  
An age of 90 stands for 90 or older.

In [None]:
from os.path import join
import preprocessing

# Folder (or URL prefix) that holds the CSV exports.
data_path = 'data'
# data_path = 'https://raw.githubusercontent.com/nolasemon/slovakia-education/dev/data' # in case of using notebook as a file - requires internet

# Read the semicolon-separated export of table 40 and pass it through the
# project's preprocessing step in one go.
table_40_csv = join(data_path, 'RV_O_040_R_KR_SK.CSV')
table_40 = preprocessing.preprocess(pd.read_csv(table_40_csv, sep=';'))

## Computed/Display values
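The following function returns a data frame with these columns:
- `number` - number of people that match the query filter
- `category_percent` - `number` as a percentage of the category total (of the whole selection when no grouping is used)
- `filtered_percent` - `number` as a percentage of the unfiltered count for the same age (and category)
- `age_percent` - `number` as a percentage of all selected people of the same age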

In [None]:
# Display the preprocessed table as the output of this cell.
table_40

In [None]:
def _sum_count_by(frame, keys, name):
    """Sum the `count` column of `frame` per combination of `keys`, as a Series called `name`."""
    return frame.groupby(keys, observed=True)['count'].sum().rename(name)


def compute_age_based(data, query, groupby=None, filter_result=None):
    """
    Aggregate people counts per age (and optionally per category).

    Parameters:
        data (DataFrame): The input DataFrame; must contain the columns `age`
            and `count` plus whatever `query` and `groupby` refer to.
        query (str or None): `DataFrame.query` expression selecting the rows of
            interest. `None`, an empty or a blank string selects everything.
        groupby (str, optional): The column name to group by in addition to
            `age`; determines the categories.
        filter_result (number, optional): The lower bound of `number` for a row
            to be kept in the result. `None`, 0 or a negative value keeps all rows.

    Returns:
        DataFrame: one row per age (per age and category when `groupby` is
        given) with the columns
            - `number`: selected people in that row,
            - `total_category` (only with `groupby`): selected people in the
              row's category over all ages,
            - `total_unfiltered`: people in `data` (before the query) for the
              same age (and category),
            - `total_age`: selected people of the same age over all categories,
            - `category_percent`: `number` / category total * 100; without
              `groupby` the total is the whole selection,
            - `filtered_percent`: `number` / `total_unfiltered` * 100,
            - `age_percent`: `number` / `total_age` * 100 (always 100 without
              `groupby`).
    """
    # `DataFrame.query` rejects None and empty expressions, so treat every
    # "no filter" spelling the same way instead of crashing on None.
    if query is None or not str(query).strip():
        chosen = data
    else:
        chosen = data.query(query)

    keys = ['age'] if groupby is None else ['age', groupby]

    aggregated = _sum_count_by(chosen, keys, 'number').reset_index()
    if groupby is None:
        # Without categories the denominator is a single scalar: the whole selection.
        category_total = chosen['count'].sum()
    else:
        per_category = _sum_count_by(chosen, [groupby], 'total_category')
        aggregated = pd.merge(aggregated, per_category, on=[groupby], how='left')
        category_total = aggregated['total_category']

    aggregated = pd.merge(
        aggregated, _sum_count_by(data, keys, 'total_unfiltered'), on=keys, how='left'
    )
    aggregated = pd.merge(
        aggregated, _sum_count_by(chosen, ['age'], 'total_age'), on=['age'], how='left'
    )

    aggregated['category_percent'] = aggregated['number'] / category_total * 100
    aggregated['filtered_percent'] = aggregated['number'] / aggregated['total_unfiltered'] * 100
    aggregated['age_percent'] = aggregated['number'] / aggregated['total_age'] * 100

    if filter_result is not None and filter_result > 0:
        aggregated = aggregated[aggregated['number'] >= filter_result]
    return aggregated


def plot_age_based(
    data,
    groupby=None,
    title="",
    display_value="number",
    markers=False,
):
    """
    Draw a line plot of `display_value` over age.

    Parameters:
        data (DataFrame): Result of `compute_age_based`; needs the columns
            `age`, `number` and `display_value` (plus `groupby` when given).
        groupby (str, optional): Column used to colour one line per category.
        title (str): Figure title.
        display_value (str): Column plotted on the y axis, e.g. `"number"` or
            one of the `*_percent` columns.
        markers (bool): Draw a marker on every data point in addition to the line.

    Returns:
        The plotly figure.
    """
    figure = px.line(
        data, x="age", y=display_value, color=groupby, hover_data=["number"]
    )
    # `display_value` is a column name, so the absolute-count case has to be
    # recognised by name; every other column is a percentage.
    if display_value == "number":
        y_title = "Number of people"
    else:
        y_title = "Percent of people"
    figure.update_layout(xaxis_title="Age", yaxis_title=y_title, title=title)
    figure.update_traces(
        mode="lines+markers" if markers else "lines",
        # Join the line across ages that are missing for a category.
        connectgaps=True,
    )
    return figure


# Example: people with a first-stage (Bc.) university degree per age, split by sex.
groupby = "sex"
bachelor_filter = 'education == "vysokoškolské vzdelanie - 1. stupeň (Bc.)"'
data = compute_age_based(table_40, bachelor_filter, groupby=groupby)
# Uncomment any of the following to render the corresponding display value:
# plot_age_based(data, groupby=groupby, display_value="number").show()
# plot_age_based(data, groupby=groupby, display_value="category_percent").show()
# plot_age_based(data, groupby=groupby, display_value="filtered_percent").show()
# plot_age_based(data, groupby=groupby, display_value="age_percent").show()

## Interactive `dash` plot by age

Data is filtered and then optionally grouped by a feature.  
The display value can be chosen from:
- Number
- Category percent - total is category
- Filtered percent - filtered / not filtered
- Age percent - total is all at the same age

Filtering (and grouping) can be done by the following features:
- Vzdelanie
- Grouped edu
- Názov kraja
- Pohlavie
- Súčasná ekonomická aktivita

In [None]:
def _range_slider_marks(lower, upper, target_marks=10):
    """Return `{value: label}` slider marks from `lower` to `upper` inclusive."""
    # A span narrower than `target_marks` floors the step to 0, which range()
    # rejects, so never go below a step of 1.
    step = max(1, (upper - lower) // target_marks)
    return {value: str(value) for value in range(lower, upper + 1, step)}


def _category_dropdown(data, attr, type):
    """Build a multi-select Dropdown offering the categories of `data[attr]`."""
    return dcc.Dropdown(
        # Mark selector element with type and attr to find it later
        id={"type": type, "attr": attr},
        # attr must be of type `category`
        options=data[attr].cat.categories,
        persistence=True,
        multi=True,
    )


def _int_range_slider(data, attr, type):
    """Build a RangeSlider covering `[min, max + 1]` of the integer column `data[attr]`."""
    # Plain ints keep the mark keys and slider bounds free of numpy scalars.
    lower = int(data[attr].min())
    upper = int(data[attr].max())
    return dcc.RangeSlider(
        id={"type": type, "attr": attr},
        min=lower,
        # One past the maximum: the query built from the slider treats the
        # upper handle as exclusive.
        max=upper + 1,
        step=1,
        marks=_range_slider_marks(lower, upper),
        value=[lower, upper + 1],
        persistence=True,
    )


# Maps a column dtype name to a factory `(data, attr, type) -> Dash component`.
# Factories are only called on demand, so `dcc` is resolved at call time.
ATTR_SELECTOR_MAP = {
    "category": _category_dropdown,
    "int64": _int_range_slider,
}

def _category_clause(attr, entry):
    """Membership clause; `entry` is a Dropdown value, i.e. a list of option values."""
    return "`{}`.isin({})".format(attr, entry)


def _int_range_clause(attr, entry):
    """Half-open range clause; `entry` is a RangeSlider value, i.e. two boundaries."""
    return "{} <= `{}` < {}".format(entry[0], attr, entry[1])


# Maps a column dtype name to a builder `(attr, entry) -> query expression`.
ATTR_QUERY_EXPR_MAP = {
    "category": _category_clause,
    "int64": _int_range_clause,
}

def _has_selection(entry):
    """True when at least one option is selected."""
    return len(entry) != 0


def _is_boundary_pair(entry):
    """True when the value consists of exactly two boundaries."""
    return len(entry) == 2


# Maps a column dtype name to a predicate telling whether a selector value
# should contribute a clause to the query.
TEST_ATTR_FILTER_MAP = {
    "category": _has_selection,
    "int64": _is_boundary_pair,
}


def get_selectivity(data, attributes, type):
    """
    Build one titled selector component per attribute.

    Parameters:
        data (DataFrame): Frame whose columns are being selected on.
        attributes (iterable of str): Column names to create selectors for.
        type (str): Tag stored in each selector's id so callbacks can match it.

    Returns:
        list: one `html.Div` per attribute holding a heading and the selector
        that `ATTR_SELECTOR_MAP` provides for the column's dtype.

    Raises:
        TypeError: if a column's dtype has no entry in `ATTR_SELECTOR_MAP`.
    """
    sections = []
    for attr in attributes:
        column = data[attr]
        dtype_key = str(column.dtype).lower()
        make_selector = ATTR_SELECTOR_MAP.get(dtype_key)
        if make_selector is None:
            # Still a TypeError, as calling the missing entry would be, but
            # one that says which column and dtype are the problem.
            raise TypeError(
                f"No selector defined for column {attr!r} of dtype {dtype_key!r}; "
                f"supported dtypes: {sorted(ATTR_SELECTOR_MAP)}"
            )
        sections.append(
            html.Div(
                [
                    html.H4(f"Select {column.name}"),
                    make_selector(data, attr, type),
                ]
            )
        )
    return sections


def form_query(data, selectors, attrs):
    """
    Combine selector values into one query string.

    `selectors` and `attrs` are parallel sequences: the i-th selector value
    belongs to the i-th column name. Values that are None or that the dtype's
    test in `TEST_ATTR_FILTER_MAP` rejects are skipped; the remaining clauses
    from `ATTR_QUERY_EXPR_MAP` are joined with " and ". Returns "" when no
    clause remains.
    """
    clauses = []
    for attr, entry in zip(attrs, selectors):
        if entry is None:
            continue
        dtype_key = str(data[attr].dtype).lower()
        # An unknown dtype yields None here and fails on the call.
        is_active = TEST_ATTR_FILTER_MAP.get(dtype_key)
        if not is_active(entry):
            continue
        build_clause = ATTR_QUERY_EXPR_MAP.get(dtype_key)
        clauses.append(build_clause(attr, entry))
    return " and ".join(clauses)


def big_annotation(text: str, color: str):
    """
    Return the properties of a large, faint, tilted annotation centred on the figure.

    `text` is upper-cased; `color` is the font colour. The result is a plain
    dict suitable for `figure.add_annotation`.
    """
    watermark_font = {"color": color, "size": 100}
    return {
        "name": "draft watermark",
        "text": text.upper(),
        "textangle": -30,
        "opacity": 0.1,
        "font": watermark_font,
        # Paper coordinates: (0.5, 0.5) is the centre of the plotting area.
        "xref": "paper",
        "yref": "paper",
        "x": 0.5,
        "y": 0.5,
        "showarrow": False,
    }

In [None]:
# %%script true # Skip
from dash import Dash, dcc, html, Input, Output, ALL, State
from dash.exceptions import PreventUpdate

app = Dash(__name__)

# Empty figure with a "START" watermark, used as the graph's initial figure.
figure = go.Figure()
figure.add_annotation(big_annotation("START", "black"))

# The option value is the label's position in this list; the figure callback
# maps that index back to the matching computed column.
display_value_labels = [
    "Number",
    "Category percent",
    "Filtered percent",
    "Age percent",
]
display_value_options = [
    {"value": index, "label": label}
    for index, label in enumerate(display_value_labels)
]

# Left column: every input that drives the plot.
controls_panel = html.Div(
    [
        html.H4("Enter title"),
        dcc.Input(id="title", type="text", persistence=True, style={"width": "100%"}),
        html.H4("Select groupby"),
        dcc.Dropdown(
            id="groupby",
            options=["None"] + list(table_40.columns),
            persistence=True,
        ),
        html.H4("Select display value"),
        dcc.Dropdown(
            options=display_value_options,
            value=0,
            id="display-value",
            persistence=True,
        ),
        html.H4("Enter lower bound"),
        dcc.Input(id="lower-bound", type="number", persistence=True),
        dcc.Checklist(
            id="markers-checkbox",
            options=[{"label": "Add markers", "value": True}],
            value=[],
            persistence=True,
        ),
        html.H4("Select chosen/percented attributes"),
        dcc.Dropdown(
            id="chosen-attributes",
            multi=True,
            persistence=True,
            options=table_40.columns,
        ),
        # Filled by a callback with one selector per chosen attribute.
        html.Div(id="choose-zone"),
    ],
    style={"flex": 1, "minWidth": 400, "padding": 10},
)

# Right column: the plot and the code that reproduces it.
plot_panel = html.Div(
    [
        dcc.Graph(
            id="line-plot",
            style={"aspect-ratio": "1.6"},
            figure=figure,
        ),
        dcc.Textarea(id="function-call", style={"width": "100%"}),
    ],
    style={"flex": 2, "padding": 10},
)

app.layout = html.Div(
    [controls_panel, plot_panel],
    style=style | {"padding": 10, "display": "flex", "flexDirection": "row"},
)


@app.callback(Output("choose-zone", "children"), Input("chosen-attributes", "value"))
def update_chosen(chosen_attributes):
    """Rebuild the selector widgets whenever the set of chosen attributes changes."""
    if chosen_attributes is not None:
        return get_selectivity(table_40, chosen_attributes, type="chosen")
    # Nothing has been picked yet: leave the zone untouched.
    raise PreventUpdate


@app.callback(
    Output("line-plot", "figure"),
    Output("function-call", "value"),
    Input("groupby", "value"),
    Input("lower-bound", "value"),
    Input("title", "value"),
    Input("display-value", "value"),
    Input("markers-checkbox", "value"),
    Input({"type": "chosen", "attr": ALL}, "value"),
    State({"type": "chosen", "attr": ALL}, "id"),
)
def update_figure(
    groupby, lower_bound, title, display_value, checkbox, chosen, chosen_id
):
    """
    Recompute the plot and the code snippet that reproduces it.

    Parameters:
        groupby: Value of the groupby dropdown; the string "None" or no
            selection means no grouping.
        lower_bound: Minimum `number` for a row to be plotted, or None.
        title: Figure title.
        display_value: Index into the list of computed columns
            (0 = number, 1 = category percent, 2 = filtered percent,
            3 = age percent).
        checkbox: Value of the markers checklist; non-empty means "add markers".
        chosen: Values of all attribute selectors.
        chosen_id: Ids of those selectors, in the same order; each carries the
            column name under "attr".

    Returns:
        list: the figure and the text for the function-call textarea.

    Raises:
        PreventUpdate: when the display value dropdown has been cleared.
    """
    # dcc.Dropdown is clearable by default, and a cleared dropdown reports
    # None; indexing the list below with None would raise outside the try block.
    if display_value is None:
        raise PreventUpdate

    query = form_query(table_40, chosen, [a["attr"] for a in chosen_id])
    groupby = groupby if groupby != "None" else None
    compute_values = ["number", "category_percent", "filtered_percent", "age_percent"]
    display_value = compute_values[display_value]
    # bool() also copes with a checklist value of None.
    markers = bool(checkbox)

    # Fallback shown if computing or plotting fails.
    figure = go.Figure()
    try:
        data = compute_age_based(table_40, query, groupby, filter_result=lower_bound)
        figure = plot_age_based(data, groupby, title, display_value, markers)
    except Exception as e:
        # Best effort: keep the app responsive, flag the failure on the plot
        # and leave the message in the cell output.
        figure.add_annotation(big_annotation("ERROR", "red"))
        print(e)

    function_call = (
        f"data = compute_age_based(table_40, {query=!r}, {groupby=!r}, filter_result={lower_bound!r})\n"
        f"figure = plot_age_based(data, {groupby=!r}, {title=!r}, {display_value=!r}, markers={markers!r})"
    )
    return [figure, function_call]


# `Dash.run` supersedes the deprecated `run_server`. The notebook display mode
# is passed as `jupyter_mode`; a bare `mode=` is not a `Dash.run` parameter and
# would be forwarded to Flask when the code runs outside Jupyter.
app.run(jupyter_mode="inline", port=8053, use_reloader=True, debug=True)