## Initial Draft

### Querying the DB

In [100]:
# Imports
import altair as alt
alt.data_transformers.enable("vegafusion")
import os 
import sqlite3
import polars as pl
import textwrap
import vl_convert

In [2]:
# Data
db_path = os.path.join(os.path.abspath(''), os.path.pardir, 'data', 'ui_stats.db') 
cts_st = """
SELECT ui_cts.st as st, ui_cts.dt_m as dt_m, ui_cts.dt_y as dt_y, ct_wks_12mo, ct_u3_12mo, unemp.rt_recip as rt_recip
FROM ui_cts 
JOIN unemp ON ui_cts.st = unemp.st AND ui_cts.dt_m = unemp.dt_m AND ui_cts.dt_y = unemp.dt_y
WHERE ui_cts.dt_y > 2006;"""
demos_st = ("SELECT * FROM ui_demos WHERE dt_y > 2006;")

In [3]:
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Execute two queries and extract column names
# Ref to extract column names: https://stackoverflow.com/a/7831685
c.execute(cts_st)
st_cts = c.fetchall()
cols_cts = []
for elem in c.description:
    cols_cts.append(elem[0])

c.execute(demos_st)
st_demos = c.fetchall()
cols_demos = []
for elem in c.description:
    cols_demos.append(elem[0])

conn.close()

In [4]:
# Now convert to Polars object for plotting
st_cts_df = pl.DataFrame(st_cts, schema=cols_cts).with_columns(
    pl.date(year=pl.col('dt_y'),month=pl.col('dt_m'), day = 1).alias('date')
)
st_demos_df = pl.DataFrame(st_demos, schema=cols_demos).with_columns(
    pl.date(year=pl.col("dt_y"), month=pl.col("dt_m"), day=1).alias("date")
)
st_cts_df.head()

  st_cts_df = pl.DataFrame(st_cts, schema=cols_cts).with_columns(
  st_demos_df = pl.DataFrame(st_demos, schema=cols_demos).with_columns(


st,dt_m,dt_y,ct_wks_12mo,ct_u3_12mo,rt_recip,date
str,i64,i64,f64,i64,f64,date
"""WY""",8,2024,2084.224651,8605,0.242,2024-08-01
"""WV""",8,2024,8744.751406,33404,0.262,2024-08-01
"""WI""",8,2024,27543.759557,94546,0.291,2024-08-01
"""WA""",8,2024,59261.320266,189971,0.312,2024-08-01
"""VT""",8,2024,2766.056856,7751,0.357,2024-08-01


In [5]:
# Then create a federal level
fed_cts_df = (
    st_cts_df.group_by('dt_m', 'dt_y').agg(
        pl.exclude('dt_m', 'dt_y', 'date', 'rt_recip').sum(),
        pl.col('date').max()
    )
    .with_columns(
        (pl.col('ct_wks_12mo') / pl.col('ct_u3_12mo')).alias('rt_recip')
    )
)
fed_cts_df.head()

dt_m,dt_y,st,ct_wks_12mo,ct_u3_12mo,date,rt_recip
i64,i64,str,f64,i64,date,f64
12,2023,,1774800.0,6065875,2023-12-01,0.292592
4,2010,,5501400.0,14954429,2010-04-01,0.367875
8,2009,,5301200.0,12683308,2009-08-01,0.417969
4,2014,,2913800.0,10862185,2014-04-01,0.268255
7,2011,,4015000.0,14247547,2011-07-01,0.2818


## Plots
### What is recipiency?
1. (Single line plot) Federal Recipiency rate: This shows the federal recipiency rate for unemployment insurance since 2006, up until 2020. It aims to introduce the concept of recipiency of unemployment insurance and illustrate that not everyone gets benefits.
2. (Binned historgram(?) with line): Initial claims: This plot aims to introduce more information about what's being counted by showing new claims for unemployment insurance on the same graph. It starts to complicate what the data is showing by saying recipiency is just one dimension.
3. (Highlit line plot) State variation line plot: This adds in 51 lines to the plot to show that there's a lot of variation in recipiency rates that goes into the federal average. Again, this aims to reinforce that this isn't just a quick fix, and some areas the problem is more acute than others.
    - Make sure to mark 12/2022 point
4. (Bar) State count: Since everything so far has been in rates, I will show the actual counts of the insured unemployed at a fixed time point (12/2022). This will illustrate that rates are misleading because the vast majority of UI claims actually occur in 3 states: NY, TX, and CA.
5. (Chloropleth sqaures map) Mapping recipiency compared to average across the country to start to introduce geographic variation. It shows that some of the difference is related to different states. This loosely maps onto political leaning, so will be obviously interpretable, but I will not plot political leaning.

### Why does recipiency not make sense?
6. (Line plot, with colored top): Zoom in on COVID and states and show that recipiency jumps way above 100% during COVID. By extending the graph into 2020-2023, the rates fully stop making sense, because more than 100% of eligible people start receiving benefits.
7. (High density rates): Policy choices are relevant, if you drastically expand the program, more people get benefits.
8. (Faceted, grid): Facet charts, who are we helping.

### Archive
6. (Bar and line plot) Calculation of rates. This aims to visualize how the recipiency rate is calculated by showing a line of the number of insured unemployed and the U-3 unemployed counts at the federal level. It takes the mean value across a time point of the U-3 rate and then divides a different time point to generate a range, resulting in a percentage.
7. (Line graph): Show the states lines again in a different color, using the Bell et. al., 2021 preferred metric to better show what happens. I'll include a colored dot at the end (just 12/2022) to highlight the point in Chart 8.
8. (Lollipop plots showing change): Alternative formulation lollipop for each state, showing the change at 12/2022 between the two metrics. 


In [11]:
fed_cts_df_lt2020 = fed_cts_df.filter(
    pl.col("date") < pl.date(year=2020, month=1, day=1)
)
st_cts_df_lt2020 = st_cts_df.filter(
    pl.col("date") < pl.date(year=2020, month=1, day=1)
)

In [163]:
# Create custom theme:
def unemp_theme():
    return {
        "config": {
            # Fix grid lines
            "view": {"stroke": "transparent", "width": 800, "heigh": 300},
            "axis": {
                "labelFont": "Helvetica",
                "titleFont": "Helvetica",  # Set font to Helvetica
            },
            "axisY": {
                "domain": False,
                "ticks": True,
                "grid": True,
                "gridDash": [3, 3],
                "tickColor": "#33333320",  # 20% opacity
                "gridColor": "#33333320",
            },
            "axisX": {"grid": False},
            "title": {"font": "Helvetica"},
            "legend": {
                "labelFont": "Helvetica",
                "titleFont": "Helvetica",
            },
            # Set up a custom palette, constant saturation
            "range": {
                "category": [
                    "#0076BF",  # DOL Strong Blue
                    "#BF4E00",  # Split Complimentary Orange
                    "#6B4D23",  # Complimentary Brown
                    "#BF8700",  # Split Complimentary Yellow
                    "#203440",  # "Darkened Blue"
                ]
            },
            "title": {
                "anchor": "start",
                "orient": "top",
                "offset": 20,
            },
        }
    }

alt.themes.register("unemp_theme", unemp_theme)
alt.themes.enable("unemp_theme")

ThemeRegistry.enable('unemp_theme')

In [200]:
# 1. Federal recipiency rate
c1 = (
    alt.Chart(
        fed_cts_df_lt2020,
        title=alt.Title(
            "Recipiency is how the Department of Labor measures what percentage of unemployed workers receive unemployment insurance.",
            subtitle="Unemployment insurance recipiency, annual moving average: 2006-2019",
        ),
    )
    .mark_line(strokeWidth=3)
    .encode(
        alt.X("date:T").title(None),
        # Ref: https://stackoverflow.com/a/62282675
        alt.Y("rt_recip", scale=alt.Scale(domain=[0, 1]))
        .axis(format="%")
        .title("Recipiency rate"),
    )
)

# TODO: Add shaded block for the recession
c1
c1.save("C:/Users/micha/Documents/CAPP/CAPP-30239/CAPP-30239-Static/src/../static_draft/c1.svg")

In [135]:
# C2 - What does it mean to receive UI
# 1. Not apply
# 2. Exhaust at 26
# 3. Exhaust at 12
# 4. Some missingness
elig_y = []
elig_x = []
ic_y = []
ic_x = []
claimed_y = []
claimed_x = []
for y in range(1, 5):
    for x in range(1, 27):

        # Skip Florida > 12 weeks
        if y == 3 and x > 12:
            continue

        # Always eligible if not in Florida
        elig_y.append(y) # Always has a y value
        elig_x.append(x) 

        # Skip no initial claim filer
        if y == 1:
            continue
        
        ic_y.append(y)
        ic_x.append(x)
        
        # Remove missed weeks
        if y == 2:
            if x in (1, 4, 5, 13, 16):
                continue

        # Append claimed
        
        claimed_y.append(y)
        claimed_x.append(x)

# Load data frame
elig_df = pl.DataFrame({"y": elig_y, "elig": elig_x})
ic_df = pl.DataFrame({"y": ic_y, "ic": ic_x})
claimed_df = pl.DataFrame({"y": claimed_y, "claimed": claimed_x})

In [144]:
# c2: What does it mean to apply?
c2_elig = (
    alt.Chart(
        elig_df,
        title = alt.Title("Filing for UI isn't something you do just once, it's something you have to do every week",
        subtitle= "These are 4 cases of how receiving UI might look different")
    )
    .mark_point(
        size = 100, filled = True, color = "#33333340"
    )
    .encode(
        alt.X('elig').title("Week of Unemployment").axis(domain = False),
        alt.Y('y').title(None).axis(ticks = False, labels = False)
    )
)

c2_ic = (
    alt.Chart(ic_df)
    .mark_point(size=100, filled=False)
    .encode(
        alt.X("ic"),
        alt.Y("y")
    )
)

c2_claimed = (
    alt.Chart(claimed_df)
    .mark_point(size=100, filled=True)
    .encode(
        alt.X("claimed"), 
        alt.Y("y")
    )
)

# Annotation 1
# Ref: https://altair-viz.github.io/gallery/line_chart_with_arrows.html
person_a = alt.layer(
    # Arrow line
    alt.Chart().mark_line(size=1).encode(
        x=alt.datum(2),
        y=alt.datum(3.8),
        x2=alt.datum(3),
        y2=alt.datum(3.5),
        color=alt.ColorValue("#33333380")
    ),
    # Arrow head
    alt.Chart().mark_point(shape="triangle", filled=True, fillOpacity=1).encode(
        x=alt.datum(3),
        y=alt.datum(3.5),
        angle=alt.AngleValue(-30),
        size=alt.SizeValue(50),
        color=alt.ColorValue("#33333380")
    ),
    # Text
    alt.Chart().mark_text(size=10, align="left", baseline="middle").encode(
        x=alt.datum(3.3),
        y=alt.datum(3.5),
        text=alt.datum("A files weekly and receives benefits each week until they exhaust their benefits after 26 weeks")
    ),
)

# Annotation 2
person_b = alt.layer(
    # Arrow line
    alt.Chart()
    .mark_line(size=1)
    .encode(
        x=alt.datum(12),
        y=alt.datum(2.8),
        x2=alt.datum(13),
        y2=alt.datum(2.5),
        color=alt.ColorValue("#33333380"),
    ),
    # Arrow head
    alt.Chart()
    .mark_point(shape="triangle", filled=True, fillOpacity=1)
    .encode(
        x=alt.datum(13),
        y=alt.datum(2.5),
        angle=alt.AngleValue(-30),
        size=alt.SizeValue(50),
        color=alt.ColorValue("#33333380"),
    ),
    # Text
    alt.Chart()
    .mark_text(size=10, align="left", baseline="middle")
    .encode(
        x=alt.datum(13.3),
        y=alt.datum(2.5),
        text=alt.datum(
            "B lives in Florida, which only provides 12 weeks of benefits"
        ),
    ),
)

# Annotation 2
person_c = alt.layer(
    # Arrow line
    alt.Chart()
    .mark_line(size=1)
    .encode(
        x=alt.datum(4),
        y=alt.datum(1.8),
        x2=alt.datum(4.7),
        y2=alt.datum(1.5),
        color=alt.ColorValue("#33333380"),
    ),
    # Arrow head
    alt.Chart()
    .mark_point(shape="triangle", filled=True, fillOpacity=1)
    .encode(
        x=alt.datum(4.7),
        y=alt.datum(1.5),
        angle=alt.AngleValue(-30),
        size=alt.SizeValue(50),
        color=alt.ColorValue("#33333380"),
    ),
    # Text
    alt.Chart()
    .mark_text(size=10, align="left", baseline="middle")
    .encode(
        x=alt.datum(5.1),
        y=alt.datum(1.5),
        text=alt.datum("C doesn't file their weekly certification week 4, and therefore doesn't receve benefits"),
    ),
)

# Annotation 4
person_d = alt.layer(
    # Arrow line
    alt.Chart()
    .mark_line(size=1)
    .encode(
        x=alt.datum(1),
        y=alt.datum(0.8),
        x2=alt.datum(1.6),
        y2=alt.datum(0.5),
        color=alt.ColorValue("#33333380"),
    ),
    # Arrow head
    alt.Chart()
    .mark_point(shape="triangle", filled=True, fillOpacity=1)
    .encode(
        x=alt.datum(1.6),
        y=alt.datum(0.5),
        angle=alt.AngleValue(-30),
        size=alt.SizeValue(50),
        color=alt.ColorValue("#33333380"),
    ),
    # Text
    alt.Chart()
    .mark_text(size=10, align="left", baseline="middle")
    .encode(
        x=alt.datum(2.0),
        y=alt.datum(0.5),
        text=alt.datum(
            "D never files an initial claim"
        ),
    ),
)

c2_elig + c2_ic + c2_claimed + person_a + person_b + person_c + person_d

In [145]:
# c3: Include states
c3_fed = (
    alt.Chart(
        fed_cts_df,
        title=alt.Title(
            "The federal average hides a lot of variation between states",
            subtitle="Unemployment insurance recipiency by state, annual moving average: 2006-2019",
            anchor="start",
            orient="top",
            offset=10,
        ),
    )
    .mark_line(strokeWidth = 3)
    .encode(
        alt.X("date:T").title(None),
        alt.Y("rt_recip", scale=alt.Scale(domain=[0, 1]))
        .axis(format="%")
        .title("Recipiency rate"),
    )
    .transform_filter(
        alt.FieldLTPredicate(field="date", lt=alt.DateTime(year=2020, month=1, day=1))
    )
)
c3_st = (
    alt.Chart(st_cts_df)
    .mark_line(color="#33333340", strokeWidth = 1)
    .encode(
        alt.X("date:T").title(None),
        alt.Y("rt_recip", scale=alt.Scale(domain=[0, 1])),
        # Detail approach from https://github.com/vega/altair/issues/985
        detail = 'st'
    ).transform_filter(
        alt.FieldLTPredicate(field="date", lt=alt.DateTime(year=2020, month=1, day=1))
    )
)
c3 = c3_fed + c3_st
c3
c3.save(
    "C:/Users/micha/Documents/CAPP/CAPP-30239/CAPP-30239-Static/src/../static_draft/c3.svg"
)

In [51]:
# c4: Bar chart, not every state is created equal
c4_u3 = (
    alt.Chart(
        st_cts_df,
        title=alt.Title(
            "Some states also have a larger labor market and then have an outsize influence on recipiency",
            subtitle="Average claims per week 2019",
        ),
    )
    .mark_bar(color="#33333360")
    .encode(
        alt.X("st:N").title("State").sort(field = 'ct_u3_12mo'),
        alt.Y("ct_u3_12mo:Q").title("Count"),
    )
)

c4_ui = (
    alt.Chart(st_cts_df)
    .mark_bar()
    .encode(
        alt.X("st:N").title("State").sort(field="ct_u3_12mo"),
        alt.Y("ct_wks_12mo:Q").title("Count"),
    )
)

c4 = (c4_u3 + c4_ui).transform_filter(
    (alt.datum.dt_y == 2019) & (alt.datum.dt_m == 12)
)
c4

In [164]:
# 5 - Include COVID
c5_st = (
    alt.Chart(
        st_cts_df,
        title=alt.Title(
            "This picture gets complicated. States had over 100 percent recipiency during the COVID-19 pandemic due to loosened eligibility requirements",
            subtitle="Recipiency rate by state: 2006-2024, annually smoothed",
        ),
    )
    .mark_line(color="#33333340", strokeWidth=1)
    .encode(
        alt.X("date:T").title(None),
        alt.Y("rt_recip", scale=alt.Scale(domain=[0, 1.5])).title('Recipinecy Rate').axis(format="%"),
        # Detail approach from https://github.com/vega/altair/issues/985
        detail="st",
    )
)

# Ref for highlighting: https://altair-viz.github.io/gallery/bar_chart_with_single_threshold.html
threshold = 1
st_highlight = (
    alt.Chart(st_cts_df)
    .mark_line(color="#BF4E00")
    .encode(
        alt.X("date:T").title(None),
        alt.Y("rt_recip"),
        detail="st"
    )
    .transform_filter(alt.datum.rt_recip > threshold)
)

# Add in line
rule = (
    alt.Chart()
    .mark_rule(color="#203440", strokeWidth=2)
    .encode(y=alt.Y(datum=threshold))
)
label = rule.mark_text(
    x="width",
    dx=-2,
    align="right",
    baseline="bottom",
    text="More than 100% recipiency",
    color="#BF4E00",
)

c5 = c5_st + st_highlight + rule + label
c5.save(
    "C:/Users/micha/Documents/CAPP/CAPP-30239/CAPP-30239-Static/src/../static_draft/c5.svg"
)
c5

In [198]:
# c6. Ridgeline plot for recipiency
# Ref: https://altair-viz.github.io/gallery/ridgeline_plot.html
step = 20
overlap = 1

st_pa = st_cts_df.filter(col.st_pa)

c6 = (
    alt.Chart(st_pa)
    .transform_filter(
        (alt.datum.st == "PA")
        | (alt.datum.st == "IL")
        | (alt.datum.st == "NJ")
        | (alt.datum.st == "TX")
        | (alt.datum.st == "CA")
        | (alt.datum.st == "NY")
    )  # TODO: Fix to preprocess
    .transform_density("rt_recip", as_=["rt_recip", "density"], groupby=["st"])
    .mark_area(
        interpolate="monotone", fillOpacity=0.8, stroke="lightgray", strokeWidth=0.5
    )
    .encode(
        alt.X("rt_recip:Q").title("Recipiency"),
        alt.Y("density:Q").axis(None),
        alt.Fill("mean_temp:Q").legend(None)
    )
    .facet(
        row=alt.Row("st:N")
        .title(None)
        .header(labelAngle=0, labelAlign="left", format="%B")
    )
    .properties(title="Recipiency Rate", bounds="flush")
    .configure_facet(spacing=0)
)
c6

In [None]:
# 7 - Actual heatmap
# Ref for handling GEOJSON: https://stackoverflow.com/questions/67283970/altair-choropleth-adding-values-associated-with-each-county-to-the-map

# Found a GeoJSON by Googling
states = alt.topo_feature("https://cdn.jsdelivr.net/npm/us-atlas@3/states-10m.json", "states")

c7 = (
    alt.Chart(states)
    .mark_geoshape()
    .transform_lookup(
        lookup="properties.Name:N",
        from_=alt.LookupData(st_cts_df, "st_name", ["rt_recip"])
    )
    .encode(color="rt_recip:Q")
    .project(type="albersUsa")
)

In [None]:
# 8 - >

### Archive

In [9]:
# IF IC and CC 
 c2_ic = alt.Chart(
        fed_cts_df,
        title=alt.Title(
            "Just because you have a valid claim for unemployment, doesn't mean you are actually receiving unemployment",
            subtitle="Initial claims that recently unemployed people make make you eligible to collect benefits for 26 weeks or so, but not everyone does each week",
            anchor="start",
            orient="top",
            offset=10,
        ),
    ).mark_bar(
        color= "red"
    ).encode(
        alt.X("date:T").title("Month"),
        alt.Y("ct_ic").title("Claims"),
    )

c2_cc = alt.Chart(fed_cts_df).mark_bar(
    ).encode(
        alt.X("date:T").title("Month"),
        alt.Y("ct_wks").title("Claims"),
    )

c2 = (c2_cc + c2_ic).transform_filter(
        alt.FieldLTPredicate(field="date", lt=alt.DateTime(year=2020, month=1, day=1))
    ).properties(
        width=800, height=300
    )

# Fix bar transparency and width
# Add annotation on 12/2019

c2

IndentationError: unexpected indent (103868588.py, line 2)

In [199]:
# 6 - Show recipiency by state in high density heatmap
# Exact reference: https://altair-viz.github.io/gallery/lasagna_plot.html
# TODO: Not sure what this does
color_condition = alt.condition(
    "month(datum.value) == 1 && date(datum.value) == 1",
    alt.value("black"),
    alt.value(None),
)

c6 = (
    alt.Chart(
        st_cts_df,
        title=alt.Title(
            "Let's look back at the states with the highest unemployment volume",
            subtitle="Recipiency rate by state: 2006-2023, annually smoothed",
        ),
    )
    .mark_rect()
    .encode(
        alt.X("date").axis(
            format="%Y",
            labelAngle=0,
            labelOverlap=False,
        ),
        alt.Y("st:N").title(None).axis(grid=False).sort("ascending"),
        # Syntax reference for scale: https://stackoverflow.com/q/70295909
        alt.Color(
            "rt_recip:Q",
            scale=alt.Scale(
                domain=[0, 0.5, 1.5], range=["white", "#0076BF", "#BF4E00"]
            ),
        ).title("Recipiency rate"),
    )
    .transform_filter(
        (alt.datum.st == "PA")
        | (alt.datum.st == "IL")
        | (alt.datum.st == "NJ")
        | (alt.datum.st == "TX")
        | (alt.datum.st == "CA")
        | (alt.datum.st == "NY")
    )  # TODO: Fix to preprocess
)

c6.save(
    "C:/Users/micha/Documents/CAPP/CAPP-30239/CAPP-30239-Static/src/../static_draft/c6.svg"
)
c6