In [1]:
import polars as pl
import altair as alt
from vega_datasets import data
import pandas as pd

In [None]:
# Use Altair's "opaque" theme for every chart rendered below.
alt.themes.enable("opaque")
# Route chart data through the "vegafusion" data transformer.
alt.data_transformers.enable("vegafusion")

How many H1B petitioners are there for each year? What sectors are they in?

In [None]:
# Read both USCIS approval workbooks, then stack the 2021-2024 rows
# underneath the 2017-2020 rows to get one frame for all years.
uscis_petitioners_2017_2020 = pl.read_excel("../data/uscis_h1b_approvals_2017_2020.xlsx")
uscis_petitioners_2021_2024 = pl.read_excel("../data/uscis_h1b_approvals_2021_2024.xlsx")
uscis_petitioners = uscis_petitioners_2017_2020.vstack(
    uscis_petitioners_2021_2024
)
uscis_petitioners

In [4]:
# Clean data: map the raw spreadsheet headers to short snake_case names.
# Note that the "Fiscal Year   " key carries three trailing spaces.
uscis_column_names = {
    "Line by line": "id",
    "Fiscal Year   ": "fiscal_year",
    "Employer (Petitioner) Name": "name",
    "Tax ID": "tax_id",
    "Industry (NAICS) Code": "naics_code",
    "Petitioner City": "city",
    "Petitioner State": "state",
    "Petitioner Zip Code": "zip",
    "Initial Approval": "initial_approval",
    "Initial Denial": "initial_denial",
    "Continuing Approval": "continuing_approval",
    "Continuing Denial": "continuing_denial",
}
uscis_petitioners = uscis_petitioners.rename(uscis_column_names)

In [None]:
# Check whether the USCIS data lists the same employer more than once per fiscal year
employer_year_keys = ["fiscal_year", "name"]

# Number of distinct (fiscal_year, name) pairs (author's note: 384364).
# This is not the last expression of the cell, so the value is computed but not displayed.
uscis_petitioners.n_unique(subset=employer_year_keys)

# Rows per (fiscal_year, name) pair
petitioners_count = (
    uscis_petitioners
    .group_by(employer_year_keys)
    .len(name="employers_count")
)
petitioners_count  # author's note: there are indeed duplicates

In [None]:
# Employers by state and NAICS code, restricted to fiscal year "2024"
employer_year_counts = uscis_petitioners.group_by("fiscal_year", "name").len()
unique_employers = employer_year_counts.filter(pl.col("fiscal_year") == "2024")

# Bring state and NAICS code back in; an employer with several source rows
# contributes one joined row per source row.
employer_attributes = uscis_petitioners.select("fiscal_year", "name", "state", "naics_code")
unique_employers = unique_employers.join(
    employer_attributes,
    on=["fiscal_year", "name"],
    how="left",
)

(
    unique_employers
    .group_by("state", "naics_code")
    .len()
    .sort("len", descending=True)
)

In [7]:
# Construct a new df with one row per (fiscal_year, employer name).
# keep="first" together with maintain_order=True makes the surviving row, and
# therefore the NAICS code and sector assigned to the employer, the same on
# every run; the default keep="any" leaves that choice unspecified.
petitioners_df = uscis_petitioners.unique(
    subset=["fiscal_year", "name"],
    keep="first",
    maintain_order=True,
)

# Label each employer with a sector. The branches are tested top to bottom and
# the first NAICS fragment found in `naics_code` wins; anything that matches
# none of them is labelled "Other".
petitioners_df = petitioners_df.with_columns(
    pl.when(pl.col("naics_code").str.contains("54"))
    .then(pl.lit("Science & Tech"))
    .when(pl.col("naics_code").str.contains("31-33"))
    .then(pl.lit("Manufacturing"))
    .when(pl.col("naics_code").str.contains("62"))
    .then(pl.lit("Healthcare"))
    .when(pl.col("naics_code").str.contains("52"))
    .then(pl.lit("Finance & Insurance"))
    .when(pl.col("naics_code").str.contains("51"))
    .then(pl.lit("Information"))
    .when(pl.col("naics_code").str.contains("61"))
    .then(pl.lit("Education"))
    .when(pl.col("naics_code").str.contains("42"))
    .then(pl.lit("Wholesale Trade"))
    .otherwise(pl.lit("Other"))
    .alias("sector")
)

In [None]:
# There are many sectors, so rank the NAICS codes by number of unique employers
# in fiscal year "2024"; only the most common ones get their own label in the chart.
naics_counts = petitioners_df.group_by(["fiscal_year", "naics_code"]).len()
top_naics_by_petitioners = (
    naics_counts
    .filter(pl.col("fiscal_year") == "2024")
    .sort("len", descending=True)
)
top_naics_by_petitioners.head(10)

In [None]:
# Stacked area chart: count of unique employers per fiscal year, coloured by sector
employers_x = alt.X("fiscal_year", type="nominal")
employers_y = alt.Y(aggregate="count", type="quantitative", title="employers_count")
employers_color = alt.Color(
    "sector",
    scale=alt.Scale(scheme='tableau10'),
    legend=alt.Legend(title="Sector", labelLimit=300),
)

(
    alt.Chart(petitioners_df)
    .mark_area()
    .encode(employers_x, employers_y, employers_color)
    .properties(
        title=alt.Title("Employers filing for H1B petitions (2017-2024)", fontSize=12),
        width=150,
        height=300,
    )
    .configure(background='#F6F5F4')
)


What about the number of H1B filings?

In [None]:
# Stacked bar chart of cap-subject H1B filings per fiscal year.
# Every year gets the same 85000 "cap" value, which is reported as "Selected";
# whatever remains of `count` is reported as "Not Selected".
uscis_filings = (
    pl.read_excel("../data/uscis_h1b_eligible_registration_2017_2020.xlsx")
    .with_columns(pl.lit(85000).alias("cap"))
    .with_columns((pl.col("count") - pl.col("cap")).alias("not_selected"))
    .rename({"cap": "Selected", "not_selected": "Not Selected"})
)

# Long format: one row per (fiscal_year, status) so the two statuses can be stacked
uscis_filings_long = uscis_filings.unpivot(
    index="fiscal_year",
    on=["Selected", "Not Selected"],
)

filings_status_scale = alt.Scale(
    domain=["Selected", "Not Selected"],
    range=["#32565D", "#698996"],
)

(
    alt.Chart(uscis_filings_long)
    .mark_bar(color="#384d26")
    .encode(
        alt.X("fiscal_year", type="nominal", title="Fiscal Year"),
        alt.Y("value", type="quantitative", title="Total cap-subjected filings"),
        alt.Color("variable:N", sort="ascending", title="Filings status",
                  scale=filings_status_scale),
        alt.Order("value"),
    )
    .properties(
        title=alt.Title("H1B filings received by USCIS (2017-2024)", fontSize=12),
        width=150,
        height=300,
    )
    .configure(background='#F6F5F4')
)

Now we dive into the data at the individual level. We examine the 2023 H1B lottery data (filings that are subject to the cap), because the 2024 data might be distorted by the large number of multiple filings and fraud claims.

In [11]:
# Load the ISO country-code table and shorten the two column names used later on
country_code = (
    pl.read_csv("../data/wikipedia-iso-country-codes.csv")
    .rename({
        "English short name lower case": "country_name",
        "Alpha-3 code": "a3_code",
    })
)

In [12]:
def process_bloomberg_data(filepath, country_codes=None):
    """Load one Bloomberg H1B records CSV and return a cleaned polars DataFrame.

    Steps, in order:
      1. read the CSV (``ignore_errors=True``), keep the columns of interest
         and rename them to snake_case;
      2. write the employer state into ``employer_state_corrected`` with "TS"
         replaced by "TN", then drop ``employer_state``;
      3. keep the first five characters of the employer zip in
         ``corrected_employer_zip``, then drop ``employer_zip``;
      4. cast ``worksite_zip`` to string and left-pad it with zeros to five
         characters;
      5. add ``updated_worksite_zip``, in which a handful of zips are swapped
         for a substitute zip and all others are copied unchanged;
      6. left-join the country table (``nationality`` == ``a3_code``) to add
         ``country_name``.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    country_codes : polars.DataFrame, optional
        Table with ``a3_code`` and ``country_name`` columns. When omitted, the
        notebook-level ``country_code`` frame is used, so that frame must
        already be defined.

    Returns
    -------
    polars.DataFrame
        The cleaned records.
    """
    if country_codes is None:
        # Backward-compatible default: the frame built earlier in the notebook.
        country_codes = country_code

    kept_columns = [
        "country_of_nationality", "ben_year_of_birth", "gender", "employer_name",
        "FEIN", "state", "zip", "lottery_year", "status_type", "rec_date",
        "FIRST_DECISION", "ben_multi_reg_ind", "first_decision_date",
        "BEN_CURRENT_CLASS", "REQUESTED_CLASS", "BASIS_FOR_CLASSIFICATION",
        "JOB_TITLE", "WORKSITE_STATE", "WORKSITE_ZIP", "BEN_PFIELD_OF_STUDY",
        "BEN_COMP_PAID", "WAGE_AMT", "WAGE_UNIT", "S3Q1", "DOT_CODE", "NAICS_CODE",
    ]
    renamed_columns = {
        "country_of_nationality": "nationality",
        "ben_year_of_birth": "birthyear",
        "state": "employer_state",
        "zip": "employer_zip",
        "FEIN": "fein",
        "NAICS_CODE": "naics_code",
        "lottery_year": "fiscal_year",
        "FIRST_DECISION": "first_decision",
        "BEN_CURRENT_CLASS": "current_class",
        "REQUESTED_CLASS": "requested_class",
        "BASIS_FOR_CLASSIFICATION": "filing_type",
        "JOB_TITLE": "job_title",
        "WORKSITE_STATE": "worksite_state",
        "WORKSITE_ZIP": "worksite_zip",
        "BEN_PFIELD_OF_STUDY": "field_of_study",
        "DOT_CODE": "dot_code",
        "BEN_COMP_PAID": "base_income",
        "WAGE_AMT": "wage_amount",
        "WAGE_UNIT": "wage_unit",
        "S3Q1": "education_level",
    }
    # Zips that, per the author's note, have over 30 filings but are missing
    # from the zipcodes crosswalk database; each is swapped for a substitute zip.
    worksite_zip_overrides = {
        "94085": "94086",
        "27617": "27619",
        "75254": "75253",
        "75033": "75034",
        "94158": "94159",
        "75071": "75070",
        "95391": "95304",  # closest city nearby; newly incorporated city (July 2024)
        "48033": "48037",
    }

    df = pl.read_csv(filepath, ignore_errors=True)
    df = df.select(kept_columns).rename(renamed_columns)

    # Incorrect state in data file. Modify TS to TN
    df = df.with_columns(
        pl.when(pl.col("employer_state") == "TS")
        .then(pl.lit("TN"))
        .otherwise(pl.col("employer_state"))
        .alias("employer_state_corrected")
    ).drop("employer_state")

    # Keep only the first five characters of the employer zip
    df = df.with_columns(
        pl.col("employer_zip").str.slice(0, 5).alias("corrected_employer_zip")
    ).drop("employer_zip")

    # Pad zip with 0
    df = df.with_columns(pl.col("worksite_zip").cast(pl.Utf8).str.zfill(5))

    # Values not listed in the mapping (including nulls) pass through unchanged.
    df = df.with_columns(
        pl.col("worksite_zip")
        .replace(worksite_zip_overrides)
        .alias("updated_worksite_zip")
    )

    df = df.join(
        country_codes.select("a3_code", "country_name"),
        how="left",
        left_on="nationality",
        right_on="a3_code",
    )

    return df

In [13]:
# Load 2021-2024 USCIS data obtained by Bloomberg.
# Files are processed in the order listed; 2024 is read from two separate files.
(
    beneficiaries_21,
    beneficiaries_22,
    beneficiaries_23,
    beneficiaries_24_single,
    beneficiaries_24_multi,
) = [
    process_bloomberg_data(csv_path)
    for csv_path in (
        "../data/bloomberg_h1b_records_2021.csv",
        "../data/bloomberg_h1b_records_2022.csv",
        "../data/bloomberg_h1b_records_2023.csv",
        "../data/bloomberg_h1b_records_2024_singl.csv",
        "../data/bloomberg_h1b_records_2024_multi.csv",
    )
]

Location-wise, where are the employers and where are the H1B workers?

In [None]:
# Zip-code table, with zip_code as a zero-padded five-character string
zipcodes = pl.read_csv("../data/zipcodes.csv").with_columns(
    pl.col("zip_code").cast(pl.Utf8).str.zfill(5)
)

# Attach the zip-code attributes to each 2023 record through its (updated) worksite zip
beneficiaries_23_worksite = beneficiaries_23.join(
    zipcodes,
    left_on="updated_worksite_zip",
    right_on="zip_code",
    how="left",
)

# Omit less than 1% of records where worksite_zip has less than 30 filings and not in DB
has_worksite_zip = ~pl.col("worksite_zip").is_null()
has_latitude = ~pl.col("latitude").is_null()
beneficiaries_23_worksite = beneficiaries_23_worksite.filter(has_worksite_zip & has_latitude)

# Capture the one record that's not on the map
beneficiaries_23_worksite.filter(pl.col("updated_worksite_zip") == "00968")  # Puerto Rico

In [None]:
# Create choropleth US map: states shaded by employer count, worksite dots layered on top
states = alt.topo_feature(data.us_10m.url, feature="states")

# Keep one row per FEIN, then count rows per corrected employer state
beneficiaries_23_bystate = (
    beneficiaries_23
    .unique("fein")
    .group_by("employer_state_corrected")
    .len(name="petitioners_count")
)

# Census state table (downloaded over the network). It provides the `id` that
# the lookup below matches against the `id` of each state shape.
# Reference: https://stackoverflow.com/questions/66892810/using-transform-lookup-for-an-altair-choropleth-figure
pd_state_code = pd.read_csv('https://www2.census.gov/geo/docs/reference/state.txt', sep="|")
pd_state_code.columns = ['id', 'abbr', 'state', 'statens']
pd_state_code = pd_state_code[['id', 'abbr', 'state']]
pl_state_code = pl.from_pandas(pd_state_code)

beneficiaries_23_bystate = beneficiaries_23_bystate.join(
    pl_state_code,
    how="left",
    left_on="employer_state_corrected",
    right_on="abbr",
)

# Black backdrop of all state shapes, drawn underneath the coloured layer
base = alt.Chart(states).mark_geoshape(fill='black', stroke='black', strokeWidth=0.5)

# Named `employer_choropleth` rather than `map` so the Python builtin `map`
# is not shadowed for the rest of the notebook.
employer_choropleth = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color("petitioners_count:Q", scale=alt.Scale(scheme='lightorange'), title="Employers count")
).transform_lookup(
    lookup="id",
    from_=alt.LookupData(beneficiaries_23_bystate, "id", ["petitioners_count"])
).properties(
    width=500,
    height=300
).project("albersUsa")

# One circle per coordinate pair, sized by the number of records at that point
dot = alt.Chart(beneficiaries_23_worksite).mark_circle(color="#3D5149", opacity=0.5).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    tooltip='updated_worksite_zip:N',
    size=alt.Size("count():Q",
                  legend=alt.Legend(title="Selected filings count"),
                  scale=alt.Scale(domain=(1, 2000)))
).project(
    type='albersUsa'
).properties(
    width=500,
    height=300
)

alt.layer(base, employer_choropleth, dot).properties(title="Employers and selected H1B workers worksite density across the US (2023)").configure(background='#F6F5F4')

In [None]:
# States ranked by petitioners_count, largest first
beneficiaries_23_bystate.sort(by="petitioners_count", descending=True)

What about nationality profiling?

In [17]:
def filings_by_nationality(dataset, multifiling, top_ranking):
    """Count filings per (country_name, fiscal_year) and shortlist the largest groups.

    Parameters
    ----------
    dataset : polars.DataFrame
        Records with ``country_name``, ``fiscal_year`` and ``ben_multi_reg_ind`` columns.
    multifiling
        Value that ``ben_multi_reg_ind`` must equal for a record to be counted.
    top_ranking : int
        Number of highest-count rows whose country names are kept as-is.

    Returns
    -------
    polars.DataFrame
        Columns ``country_name``, ``fiscal_year``, ``len`` and
        ``nationality_shortlisted``. The last one is "Others" for countries
        outside the top rows, "South Korea" for names containing
        "Korea, Republic of", and the country name otherwise.
    """
    has_country = ~pl.col("country_name").is_null()
    matches_filing_mode = pl.col("ben_multi_reg_ind") == multifiling
    counts = (
        dataset
        .filter(has_country & matches_filing_mode)
        .group_by(pl.col("country_name", "fiscal_year"))
        .len()
    )

    # Country names carried by the `top_ranking` rows with the highest counts
    leading_rows = counts.select(pl.all().top_k_by("len", top_ranking))
    leading_countries = leading_rows.select("country_name").to_series()

    shortlisted = (
        pl.when(~pl.col("country_name").is_in(leading_countries))
        .then(pl.lit("Others"))
        .when(pl.col("country_name").str.contains("Korea, Republic of"))
        .then(pl.lit("South Korea"))
        .otherwise(pl.col("country_name"))
        .alias("nationality_shortlisted")
    )
    return counts.with_columns(shortlisted)

In [None]:
# Share of 2023 records per country, ten largest shown
# (author's note: 2023 saw people from 185 countries applying for H1B)
(
    beneficiaries_23
    .filter(~pl.col("country_name").is_null())
    .group_by(pl.col("country_name"))
    .len()
    .with_columns((pl.col("len") * 100 / pl.sum("len")).alias("percent"))
    .sort("percent", descending=True)
    .head(10)
)

In [None]:
# Single-filers: pie chart of records by shortlisted nationality (top 6 rows + "Others")
filings_by_nationality_single_23 = filings_by_nationality(beneficiaries_23, 0, 6)

# Fixed country-to-colour assignment for the slices
single_filers_scale = alt.Scale(
    domain=["India", "Others", "China", "Canada", "Mexico", "Philippines", "South Korea"],
    range=["#e45756", "#f58518", "#eeca3b", "#4c78a8", "#b279a2", "#72b7b2", "#54a24b"],
)

(
    alt.Chart(filings_by_nationality_single_23)
    .mark_arc()
    .encode(
        theta="len",
        color=alt.Color("nationality_shortlisted",
                        scale=single_filers_scale,
                        title="Country of Birth"),
    )
    .properties(
        title=alt.Title("Single-filing records by nationality (2023)", fontSize=12),
        height=200,
        width=200,
    )
    .configure(background='#F6F5F4')
)

In [None]:
# Multi-filers: pie chart of records by shortlisted nationality (top 6 rows + "Others")
filings_by_nationality_multiple_23 = filings_by_nationality(beneficiaries_23, 1, 6)

# Fixed country-to-colour assignment for the slices
multi_filers_scale = alt.Scale(
    domain=["India", "China", "Others", "Nepal", "Pakistan", "Philippines", "United Kingdom"],
    range=["#e45756", "#eeca3b", "#f58518", "#bab0ac", "#ff9da6", "#72b7b2", "#9d755d"],
)

(
    alt.Chart(filings_by_nationality_multiple_23)
    .mark_arc()
    .encode(
        theta="len",
        color=alt.Color("nationality_shortlisted",
                        scale=multi_filers_scale,
                        title="Country of Birth"),
    )
    .properties(
        title=alt.Title("Multiple-filing records by nationality (2023)", fontSize=12),
        height=200,
        width=200,
    )
    .configure(background='#F6F5F4')
)

What are the main characteristics of selected applicants at the time of application?

In [None]:
# Education level: single-filing 2023 records whose education_level is "M" or "B"
is_single_filing = pl.col("ben_multi_reg_ind") == 0
is_bachelor_or_master = pl.col("education_level").is_in(["M", "B"])

education_23 = (
    beneficiaries_23
    .filter(is_single_filing & is_bachelor_or_master)
    .group_by("education_level")
    .len(name="filings_count")
    .with_columns(
        # Only "B" and "M" survive the filter above, so "not B" means Master's
        pl.when(pl.col("education_level") == "B")
        .then(pl.lit("Bachelor's"))
        .otherwise(pl.lit("Master's"))
        .alias("degree_type")
    )
)

degree_scale = alt.Scale(
    domain=["Bachelor's", "Master's"],
    range=["#DB7F67", "#DBBEA1"],
)

(
    alt.Chart(education_23)
    .mark_bar()
    .encode(
        alt.Y("degree_type:N", title="Education level"),
        alt.X("filings_count:Q", title="Count of Filings"),
        color=alt.Color("degree_type:N", scale=degree_scale, legend=None),
    )
    .properties(title=alt.Title("Single-filings by education level (2023)", fontSize=12))
    .configure(background='#F6F5F4')
)

In [None]:
# Age and gender distribution of single-filing 2023 records.
# Age is approximated as fiscal_year minus birthyear.
selected_by_age_23 = (
    beneficiaries_23
    .filter(pl.col("ben_multi_reg_ind") == 0)
    .with_columns((pl.col("fiscal_year") - pl.col("birthyear")).alias("age"))
)

# Only ages 10 through 70 (inclusive) are charted
ages_10_to_70 = selected_by_age_23.filter(pl.col("age").is_in(range(10, 71)))

gender_scale = alt.Scale(
    domain=["female", "male"],
    range=["#ff9da6", "#79706e"],
)

(
    alt.Chart(ages_10_to_70)
    .mark_bar()
    .encode(
        alt.X("age:Q", bin=alt.Bin(maxbins=15), scale=alt.Scale(domain=[10, 70]), title="Age Group"),
        alt.Y("count():Q", title="Records count"),
        color=alt.Color("gender:N", title="Gender", scale=gender_scale),
    )
    .properties(title=alt.Title("Single-filing records by gender and age (2023)", fontSize=12))
    .configure(background='#F6F5F4')
)

In [None]:
# Income distribution: prepare the 2023 records that carry usable wage information
pl.Config.set_fmt_str_lengths(80)

# Read in job code data file
job_code = pl.read_excel("../data/i129_job_codes.xlsx").rename({
    "Occupation Category": "category",
    "Occupation Code": "dot_code",
    "Occupation Description": "occupation",
})

# A record is kept when at least one of the two wage columns is present
both_wages_missing = pl.col("base_income").is_null() & pl.col("wage_amount").is_null()

# Filter out NA values and values likely due to misinput
plausible_record = (
    (pl.col("base_income") > 0) &
    (pl.col("wage_amount") > 0) &
    (pl.col("base_income") < 800000) &
    (pl.col("age") > 18) &
    (~pl.col("category").is_null())
)

selected_income_23 = (
    beneficiaries_23
    .filter(~both_wages_missing)
    # zero-pad dot_code to three characters before joining on it
    .with_columns(pl.col("dot_code").cast(pl.Utf8).str.zfill(3))
    .join(job_code, how="left", on="dot_code")
    .with_columns((pl.col("fiscal_year") - pl.col("birthyear")).alias("age"))
    .filter(plausible_record)
)

# Clean data: convert `wage_amount` to a yearly figure according to `wage_unit`.
# Each branch only fires when the amount is plausible for its unit; branches are
# tested top to bottom, and rows that match none fall back to `base_income`.
# Range checks use is_between (half-open, upper bound excluded) instead of
# is_in(range(...)), which built a very large membership list and could only
# ever match whole-number amounts.
selected_income_23 = selected_income_23.with_columns(pl
                             .when((pl.col("wage_unit") == "HOUR") & (pl.col("wage_amount") < 100))
                             .then(pl.col("wage_amount") * 2080)   # 2080 = 40 hours x 52 weeks
                             .when((pl.col("wage_unit") == "WEEK") & (pl.col("wage_amount") < 4000))
                             .then(pl.col("wage_amount") * 52)
                             .when((pl.col("wage_unit") == "MONTH") & (pl.col("wage_amount") < 16000))
                             .then(pl.col("wage_amount") * 12)
                             .when((pl.col("wage_unit") == "YEAR") &
                                   (pl.col("wage_amount").is_between(20000, 800000, closed="left")))
                             .then(pl.col("wage_amount"))
                             # "YEAR" with an amount under 100 is handled like an hourly rate
                             .when((pl.col("wage_unit") == "YEAR") & (pl.col("wage_amount") < 100))
                             .then(pl.col("wage_amount") * 2080)
                             .otherwise(pl.col("base_income"))
                             .alias("computed_yearly_wage")
)

# Larger and smaller of the two yearly figures available for each record
selected_income_23 = selected_income_23.with_columns(max=pl.max_horizontal("computed_yearly_wage", "base_income"),
                             min=pl.min_horizontal("computed_yearly_wage", "base_income"))

# Take the larger figure when the computed wage is in [0, 300000) and the larger
# figure is under 400000; otherwise take the smaller one. Records that still end
# up under 20000 are dropped (author's note: 13 observations with irreconcilable
# typos in both wage columns).
selected_income_23 = selected_income_23.with_columns(pl
                             .when((pl.col("computed_yearly_wage").is_between(0, 300000, closed="left")) &
                                   (pl.col("max") < 400000))
                             .then(pl.col("max"))
                             .otherwise(pl.col("min"))
                             .alias("final_yearly_income")
                             ).sort("final_yearly_income", descending=True).filter(pl.col("final_yearly_income") >= 20000)

# Histogram of final yearly income.
# Incomes at or above this limit are left out of the chart; the footnote below is
# built from the same value so the two cannot drift apart.
income_chart_limit = 260000

base = alt.Chart(
    selected_income_23.filter(pl.col("final_yearly_income") < income_chart_limit)
).mark_bar(color="#32565D").encode(
    # x carries the binned income; labels are divided by 1000, hence "thousand USD"
    x=alt.X("final_yearly_income", type="quantitative",
            bin=alt.Bin(maxbins=30),
            title="Yearly income (thousand USD)",
            axis=alt.Axis(labelExpr="datum.value / 1000")),
    # y carries the number of records per bin, encoded as a quantitative count
    y=alt.Y("count():Q", title="Count of filings")
).properties(
    height=200,
    width=450,
    title=alt.Title("Yearly income reported for selected filings (2023)", fontSize=12)
)

# Wrap the chart so a small footnote can be attached at the bottom right
alt.concat(base, background="#F6F5F4").properties(title=alt.TitleParams(
    f"Yearly incomes of ${income_chart_limit:,} and above are not shown in chart.",
    baseline='bottom',
    orient='bottom',
    anchor='end',
    fontWeight='normal',
    fontSize=7))


In [None]:
# Ten highest final yearly incomes, with job title and a summarized field of study
summarized_field = (
    pl.when(pl.col("field_of_study").str.contains_any(["BUSINESS", "MANAGEMENT"]))
    .then(pl.lit("BUSINESS & MANAGEMENT"))
    .when(pl.col("field_of_study").str.contains("ELECTRONIC"))
    .then(pl.lit("ELECTRICAL ENGINEERING"))
    .otherwise(pl.col("field_of_study"))
    .alias("summarized_field_of_study")
)

top_table = (
    selected_income_23
    .sort("final_yearly_income", descending=True)
    .head(10)
    .select("job_title", "field_of_study", "final_yearly_income")
    .with_columns(summarized_field)
    .select("summarized_field_of_study", "job_title", "final_yearly_income")
)

top_table

In [None]:
# Box plot of final yearly income per occupation category ("Miscellaneous" left out)
income_by_category = selected_income_23.filter(pl.col("category") != "Miscellaneous")

(
    alt.Chart(income_by_category)
    .mark_boxplot(extent='min-max', color="#32565D")
    .encode(
        alt.X("category", type="nominal",
              axis=alt.Axis(labelAngle=-45, labelLimit=350),
              title="Occupation category"),
        # axis labels are divided by 1000 to match the "thousand USD" title
        alt.Y("final_yearly_income:Q",
              title="Reported yearly income (thousand USD)",
              axis=alt.Axis(labelExpr="datum.value / 1000")),
    )
    .properties(
        width=500,
        height=300,
        title=alt.Title("Yearly income reported by Occupation Category (2023)", fontSize=12),
    )
    .configure(background="#F6F5F4")
)

What are an individual's chances?

In [26]:
def _chance_columns(df):
    """Reduce one processed beneficiaries frame to the columns used for chance analysis.

    The helper has its own name so that it is not overwritten by the
    ``chances_df`` DataFrame assembled below.

    Parameters
    ----------
    df : polars.DataFrame
        Frame with ``status_type``, ``nationality``, ``fiscal_year``,
        ``ben_multi_reg_ind``, ``current_class`` and ``first_decision`` columns.

    Returns
    -------
    polars.DataFrame
        Columns ``nationality``, ``fiscal_year``, ``status``,
        ``ben_multi_reg_ind``, ``current_class`` and ``first_decision``, where
        ``status`` is ``status_type`` with "CREATED" reported as "ELIGIBLE".
    """
    # Drop rows whose status is the "(b)(3) (b)(6) (b)(7)(c)" placeholder
    # (presumably a redaction marker - TODO confirm). Rows with a null
    # status_type are dropped by this filter as well.
    df = df.filter(pl.col("status_type") != "(b)(3) (b)(6) (b)(7)(c)")
    df = df.with_columns(pl
                    .when(pl.col("status_type") == "CREATED")
                    .then(pl.lit("ELIGIBLE"))
                    .otherwise(pl.col("status_type"))
                    .alias("status")
    )

    df = df.select("nationality", "fiscal_year", "status", "ben_multi_reg_ind", "current_class", "first_decision")
    return df

# Create complete df for chance analysis
chances_24_single = _chance_columns(beneficiaries_24_single)
chances_24_multi = _chance_columns(beneficiaries_24_multi)

chances_df = pl.concat([_chance_columns(beneficiaries_21),
                        _chance_columns(beneficiaries_22),
                        _chance_columns(beneficiaries_23),
                        chances_24_single,
                        chances_24_multi])

# Remove ELIGIBLE rows that nevertheless have a first decision
# (author's note: irrelevant, not picked on lottery but still filing)
chances_df = chances_df.filter(~((pl.col("status") == "ELIGIBLE") &
                                 (~pl.col("first_decision").is_null())))

In [27]:
# Keep records with a known multi-registration flag, then count them per
# (fiscal_year, ben_multi_reg_ind, status, first_decision) combination
beneficiaries_by_filing = chances_df.filter(
    ~pl.col("ben_multi_reg_ind").is_null()
)
filing_chance = (
    beneficiaries_by_filing
    .group_by(["fiscal_year", "ben_multi_reg_ind", "status", "first_decision"])
    .len(name="count")
)

In [28]:
# Total filings per (fiscal_year, ben_multi_reg_ind), plus the same total in thousands
sum_of_filings = (
    filing_chance
    .group_by(["fiscal_year", "ben_multi_reg_ind"])
    .agg(total_yearly_filings=pl.col("count").sum())
    .with_columns((pl.col("total_yearly_filings") / 1000).alias("thousands"))
)

In [None]:
# Chart of yearly filings by filing type (legend removed so the chart can be
# dropped into an infographic)
filing_type_label = (
    pl.when(pl.col("ben_multi_reg_ind") == 0)
    .then(pl.lit("Single"))
    .otherwise(pl.lit("Multiple"))
    .alias("filing_type")
)
sum_of_filings = sum_of_filings.with_columns(filing_type_label)

filing_type_scale = alt.Scale(
    domain=["Single", "Multiple"],
    range=["#698C7D", "#32555D"],
)

# NOTE(review): the x values are already in thousands, so tickMinStep=200000
# looks like a leftover from the un-scaled totals - confirm the intended step.
filings_x_axis = alt.Axis(title="Total yearly Filings (thousands)", tickMinStep=200000)

(
    alt.Chart(sum_of_filings)
    .mark_bar()
    .encode(
        alt.Y("fiscal_year", type="nominal", title="Fiscal Year"),
        alt.X("thousands", type="quantitative", axis=filings_x_axis),
        color=alt.Color("filing_type:N", title="Filing type",
                        scale=filing_type_scale,
                        legend=None),
    )
    .properties(width=250, height=70)
    .configure(background="#F6F5F4")
)

In [None]:
# Attach the per-(year, filing mode) totals to every count row, then inspect 2023
filings_df = filing_chance.join(
    sum_of_filings,
    on=["fiscal_year", "ben_multi_reg_ind"],
    how="left",
)
filings_df.filter(pl.col("fiscal_year") == 2023)

In [None]:
# Chance to get picked from lottery: SELECTED filings divided by all filings,
# per fiscal year and filing mode
filing_keys = ["fiscal_year", "ben_multi_reg_ind"]

selected_count = (
    filings_df
    .filter(pl.col("status") == "SELECTED")
    .group_by(filing_keys)
    .agg(total_selected=pl.col("count").sum())
)

filing_type_label = (
    pl.when(pl.col("ben_multi_reg_ind") == 0)
    .then(pl.lit("Single"))
    .otherwise(pl.lit("Multiple"))
    .alias("filing_type")
)

lottery_win = (
    selected_count
    .join(sum_of_filings, how="left", on=filing_keys)
    .with_columns((pl.col("total_selected") / pl.col("total_yearly_filings")).alias("winning_rate"))
    .with_columns(filing_type_label)
)

filing_type_scale = alt.Scale(
    domain=["Single", "Multiple"],
    range=["#698C7D", "#32555D"],
)

# One small bar panel per fiscal year, Single vs. Multiple side by side
chance_1 = (
    alt.Chart(lottery_win)
    .mark_bar(color="#454722")
    .encode(
        alt.X("filing_type", type="nominal", title=None, axis=alt.Axis(labels=False)),
        alt.Y("winning_rate", type="quantitative", title="Lottery winning rate").scale(domain=(0, 1)),
        color=alt.Color("filing_type:N", title="Filing type", scale=filing_type_scale),
        column=alt.Column("fiscal_year:N", title="Fiscal Year",
                          spacing=10,
                          header=alt.Header(titleOrient='bottom', labelOrient='bottom')),
    )
    .properties(title=alt.Title("H1B selection from H1B lottery (2021-2024)", fontSize=12))
    .configure(background="#F6F5F4")
)

chance_1

In [None]:
# SELECTED filings that have a first decision, totalled per fiscal year and filing mode
is_selected = pl.col("status") == "SELECTED"
has_first_decision = ~pl.col("first_decision").is_null()

responded = filing_chance.filter(is_selected & has_first_decision)
sum_of_getback = (
    responded
    .group_by(["fiscal_year", "ben_multi_reg_ind"])
    .agg(getback_sum=pl.col("count").sum())
)
sum_of_getback

In [None]:
# Conditional on being picked, chance to get approved:
# "Approved" decisions divided by all SELECTED filings that have a first decision
selected_with_decision = (
    (pl.col("status") == "SELECTED") &
    (~pl.col("first_decision").is_null())
)

filing_type_label = (
    pl.when(pl.col("ben_multi_reg_ind") == 0)
    .then(pl.lit("Single"))
    .otherwise(pl.lit("Multiple"))
    .alias("filing_type")
)

approval_rate = (
    filing_chance
    .filter(selected_with_decision)
    .join(sum_of_getback, how="left", on=["fiscal_year", "ben_multi_reg_ind"])
    .filter(pl.col("first_decision") == "Approved")
    .with_columns((pl.col("count") / pl.col("getback_sum")).alias("approved_rate"))
    .with_columns(filing_type_label)
)

filing_type_scale = alt.Scale(
    domain=["Single", "Multiple"],
    range=["#698C7D", "#32555D"],
)

# One small bar panel per fiscal year, Single vs. Multiple side by side
chance_2 = (
    alt.Chart(approval_rate)
    .mark_bar()
    .encode(
        alt.X("filing_type", type="nominal", title=None, axis=alt.Axis(labels=False)),
        alt.Y("approved_rate", type="quantitative", title="Approval rate").scale(domain=(0, 1)),
        color=alt.Color("filing_type:N", title="Filing type", scale=filing_type_scale),
        column=alt.Column("fiscal_year:N", title="Fiscal Year",
                          spacing=10,
                          header=alt.Header(titleOrient='bottom', labelOrient='bottom')),
    )
    .properties(title=alt.Title("H1B approval from lottery-won filings (2021-2024)", fontSize=12))
    .configure(background="#F6F5F4")
)

chance_2

In [None]:
# Because approval rate is almost 100%, overall rate is not insightful,
# so we look at the response rate instead: SELECTED filings without a first
# decision, as a share of all SELECTED filings.
selected_without_decision = (
    (pl.col("status") == "SELECTED") &
    (pl.col("first_decision").is_null())
)

filing_type_label = (
    pl.when(pl.col("ben_multi_reg_ind") == 0)
    .then(pl.lit("Single"))
    .otherwise(pl.lit("Multiple"))
    .alias("filing_type")
)

response_rate = (
    filings_df
    .filter(selected_without_decision)
    .join(selected_count, how="left", on=["fiscal_year", "ben_multi_reg_ind"])
    .with_columns((pl.col("count") / pl.col("total_selected")).alias("non_response_rate"))
    .with_columns(filing_type_label)
)

filing_type_scale = alt.Scale(
    domain=["Single", "Multiple"],
    range=["#698C7D", "#32555D"],
)

# One small bar panel per fiscal year, Single vs. Multiple side by side
chance_3 = (
    alt.Chart(response_rate)
    .mark_bar()
    .encode(
        alt.X("filing_type", type="nominal", title=None, axis=alt.Axis(labels=False)),
        alt.Y("non_response_rate", type="quantitative", title="Non-response rate").scale(domain=(0, 1)),
        color=alt.Color("filing_type:N", title="Filing type", scale=filing_type_scale),
        column=alt.Column("fiscal_year:N", title="Fiscal Year",
                          spacing=10,
                          header=alt.Header(titleOrient='bottom', labelOrient='bottom')),
    )
    .properties(title=alt.Title("Non-responses from lottery-won filings (2021-2024)", fontSize=12))
    .configure(background="#F6F5F4")
)

chance_3