In [1]:
from dask_jobqueue import SLURMCluster

# Compose SLURM script
cluster = SLURMCluster(queue='caslake', cores=5, memory='80GB', 
                       processes=5, walltime='02:00:00', interface='ib0',
                       job_extra=['--account=macs30123']
                      )

# Request resources
cluster.scale(jobs=1)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43425 instead


In [2]:
! squeue -u mnghiem

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          31295010   caslake dask-wor  mnghiem  R       0:42      1 midway3-0023


In [3]:
from dask.distributed import Client

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.25.0.66:43425/status,

0,1
Dashboard: http://172.25.0.66:43425/status,Workers: 5
Total threads: 5,Total memory: 74.50 GiB

0,1
Comm: tcp://172.25.0.66:36497,Workers: 5
Dashboard: http://172.25.0.66:43425/status,Total threads: 5
Started: Just now,Total memory: 74.50 GiB

0,1
Comm: tcp://172.25.2.23:37209,Total threads: 1
Dashboard: http://172.25.2.23:41473/status,Memory: 14.90 GiB
Nanny: tcp://172.25.2.23:34835,
Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-re1g44_q,Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-re1g44_q

0,1
Comm: tcp://172.25.2.23:43741,Total threads: 1
Dashboard: http://172.25.2.23:45043/status,Memory: 14.90 GiB
Nanny: tcp://172.25.2.23:37967,
Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-wqyc70x4,Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-wqyc70x4

0,1
Comm: tcp://172.25.2.23:44501,Total threads: 1
Dashboard: http://172.25.2.23:35187/status,Memory: 14.90 GiB
Nanny: tcp://172.25.2.23:36927,
Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-s3vane2o,Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-s3vane2o

0,1
Comm: tcp://172.25.2.23:40625,Total threads: 1
Dashboard: http://172.25.2.23:33979/status,Memory: 14.90 GiB
Nanny: tcp://172.25.2.23:42737,
Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-xn6yf5x5,Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-xn6yf5x5

0,1
Comm: tcp://172.25.2.23:35859,Total threads: 1
Dashboard: http://172.25.2.23:35811/status,Memory: 14.90 GiB
Nanny: tcp://172.25.2.23:45325,
Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-jexwymso,Local directory: /scratch/local/jobs/31295010/dask-worker-space/worker-jexwymso


In [4]:
import dask.dataframe as dd
import dask

daily_dtype = {
    "State Name": "string",
    "county Name": "string",
    "State Code": "string",
    "County Code": "string",
    "Date": "string",
    "AQI": "float64",
    "Defining Parameter": "string"
}    

hourly_dtype = {
    "State Name": "string",
    "County Name": "string",
    "State Code": "string",
    "County Code": "string",
    "Date Local": "string",
    "Time Local": "string",
    "Sample Measurement": "float64",
    "Units of Measure": "string"
}

daily_aqi = dd.read_csv('./data/daily_aqi_by_county/*.csv', 
                        usecols=["State Name", "county Name", "State Code", "County Code", "Date", "AQI", "Defining Parameter"],
                        dtype=daily_dtype) # 198 counties
hourly_pm2 = dd.read_csv('./data/hourly_pm25_by_county/*.csv',
                         usecols=["State Name", "County Name", "State Code", "County Code", "Date Local", "Time Local", "Sample Measurement", "Units of Measure"],
                        dtype=hourly_dtype) # 131 counties
tract_daily_pm25 = dd.read_csv('./data/daily_pm25_by_tract.csv')

In [5]:
import re

# Define snake_case conversion
def to_snake_case(colname):
    colname = re.sub(r'[\s\-]+', '_', colname)
    colname = re.sub(r'([a-z])([A-Z])', r'\1_\2', colname)
    return colname.lower()

# Rename columns to snake_case
hourly_pm2.columns = [to_snake_case(col) for col in hourly_pm2.columns]
daily_aqi.columns = [to_snake_case(col) for col in daily_aqi.columns]

In [6]:
tract_daily_pm25["date"] = dd.to_datetime(tract_daily_pm25["date"])
daily_aqi["date"] = dd.to_datetime(daily_aqi["date"])
daily_aqi["year"] = daily_aqi["date"].dt.year
hourly_pm2["date_local"] = dd.to_datetime(hourly_pm2["date_local"])
hourly_pm2["hour"] = hourly_pm2["time_local"].str.slice(0, 2).astype("int64")
hourly_pm2 = hourly_pm2.drop(columns="time_local")

In [9]:
daily_aqi.dtypes

state_name                    string
county_name                   string
state_code                    string
county_code                   string
date                  datetime64[ns]
aqi                          float64
defining_parameter            string
year                           int64
dtype: object

In [None]:
daily_aqi.head(5)

In [57]:
tract_daily_pm25.head(5)

Unnamed: 0,year,date,statefips,countyfips,ctfips,latitude,longitude,DS_PM_pred,DS_PM_stdd
0,2020,2020-01-01,1,19,1019956102,-85.541,34.06629,7.51,4.2824
1,2020,2020-01-01,1,59,1059973000,-87.71912,34.5418,4.576,3.0366
2,2020,2020-01-01,4,13,4013092311,-112.1884,33.56024,67.803,28.8926
3,2020,2020-01-01,1,1,1001020100,-86.49007,32.47718,9.473,5.513
4,2020,2020-01-01,1,1,1001020200,-86.47337,32.47434,9.971,5.6818


In [38]:
hourly_pm2.head(5)

Unnamed: 0,state_code,county_code,date_local,time_local,sample_measurement,units_of_measure,state_name,county_name
0,1,33,2010-01-01,00:00,11.9,Micrograms/cubic meter (LC),Alabama,Colbert
1,1,33,2010-01-01,01:00,10.0,Micrograms/cubic meter (LC),Alabama,Colbert
2,1,33,2010-01-01,02:00,5.8,Micrograms/cubic meter (LC),Alabama,Colbert
3,1,33,2010-01-01,03:00,6.0,Micrograms/cubic meter (LC),Alabama,Colbert
4,1,33,2010-01-01,04:00,8.2,Micrograms/cubic meter (LC),Alabama,Colbert


In [58]:
daily_aqi.head(5)

Unnamed: 0,state_name,county_name,state_code,county_code,date,aqi,defining_parameter
0,Alabama,Autauga,1,1,1980-04-05,67.0,Ozone
1,Alabama,Autauga,1,1,1980-04-06,84.0,Ozone
2,Alabama,Autauga,1,1,1980-04-07,61.0,Ozone
3,Alabama,Autauga,1,1,1980-04-08,49.0,Ozone
4,Alabama,Autauga,1,1,1980-04-09,58.0,Ozone


In [7]:
# Load median household income from 5-year ACS 2023
from census import Census
from us import states
import pandas as pd

API_KEY = "9df5183d7032ec5b9690b0ca901ef955922d1fa1"

c = Census(API_KEY)

ACS_YEAR = 2023
ACS_SOURCE = 'acs5'
variable = "B19013_001E"

# Fetch data from ACS
results = c.acs5.state_county(
    fields=(variable,),
    state_fips="*",
    county_fips="*",
    year=ACS_YEAR
)

# Convert to DataFrame
county_income = pd.DataFrame(results)

# Format and enrich columns
county_income["state_fips"] = county_income["state"].str.zfill(2)
county_income["county_fips"] = county_income["county"].str.zfill(3)
county_income["fips"] = county_income["state_fips"] + county_income["county_fips"]
county_income["median_income"] = county_income[variable].astype(float)

# Optional: Add state abbreviation for reference
state_fips_to_code = {s.fips.zfill(2): s.abbr for s in states.STATES}
county_income["state_code"] = county_income["state_fips"].map(state_fips_to_code)

# Final column order
county_income = county_income[["fips", "state_fips", "state_code", "county_fips", "median_income"]]

In [8]:
# Step 2: Define race variables + total population
race_vars = {
    "white": "B02001_002E",
    "black": "B02001_003E",
    "native": "B02001_004E",
    "asian": "B02001_005E",
    "pacific": "B02001_006E",
    "other": "B02001_007E",
    "2plus": "B02001_008E"
}
total_var = "B02001_001E"  # Total population

# Step 3: Define fields to query (race + total pop)
fields = list(race_vars.values()) + [total_var]

# Step 4: Query Census API for all counties
data = c.acs5.state_county(
    fields=fields,
    state_fips="*",
    county_fips="*",
    year=ACS_YEAR
)

# Load into DataFrame
majority_race = pd.DataFrame(data)

# Add and format FIPS columns
majority_race["state_fips"] = majority_race["state"].str.zfill(2)
majority_race["county_fips"] = majority_race["county"].str.zfill(3)
majority_race["fips"] = majority_race["state_fips"] + majority_race["county_fips"]

# Rename and cast columns
majority_race = majority_race.rename(columns={v: k for k, v in race_vars.items()})
majority_race = majority_race.rename(columns={total_var: "total_pop"})

# Convert race columns to float
race_cols = list(race_vars.keys())
majority_race[race_cols + ["total_pop"]] = majority_race[race_cols + ["total_pop"]].astype(float)

# Determine majority race and population count
majority_race["race"] = majority_race[race_cols].idxmax(axis=1)
majority_race["count"] = majority_race.lookup(majority_race.index, majority_race["race"])
majority_race["percentage"] = (majority_race["count"] / majority_race["total_pop"]) * 100

majority_race = majority_race[["fips", "state_fips", "county_fips", "race", "percentage"]]

  majority_race["count"] = majority_race.lookup(majority_race.index, majority_race["race"])


In [68]:
majority_race

Unnamed: 0,fips,state_fips,county_fips,race,count,percentage
0,01001,01,001,white,43616.0,73.570043
1,01003,01,003,white,198721.0,82.819396
2,01005,01,005,black,11616.0,46.920063
3,01007,01,007,white,16634.0,75.090285
4,01009,01,009,white,53062.0,89.492680
...,...,...,...,...,...,...
3217,72145,72,145,2plus,26581.0,49.171260
3218,72147,72,147,other,6252.0,76.739904
3219,72149,72,149,2plus,9236.0,42.409771
3220,72151,72,151,other,20950.0,70.141958


In [69]:
county_income

Unnamed: 0,fips,state_fips,state_code,county_fips,median_income
0,01001,01,AL,001,69841.0
1,01003,01,AL,003,75019.0
2,01005,01,AL,005,44290.0
3,01007,01,AL,007,51215.0
4,01009,01,AL,009,61096.0
...,...,...,...,...,...
3217,72145,72,,145,23877.0
3218,72147,72,,147,17531.0
3219,72149,72,,149,24882.0
3220,72151,72,,151,21279.0


In [9]:
# Average daily AQI time series (US)
us_yearly_avg_aqi = daily_aqi.groupby(["year"]).agg({"aqi": "mean"}).reset_index()
us_yearly_avg_aqi = us_yearly_avg_aqi.compute()

In [11]:
us_yearly_avg_aqi

Unnamed: 0,year,aqi
0,1980,52.542755
1,1981,48.775308
2,1982,48.267123
3,1983,51.696244
4,1984,49.040185
5,1985,49.014819
6,1986,48.185493
7,1987,51.061623
8,1988,52.767152
9,1989,45.689762


In [14]:
import altair as alt
line = alt.Chart(us_yearly_avg_aqi).mark_line().encode(
    x=alt.X("year:N"),
    y="aqi:Q"
).properties(
    width=600,
    height=300,
    title="Yearly Average AQI in the US"
)

regression = line.transform_regression(
    "year", "aqi", method="linear"
).mark_line(color="orange", strokeDash=[5, 5]).encode(
    tooltip=["year:Q", "aqi:Q"]
)

line + regression

In [19]:
# Daily highest AQI time series (US)
us_highest_daily_aqi = daily_aqi.groupby("year")["aqi"].max().reset_index()
us_highest_daily_aqi = us_highest_daily_aqi.compute()

In [21]:
us_highest_daily_aqi

Unnamed: 0,year,aqi
0,1980,386.0
1,1981,331.0
2,1982,410.0
3,1983,346.0
4,1984,300.0
5,1985,420.0
6,1986,368.0
7,1987,576.0
8,1988,747.0
9,1989,1897.0


In [22]:
# Get county-level max AQI per year
county_year_max = daily_aqi.groupby(["year", "county_code"])["aqi"].max().reset_index()

# Compute yearly average of those county max AQIs
yearly_avg_of_max = county_year_max.groupby("year")["aqi"].mean().reset_index()

yearly_avg_of_max = yearly_avg_of_max.compute()
yearly_avg_of_max

Unnamed: 0,year,aqi
0,1980,219.513043
1,1981,209.330508
2,1982,208.834783
3,1983,217.321739
4,1984,209.294643
5,1985,201.112069
6,1986,195.735043
7,1987,202.264
8,1988,210.100719
9,1989,202.51049


In [34]:
# Counties that see a worsening trend in AQI (how many improved? any patterns?)
from sklearn.linear_model import LinearRegression

def compute_slope(df, val):
    if df.shape[0] < 2:
        return pd.Series({"slope": float("nan")})
    X = df["year"].values.reshape(-1, 1)
    y = df[val].values
    model = LinearRegression().fit(X, y)
    return pd.Series({"slope": model.coef_[0]})

county_aqi_trend = daily_aqi.groupby("county_code").apply(compute_slope, val="aqi", meta={"slope": "f8"}).compute()

county_negative_trend = county_aqi_trend[county_aqi_trend["slope"] < 0].sort_values("slope")
county_negative_trend

Unnamed: 0_level_0,slope
county_code,Unnamed: 1_level_1
232,-12.371408
050,-5.158113
297,-1.749834
409,-1.414176
580,-1.319843
...,...
199,-0.018320
099,-0.015067
053,-0.010135
093,-0.009812


In [24]:
# Days over unhealthy AQI level (US)
daily_aqi["unhealthy"] = (daily_aqi["aqi"] >= 101).astype(int)
county_days_unhealthy = daily_aqi.groupby(["county_code", "year"])["unhealthy"].sum().reset_index()
us_days_unhealthy = county_days_unhealthy.groupby(["year"])["unhealthy"].mean().reset_index()
us_days_unhealthy.compute()

Unnamed: 0,year,unhealthy
0,1980,133.8
1,1981,102.008475
2,1982,99.147826
3,1983,139.356522
4,1984,109.071429
5,1985,104.077586
6,1986,101.521368
7,1987,121.128
8,1988,138.568345
9,1989,77.552448


In [37]:
county_days_unhealthy_trend = daily_aqi.groupby("county_code").apply(compute_slope, val="unhealthy", meta={"slope": "f8"}).reset_index().compute()

county_positive_unhealthy_days_trend = county_days_unhealthy_trend[county_days_unhealthy_trend["slope"] > 0].sort_values("slope")
county_positive_unhealthy_days_trend

Unnamed: 0,county_code,slope
2,770,6.7e-05
6,381,7.1e-05
5,323,0.000105
5,700,0.000109
1,273,0.000118
2,479,0.000202
1,171,0.000244
0,50,0.000278
1,90,0.001607


In [43]:
county_unhealthy_param = daily_aqi.groupby(["year", "defining_parameter"])["unhealthy"].count().reset_index()
county_unhealthy_param = county_days_unhealthy_param.compute()
county_unhealthy_param

Unnamed: 0,year,defining_parameter,unhealthy
0,1980,CO,32070
1,1980,NO2,25114
2,1980,Ozone,78776
3,1981,CO,32596
4,1981,NO2,25422
...,...,...,...
210,2024,CO,717
211,2024,NO2,1606
212,2024,Ozone,115550
213,2024,PM10,5891


In [46]:
county_total_unhealthy_param = county_unhealthy_param.groupby(["year"])["unhealthy"].sum().reset_index()
county_unhealthy_param = county_unhealthy_param.merge(county_total_unhealthy_param, on="year", how="left")
county_unhealthy_param

In [47]:
county_unhealthy_param["percentage"] = 

Unnamed: 0,year,defining_parameter,unhealthy_x,unhealthy_y
0,1980,CO,32070,135960
1,1980,NO2,25114,135960
2,1980,Ozone,78776,135960
3,1981,CO,32596,144549
4,1981,NO2,25422,144549
...,...,...,...,...
210,2024,CO,717,206919
211,2024,NO2,1606,206919
212,2024,Ozone,115550,206919
213,2024,PM10,5891,206919


In [72]:
# 24-hour average concentration for PM2.5, in cig equivalents


In [None]:
# AQI by time of day (national average)

In [74]:
# Spatial pattern
# choropleth map—yearly avg aqi daily highest shade by time of day and 24 hour average details by click
# choropleth map (hours exposure beyond healthy level)—total hours shade by year and yearly cig equivalents

In [76]:
# Asthma prevalence prediction: AQI, pm2.5, ozone, county, time

In [None]:
# You live in ___ county. How much are you smoking--birthyear on average, county? (cigs equivalent)

In [None]:
# Environmental justice (race+income)
# range of daily hours under healthy level (AQI)—boxplot
# median cigs equivalent by year
# total cigs equivalent and hours under healthy hours by year
# 24-hour average concentration for PM2.5