In [1]:
from dask_jobqueue import SLURMCluster

# Compose SLURM script
cluster = SLURMCluster(queue='caslake', cores=5, memory='80GB', 
                       processes=5, walltime='03:00:00', interface='ib0',
                       job_extra=['--account=macs30123']
                      )

# Request resources
cluster.scale(jobs=1)

In [39]:
! squeue -u mnghiem

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          31347229   caslake dask-wor  mnghiem PD       0:00      1 (Priority)


In [3]:
from dask.distributed import Client

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.25.0.65:8787/status,

0,1
Dashboard: http://172.25.0.65:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.25.0.65:47593,Workers: 0
Dashboard: http://172.25.0.65:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [None]:
import dask.dataframe as dd
import dask

daily_dtype = {
    "State Name": "string",
    "county Name": "string",
    "State Code": "string",
    "County Code": "string",
    "Date": "string",
    "AQI": "float64",
    "Defining Parameter": "string"
}    

hourly_dtype = {
    "State Name": "string",
    "County Name": "string",
    "State Code": "string",
    "County Code": "string",
    "Date Local": "string",
    "Time Local": "string",
    "Sample Measurement": "float64",
    "Units of Measure": "string"
}

CNETID = "mnghiem"

daily_aqi = dd.read_csv(f"/scratch/midway3/{CNETID}/project-aqi-data/daily_aqi_by_county/*.csv", 
                        usecols=["State Name", "county Name", "State Code", "County Code", "Date", "AQI", "Defining Parameter"],
                        dtype=daily_dtype)
hourly_pm2 = dd.read_csv(f"/scratch/midway3/{CNETID}/project-aqi-data/hourly_pm25_by_county/*.csv",
                         usecols=["State Name", "County Name", "State Code", "County Code", "Date Local", "Time Local", "Sample Measurement", "Units of Measure"],
                        dtype=hourly_dtype)

In [None]:
import re

# Convert to snake case columns
def to_snake_case(colname):
    colname = re.sub(r'[\s\-]+', '_', colname)
    colname = re.sub(r'([a-z])([A-Z])', r'\1_\2', colname)
    return colname.lower()

hourly_pm2.columns = [to_snake_case(col) for col in hourly_pm2.columns]
daily_aqi.columns = [to_snake_case(col) for col in daily_aqi.columns]

In [None]:
# Reengineer olumns
tract_daily_pm25["date"] = dd.to_datetime(tract_daily_pm25["date"])
daily_aqi["date"] = dd.to_datetime(daily_aqi["date"])
daily_aqi["year"] = daily_aqi["date"].dt.year
daily_aqi["fips"] = daily_aqi["state_code"] + daily_aqi["county_code"]
hourly_pm2["date_local"] = dd.to_datetime(hourly_pm2["date_local"])
hourly_pm2["hour"] = hourly_pm2["time_local"].str.slice(0, 2).astype("int64")
hourly_pm2["time"] = hourly_pm2["date_local"].dt.year
hourly_pm2["fips"] = hourly_pm2["state_code"] + hourly_pm2["county_code"]
hourly_pm2 = hourly_pm2.drop(columns="time_local")

In [43]:
daily_aqi.head(5)

Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Defining Parameter
73418,New Jersey,Bergen,34,3,1980-09-07,56.0,NO2
18003,California,Ventura,6,111,1980-06-27,245.0,Ozone
120487,Utah,Utah,49,49,1980-03-19,32.0,Ozone
118879,Texas,Travis,48,453,1980-01-01,43.0,Ozone
48225,Kentucky,Muhlenberg,21,177,1980-05-14,58.0,Ozone


In [45]:
hourly_pm2.head(5)

Unnamed: 0,State Code,County Code,Date Local,Time Local,Sample Measurement,Units of Measure,State Name,County Name
139979,6,19,2010-09-03,06:00,35.0,Micrograms/cubic meter (LC),California,Fresno
2293,1,33,2010-04-15,08:00,22.6,Micrograms/cubic meter (LC),Alabama,Colbert
143604,6,27,2010-02-05,11:00,8.4,Micrograms/cubic meter (LC),California,Inyo
137371,6,19,2010-05-15,18:00,12.0,Micrograms/cubic meter (LC),California,Fresno
146800,6,27,2010-06-19,17:00,-6.9,Micrograms/cubic meter (LC),California,Inyo


In [None]:
# Use 5-year ACS 2023 household median income
from census import Census
from us import states
import pandas as pd

API_KEY = "9df5183d7032ec5b9690b0ca901ef955922d1fa1"

census = Census(API_KEY)

ACS_YEAR = 2023
ACS_SOURCE = 'acs5'
variable = "B19013_001E"

# Fetch data
results = census.acs5.state_county(
    fields=(variable,),
    state_fips="*",
    county_fips="*",
    year=ACS_YEAR
)

county_income = pd.DataFrame(results)

# Reformat to DF
county_income["state_fips"] = county_income["state"].str.zfill(2)
county_income["county_fips"] = county_income["county"].str.zfill(3)
county_income["fips"] = county_income["state_fips"] + county_income["county_fips"]
county_income["median_income"] = county_income[variable].astype(float)

state_fips_to_code = {s.fips.zfill(2): s.abbr for s in states.STATES}
county_income["state_code"] = county_income["state_fips"].map(state_fips_to_code)

county_income = county_income[["fips", "median_income"]]

In [None]:
# Use 5-year ACS 2023 county majority race

total_var = "B02001_001E"
race_vars = {
    "white": "B02001_002E",
    "black": "B02001_003E",
    "native": "B02001_004E",
    "asian": "B02001_005E",
    "pacific": "B02001_006E",
    "other": "B02001_007E",
    "2plus": "B02001_008E"
}

# Fields to pull from census
fields = list(race_vars.values()) + [total_var]

# Fetch data
data = census.acs5.state_county(
    fields=fields,
    state_fips="*",
    county_fips="*",
    year=ACS_YEAR
)

majority_race = pd.DataFrame(data)

# Engineer FIPS to correct format
majority_race["state_fips"] = majority_race["state"].str.zfill(2)
majority_race["county_fips"] = majority_race["county"].str.zfill(3)
majority_race["fips"] = majority_race["state_fips"] + majority_race["county_fips"]

# Reformat cols
majority_race = majority_race.rename(columns={v: k for k, v in race_vars.items()})
majority_race = majority_race.rename(columns={total_var: "total_pop"})
race_cols = list(race_vars.keys())
majority_race[race_cols + ["total_pop"]] = majority_race[race_cols + ["total_pop"]].astype(float)

# County majority race and population
majority_race["race"] = majority_race[race_cols].idxmax(axis=1)
majority_race["count"] = majority_race.lookup(majority_race.index, majority_race["race"])
majority_race["percentage"] = (majority_race["count"] / majority_race["total_pop"]) * 100

majority_race = majority_race[["fips", "race", "percentage"]]

  majority_race["count"] = majority_race.lookup(majority_race.index, majority_race["race"])


In [14]:
# Average daily AQI time series (US)
us_yearly_avg_aqi = daily_aqi.groupby(["year"])["aqi"].mean().reset_index()
us_yearly_avg_aqi = us_yearly_avg_aqi

In [15]:
# Daily highest AQI time series (US)
us_highest_daily_aqi = daily_aqi.groupby("year")["aqi"].max().reset_index()
us_highest_daily_aqi = us_highest_daily_aqi #.compute()

In [16]:
# Counties that see a worsening trend in AQI (how many improved? any patterns?)
from sklearn.linear_model import LinearRegression

def compute_slope(df, val):
    if df.shape[0] < 2:
        return pd.Series({"slope": float("nan")})
    X = df["year"].values.reshape(-1, 1)
    y = df[val].values
    model = LinearRegression().fit(X, y)
    return pd.Series({"slope": model.coef_[0]})

county_aqi_trend = daily_aqi.groupby("fips").apply(compute_slope, val="aqi", meta={"slope": "f8"}).reset_index()

Delayed('truediv-eabc57b835387c9d1912fa57c7072fc4')

In [28]:
# Days over unhealthy AQI level (US)
daily_aqi["unhealthy"] = (daily_aqi["aqi"] >= 101).astype(int)
county_days_unhealthy = daily_aqi.groupby(["fips", "year"])["unhealthy"].sum().reset_index()
us_days_unhealthy = county_days_unhealthy.groupby(["year"])["unhealthy"].mean().reset_index()

In [None]:
# Criteria gas
us_unhealthy_param = daily_aqi.groupby(["year", "defining_parameter"])["unhealthy"].count().reset_index()
us_total_unhealthy_param = us_unhealthy_param.groupby(["year"])["unhealthy"].sum().reset_index()
us_unhealthy_param = us_unhealthy_param.merge(us_total_unhealthy_param, on="year", how="left")

Unnamed: 0_level_0,year,defining_parameter,unhealthy
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,string,int64
,...,...,...


In [19]:
us_yearly_avg_aqi, us_highest_daily_aqi, county_aqi_trend, us_days_unhealthy, us_unhealthy_param = dask.compute(us_yearly_avg_aqi, us_highest_daily_aqi, county_aqi_trend, us_days_unhealthy, us_unhealthy_param)

In [20]:
import altair as alt
line = alt.Chart(us_yearly_avg_aqi).mark_line().encode(
    x=alt.X("year:N"),
    y="aqi:Q"
).properties(
    width=600,
    height=300,
    title="Yearly Average AQI in the US"
)

regression = line.transform_regression(
    "year", "aqi", method="linear"
).mark_line(color="orange", strokeDash=[5, 5]).encode(
    tooltip=["year:Q", "aqi:Q"]
)

line + regression

In [22]:
bar = alt.Chart(us_highest_daily_aqi).mark_bar().encode(
    x="year:N",
    y="aqi:Q"
)

bar

In [24]:
county_negative_trend = county_aqi_trend[county_aqi_trend["slope"] < 0].sort_values("slope")
county_negative_trend.shape[0]/county_aqi_trend.shape[0]

0.5844155844155844

In [32]:
alt.Chart(us_days_unhealthy).mark_line().encode(
    x=alt.X("year:N"),
    y="unhealthy:Q"
).properties(
    width=600,
    height=300,
    title="Average days with unhealthy AQI level (over 101)"
)

In [34]:
us_unhealthy_param["percentage"] = (us_unhealthy_param["unhealthy_x"] / us_unhealthy_param["unhealthy_y"]).astype(float)

alt.Chart(us_unhealthy_param).mark_bar().encode(
    x="percentage:Q",
    y="year:N",
    color="defining_parameter:N"
)

In [36]:
# PM2.5 by time of day (national average)
hourly_pm2["year"] = hourly_pm2["date_local"].dt.year
us_avg_pm2_by_hour = hourly_pm2.groupby(["hour"])["sample_measurement"].mean().reset_index()

us_avg_pm2_by_hour = us_avg_pm2_by_hour #.compute()

In [37]:
# Yearly average concentration for PM2.5
us_avg_pm2_by_year = hourly_pm2.groupby(["year"])["sample_measurement"].mean().reset_index()
us_avg_pm2_by_year = us_avg_pm2_by_year #.compute()

In [38]:
us_avg_pm2_by_hour, us_avg_pm2_by_year = dask.compute(us_avg_pm2_by_hour, us_avg_pm2_by_year)

In [39]:
alt.Chart(us_avg_pm2_by_hour).mark_bar().encode(
    x=alt.X("hour:N"),
    y="sample_measurement:Q"
).properties(
    title="PM2.5 by time of day (national average)"
)

In [40]:
alt.Chart(us_avg_pm2_by_year).mark_bar().encode(
    x=alt.X("year:N"),
    y="sample_measurement:Q"
).properties(
    title="Yearly average concentration for PM2.5"
)

In [None]:
# Environmental justice (race+income)
county_avg_aqi = daily_aqi.groupby(["fips"])["aqi"].mean().reset_index()
county_stats = majority_race.merge(county_income, on="fips", how="inner")
county_aqi_ri = county_avg_aqi.merge(county_stats, on="fips", how="inner")
county_avg_aqi, county_aqi_ri = dask.compute(county_avg_aqi, county_aqi_ri)

In [44]:
aqi_by_race = county_aqi_ri.groupby("race")["aqi"].mean().reset_index()
aqi_by_race

Unnamed: 0,race,aqi
0,2plus,32.882466
1,asian,45.549299
2,black,46.43381
3,native,24.02625
4,other,37.954689
5,white,41.258698


In [45]:
# AQI by race
alt.Chart(county_aqi_ri).mark_boxplot(extent='min-max').encode(
    y='race:N',
    x='aqi:Q'
)

In [46]:
aqi_by_income = county_aqi_ri.groupby("median_income")["aqi"].mean().reset_index()
aqi_by_income["income_quantile"] = pd.qcut(aqi_by_income["median_income"], q=10, labels=range(1, 11))
aqi_by_income

Unnamed: 0,median_income,aqi,income_quantile
0,17531.0,30.014563,1
1,17963.0,35.164031,1
2,18605.0,29.080745,1
3,18827.0,33.291005,1
4,19323.0,31.719057,1
...,...,...,...
1491,150113.0,59.335251,10
1492,154734.0,31.100000,10
1493,156000.0,40.735830,10
1494,159674.0,55.378712,10


In [47]:
# AQI by income
alt.Chart(aqi_by_income).mark_boxplot(extent='min-max').encode(
    y='income_quantile:N',
    x='aqi:Q'
)