# Data download and cleanup of the `radon` dataset

Links
- https://www.pymc.io/projects/examples/en/latest/generalized_linear_models/multilevel_modeling.html
  (older version available [here](https://github.com/fonnesbeck/multilevel_modeling/blob/master/multilevel_modeling.ipynb))
- https://bambinos.github.io/bambi/notebooks/radon_example.html
- https://mc-stan.org/users/documentation/case-studies/radon_cmdstanpy_plotnine.html#data-prep

In [1]:
import pandas as pd
import pymc as pm
import numpy as np



## Raw data URLs

- http://www.stat.columbia.edu/~gelman/arm/examples/radon/srrs2.dat  
  ALT. https://raw.githubusercontent.com/pymc-devs/pymc-examples/main/examples/data/srrs2.dat
- http://www.stat.columbia.edu/~gelman/arm/examples/radon/cty.dat
  

## Bambi tutorial steps

In [2]:
# Download the raw radon measurements (srrs2.dat)
path = "https://raw.githubusercontent.com/pymc-devs/pymc-examples/main/examples/data/srrs2.dat"
radon_df = pd.read_csv(path)

import pymc as pm
# Load the county-level table (cty.dat) through PyMC's data helper
city_df = pd.read_csv(pm.get_data("cty.dat"))

# Column names in the raw file are space-padded; trim them
radon_df.columns = [name.strip() for name in radon_df.columns]

# Restrict both tables to the "MN" state
df = radon_df.loc[radon_df["state"] == "MN"].copy()
city_mn_df = city_df.loc[city_df["st"] == "MN"].copy()

# Build the fips key: state code * 1000 + county code
df["fips"] = df["stfips"] * 1_000 + df["cntyfips"]
city_mn_df["fips"] = city_mn_df["stfips"] * 1_000 + city_mn_df["ctfips"]

# Attach the county uranium value (Uppm) to every measurement,
# then keep a single row per idnum
df = pd.merge(df, city_mn_df[["fips", "Uppm"]], on="fips")
df = df.drop_duplicates(subset="idnum")

# County names are space-padded as well
df["county"] = df["county"].apply(str.strip)


# ORIGINAL (tutorial) version shifted every measurement before the log:
#     df["log_radon"] = np.log(df["activity"] + 0.1)
#
# MODIFIED: only measurements that are exactly 0 are replaced by 0.1,
# so the log is finite without shifting the non-zero measurements.
df.loc[df["activity"].eq(0), "activity"] = 0.1
df["log_radon"] = np.log(df["activity"])


# Log of the county uranium value
df["log_uranium"] = np.log(df["Uppm"])

# Relabel the floor indicator: 0 -> "basement", 1 -> "ground"
df["floor"] = df["floor"].map({0: "basement", 1: "ground"})

# Order the rows by floor label and renumber the index
df = df.sort_values(by="floor").reset_index(drop=True)

# Keep only the modelling columns, ordered by idnum with a fresh index
radon_bambi = (
    df[["idnum", "state", "county", "floor", "log_radon", "log_uranium"]]
    .sort_values("idnum")
    .reset_index(drop=True)
)
radon_bambi

Unnamed: 0,idnum,state,county,floor,log_radon,log_uranium
0,5081,MN,AITKIN,ground,0.788457,-0.689048
1,5082,MN,AITKIN,basement,0.788457,-0.689048
2,5083,MN,AITKIN,basement,1.064711,-0.689048
3,5084,MN,AITKIN,basement,0.000000,-0.689048
4,5085,MN,ANOKA,basement,1.131402,-0.847313
...,...,...,...,...,...,...
914,5995,MN,WRIGHT,basement,1.856298,-0.090024
915,5996,MN,WRIGHT,basement,1.504077,-0.090024
916,5997,MN,WRIGHT,basement,1.609438,-0.090024
917,5998,MN,YELLOW MEDICINE,basement,1.308333,0.355287


## Save the dataset

In [3]:
# Write the cleaned table to disk; index=False keeps the row index out of the file
radon_bambi.to_csv(path_or_buf="../datasets/radon.csv", index=False)

## PyMC tutorial steps (NOT USED)

In [4]:
# Load the raw measurements, trim the padded column names, keep the "MN" rows
srrs2 = pd.read_csv(pm.get_data("srrs2.dat"))
srrs2.columns = [name.strip() for name in srrs2.columns]
srrs_mn = srrs2.loc[srrs2["state"] == "MN"].copy()

# County-level table
cty = pd.read_csv(pm.get_data("cty.dat"))

# Build the fips key (state code * 1000 + county code) on both tables
srrs_mn["fips"] = 1000 * srrs_mn["stfips"] + srrs_mn["cntyfips"]
cty_mn = cty.loc[cty["st"] == "MN"].copy()
cty_mn["fips"] = cty_mn["stfips"] * 1000 + cty_mn["ctfips"]

# Combine home- and county-level information in a single DataFrame,
# keeping one row per idnum.
srrs_mn = pd.merge(srrs_mn, cty_mn[["fips", "Uppm"]], on="fips")
srrs_mn = srrs_mn.drop_duplicates(subset="idnum")
u = np.log(srrs_mn["Uppm"]).unique()

n = len(srrs_mn)

# Encode the county names and make local copies of the variables we will use.
# factorize() returns the integer code of each row plus the unique county
# names, which serve as the lookup for indexing.
srrs_mn["county"] = srrs_mn["county"].apply(str.strip)
county, mn_counties = srrs_mn["county"].factorize()
srrs_mn["county_code"] = county
radon = srrs_mn["activity"]
# ORIGINAL (tutorial) version shifted every measurement before the log:
#     srrs_mn["log_radon"] = log_radon = np.log(radon + 0.1).values
# MODIFIED: only measurements that are exactly 0 are replaced by 0.1
radon = np.where(radon == 0, 0.1, radon)
log_radon = np.log(radon)
srrs_mn["log_radon"] = log_radon
# /MODIFIED
floor_measure = srrs_mn["floor"].values

# Log of the county uranium value
srrs_mn["log_u"] = np.log(srrs_mn["Uppm"])

# Keep only the modelling columns, ordered by idnum with a fresh index
radon_pm = (
    srrs_mn[['idnum', 'state', 'county', 'floor', 'log_radon', 'log_u']]
    .sort_values("idnum")
    .reset_index(drop=True)
)
radon_pm

Unnamed: 0,idnum,state,county,floor,log_radon,log_u
0,5081,MN,AITKIN,1,0.788457,-0.689048
1,5082,MN,AITKIN,0,0.788457,-0.689048
2,5083,MN,AITKIN,0,1.064711,-0.689048
3,5084,MN,AITKIN,0,0.000000,-0.689048
4,5085,MN,ANOKA,0,1.131402,-0.847313
...,...,...,...,...,...,...
914,5995,MN,WRIGHT,0,1.856298,-0.090024
915,5996,MN,WRIGHT,0,1.504077,-0.090024
916,5997,MN,WRIGHT,0,1.609438,-0.090024
917,5998,MN,YELLOW MEDICINE,0,1.308333,0.355287


In [5]:
# radon_pm.compare(radon_bambi).sample(30)

## Tutorial using brms (NOT USED)

https://github.com/mitzimorris/brms_feb_28_2023

via https://discourse.pymc.io/t/webinar-bayesian-data-analysis-with-brms-feb-28-2023/11471

In [6]:
# Read the mn_radon.csv file published in the brms webinar repository
mn_radon_url = "https://raw.githubusercontent.com/mitzimorris/brms_feb_28_2023/refs/heads/main/data/mn_radon.csv"
mn_radon = pd.read_csv(filepath_or_buffer=mn_radon_url)
mn_radon

Unnamed: 0,floor,county,log_radon,log_uranium,county_id
0,1,AITKIN,0.788457,-0.689048,1
1,0,AITKIN,0.788457,-0.689048,1
2,0,AITKIN,1.064711,-0.689048,1
3,0,AITKIN,0.000000,-0.689048,1
4,0,ANOKA,1.131402,-0.847313,2
...,...,...,...,...,...
914,0,WRIGHT,1.856298,-0.090024,84
915,0,WRIGHT,1.504077,-0.090024,84
916,0,WRIGHT,1.609438,-0.090024,84
917,0,YELLOW MEDICINE,1.308333,0.355287,85


## Debugging 0-values

In [7]:
# Problematic rows if we don't add 0.1
"""
     idnum state      county   floor  log_radon     log_u  activity
106   5187    MN      CARVER  ground       -inf  0.095865       0.0
144   5225    MN  COTTONWOOD  ground       -inf  0.339560       0.0
477   5558    MN      MCLEOD  ground       -inf  0.140423       0.0
""";

In [8]:
# Indeed, these are zero in the original data
"""
idnum, state, state2, stfips, zip, region, typebldg, floor, room, basement, windoor, rep, stratum, wave, starttm, stoptm, startdt, stopdt, activity, pcterr, adjwt, dupflag, zipflag, cntyfips, county
5187,MN,MN,27,55388, 5,1,1,3,Y, , 1, 4, 48,0900,0900,020288,020288,     0.0,     0.0,  1088.985661,0,0, 19,CARVER              
5225,MN,MN,27,56174, 5,1,1,3,Y, , 5, 4, 25,2300,1000,121487,121787,     0.0,     0.0,  1105.956867,0,0, 33,COTTONWOOD         
5558,MN,MN,27,55350, 5,1,1,3,Y, , 4, 3, 43,1809,1700,012088,012288,     0.0,     0.0,   990.411554,0,0, 85,MCLEOD 
""";

In [9]:
# These zero values have been selectively set to 0.1 in the R dataset (log(0.1) ≈ -2.302585)
# 107,-2.30258509299405,1,0.0958645715553609,10,CARVER
# 145,-2.30258509299405,1,0.339560320720446,17,COTTONWOOD

So the correct fix is to replace the activity with 0.1 **only for rows that have zero activity**, rather than adding 0.1 to every row.