# Preliminaries

In [None]:
!python --version

<br>

## Environment

In [None]:
import os
import pathlib

<br>

Set path

In [None]:
# Make the parent directory (the project root) the working directory;
# presumably this is what lets the project-local `src` package be imported.
# The guard keeps the cell idempotent: once `src` is visible from the current
# directory, re-running the cell no longer climbs one level higher each time.
if not os.path.isdir(os.path.join(os.getcwd(), 'src')):
    os.chdir(path=str(pathlib.Path(os.getcwd()).parent))

In [None]:
os.getcwd()

<br>

## Libraries

In [None]:
%matplotlib inline

In [None]:
import logging
import collections

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import arviz as az
import pymc as pm
import xarray as xr
import aesara.tensor as at
import graphviz

<br>

Versions

In [None]:
az.__version__

In [None]:
pm.__version__

In [None]:
np.__version__

<br>

## Custom

In [None]:
import src.graphics.settings
import src.graphics.sketch

<br>

Aesthetics

In [None]:
# Instantiate the project's graphics settings helper, then apply its layout and
# aesthetics configuration.
# NOTE(review): what each call configures is defined in src/graphics/settings.py,
# which is not visible here — presumably display/plot options; confirm.
settings = src.graphics.settings.Settings()

settings.layout()
settings.aesthetics()

<br>

For diagrams/figures

In [None]:
sketch = src.graphics.sketch.Sketch()

In [None]:
# Lightweight record of the three text annotations of a figure.
Labels = collections.namedtuple('Labels', ['title', 'xlabel', 'ylabel'])

<br>

## Settings

Seed

In [None]:
# Fixed seed for reproducible random draws; it has to be passed explicitly to
# every call that accepts a seed.
RANDOM_SEED = 8924

<br>

Graphing

In [None]:
az.style.use('arviz-darkgrid')

<br>

## Logging

In [None]:
# Root logging configuration: each record prints the message first, then a
# millisecond-resolution timestamp on the following line.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='\n%(message)s\n%(asctime)s.%(msecs)03d\n',
)

# Logger used throughout the notebook.
logger = logging.getLogger(__name__)

<br>
<br>

# Data

## Dwelling Level

In [None]:
# Dwelling-level radon measurements, read from the `srrs2.dat` data set that
# ships with PyMC.
try:
    data = pd.read_csv(filepath_or_buffer=pm.get_data('srrs2.dat'))
except FileNotFoundError as err:
    # Same exception type as before, but the message now names the missing file
    # and the original error is chained explicitly as the cause.
    raise Exception(f'{err.strerror}: {err.filename}') from err

In [None]:
data.columns

In [None]:
# Strip stray whitespace from the column names.  Reassigning instead of using
# `inplace=True` keeps the step explicit and safe to re-run.
data = data.rename(mapper=str.strip, axis='columns')

In [None]:
data.head()

<br>

Structuring; concatenating the `pure state` & `pure county` codes

* [FIPS States](https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code)
* [FIPS Counties](https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county)


In [None]:
# County key: the state code zero-padded to two characters followed by the
# county code zero-padded to three characters.
data.loc[:, 'fips'] = (
    data['stfips'].astype(str).str.zfill(2)
    .str.cat(data['cntyfips'].astype(str).str.zfill(3))
)
data.head()

<br>

## County Level

In [None]:
# County-level data, read from the `cty.dat` data set that ships with PyMC.
try:
    counties = pd.read_csv(filepath_or_buffer=pm.get_data('cty.dat'))
except FileNotFoundError as err:
    # Same exception type as before, but the message now names the missing file
    # and the original error is chained explicitly as the cause.
    raise Exception(f'{err.strerror}: {err.filename}') from err

In [None]:
# Same county key as in the dwelling-level frame: zero-padded state code (2)
# followed by zero-padded county code (3).
counties.loc[:, 'fips'] = (
    counties['stfips'].astype(str).str.zfill(2)
    .str.cat(counties['ctfips'].astype(str).str.zfill(3))
)

<br>

## Excerpting & Merging

Excerpt: Focus on Minnesota, MN, dwellings.

In [None]:
# Restrict the dwelling-level records to those whose state is 'MN'.
excerpt = data.loc[data['state'].eq('MN'), :]
excerpt.head()

<br>

Merge dwelling & county level [uranium] data.

In [None]:
# Attach the county-level `Uppm` values to each dwelling via the shared `fips`
# key; a left join keeps every dwelling, matched or not.
# NOTE(review): re-running this cell on its own merges `Uppm` a second time, and
# pandas then suffixes the clashing columns — re-run from the excerpt cell instead.
excerpt = pd.merge(excerpt, counties.loc[:, ['fips', 'Uppm']], how='left', on='fips')
excerpt.columns

In [None]:
excerpt.head()

<br>

## Duplicates

In [None]:
# Compare the frame's shape before and after dropping fully duplicated rows, and
# against the number of distinct `idnum` values.
# NOTE: each message logs a shape tuple, not a bare count.
logger.info(f'# of instances: {excerpt.shape}')
logger.info(f'# of unique instances: {excerpt.drop_duplicates().shape}')
logger.info(f"# of unique codes: {excerpt['idnum'].unique().shape}")

<br>

Hence

In [None]:
# Drop rows that are duplicated across every column.  Reassigning instead of
# using `inplace=True` keeps the step explicit and safe to re-run.
excerpt = excerpt.drop_duplicates()

# Shape of the distinct `idnum` values, for comparison with the row count.
excerpt['idnum'].unique().shape

<br>
<br>

# Explore

In [None]:
# Natural log of the radon activity.  The 0.1 offset keeps the logarithm finite
# for a zero reading (presumably why it is there — confirm).
excerpt.loc[:, 'ln_radon'] = np.log(excerpt['activity'].add(0.1))

In [None]:
# Histogram of ln(radon) over 25 bins, drawn on an axis supplied by the
# project's Sketch helper and labelled through the same helper.
# NOTE(review): the x-axis shows ln_radon values while its label reads 'bins' —
# consider a more descriptive label.
ax = sketch.figure(width=3.1, height=2.7)
ax.hist(excerpt.ln_radon, bins=25)
sketch.annotation(ax, Labels(title='', xlabel='bins', ylabel='frequency'))

<br>
<br>

# Model

## Simple

<br>

Snippets:

```python
len(coords.get('Dwelling'))
```

```python
with complete:
    logger.info(at.shape(epsilon).eval())
```

```python
epsilon.eval()
```

<br>

Add more notes:

* A simple linear regression model: an intercept, a gradient whereby the accompanying independent variable is *dwelling level*
* The dependent variable is *ln(radon)*
* Is the *intercept* implicit below?

<br>

<br>

Model

In [None]:
# Named coordinates for the model: the `Floors` dimension has two labels.
# NOTE(review): positions 0 ('Basement') and 1 ('Ground') presumably mirror the
# integer codes of `excerpt.floor` — confirm.
coords = {'Floors': ['Basement', 'Ground']}

In [None]:
# Model `complete`: ln(radon) of each dwelling is normally distributed around a
# mean that depends only on the dwelling's floor, with a single shared sigma.
with pm.Model(coords=coords) as complete:
    
    
    # The values of the <floor> field, registered as mutable model data along the `N` dimension.
    # NOTE(review): these values are used as integer indices into <initial> below, so they are
    # presumably 0/1 codes — confirm.
    floor = pm.Data(name='floor', value=excerpt.floor.values, dims='N', mutable=True)
    # Diagnostics: shape, tensor type and raw values of the registered data.
    logger.info(floor.get_value().shape)
    logger.info(floor.type())
    logger.info(floor.get_value())
    
    
    # <initial> has one element per label of the `Floors` coordinate (two here), each with a Normal(0, 10) prior
    initial = pm.Normal(name='initial', mu=0.0, sigma=10.0, dims='Floors')
    
    
    # shape(mu) === shape(floor): each observation picks the element of <initial> for its floor.
    # <mu> is a plain intermediate tensor; it is not registered as a named model variable.
    mu = initial[floor]
    
    
    # Likelihood: observed ln(radon) ~ Normal(mu, sigma), with an Exponential(1) prior on sigma
    sigma = pm.Exponential(name='sigma', lam=1.0)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=excerpt['ln_radon'].to_numpy(), dims='N')
    

<br>
<br>

Is this the correct seeding method/approach?

In [None]:
# Evaluate the model's initial point with a fixed seed.
# NOTE(review): presumably this seed only affects this evaluation; sampling calls
# take their own seed argument — confirm against the PyMC documentation.
complete.initial_point(seed=RANDOM_SEED)

<br>

Illustration of model

In [None]:
pm.model_to_graphviz(complete)

In [None]:
# Where the Graphviz source of the model diagram is written; the PDF is rendered from this file.
gv_path = os.path.join(os.getcwd(), 'notebooks', 'simple.gv')

# Build the model graph, draw every node as a circle, save the source, then render it with `dot`.
diagram = pm.model_graph.ModelGraph(model=complete).make_graph()
diagram.node_attr.update(shape='circle')
diagram.save(gv_path)
graphviz.render(engine='dot', format='pdf', filepath=gv_path);

<br>

An inspection of a [prior predictive distribution](https://www.pymc.io/projects/docs/en/stable/api/generated/pymc.sample_prior_predictive.html#pymc.sample_prior_predictive) sample

In [None]:
# Draw prior and prior-predictive samples from the model.  The seed is passed
# explicitly so that the draws, and every figure derived from them, are
# reproducible across kernel restarts.
with complete:
    inspection = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    

In [None]:
inspection.get('prior_predictive').keys()

In [None]:
type(inspection)

<br>
<br>

Hence

In [None]:
# Prior draws of the per-floor mean.  `initial` is the sampled variable that
# carries the `Floors` dimension; `mu` is only an intermediate tensor inside the
# model (never registered as a named variable), so it is not stored in the prior
# group and cannot be plotted from it.
ax = sketch.figure(width=3.7, height=2.5)
inspection.prior.plot.scatter(x='Floors', y='initial', alpha=0.35, ax=ax)
sketch.annotation(ax, Labels(title='', xlabel='floor', ylabel='mean(ln(radon))'))

<br>

Alternatively

In [None]:
prior = inspection.get('prior_predictive')

In [None]:
prior.keys()

In [None]:
# Average the prior-predictive values of `y` along axis 1, then show the
# variance of those averages.
# NOTE(review): axis 1 is presumably the `draw` dimension of a
# (chain, draw, N) array — confirm.
Y = prior.get('y').mean(axis=1)
Y.var()

In [None]:
np.squeeze(Y.values).shape

In [None]:
# ax = sketch.figure(width=3.7, height=2.5)