# 0. Settings

In [None]:
# Notebook-wide settings -- edit these before running the cells below.

# Locality identifier; passed to set_locality() in the setup cell.
locality = "nc-guilford"

# When True, delete_checkpoints("2-clean") is called before any step runs.
clear_checkpoints = False

# Forwarded as the "verbose" entry of every checkpointed pipeline step.
verbose = True

# 1. Basic setup

In [None]:
# Project-local notebook bootstrap; it runs before the openavmkit imports below.
# NOTE(review): what setup_environment() actually configures is defined in
# init_notebooks and is not visible from this notebook -- confirm there.
from init_notebooks import setup_environment
setup_environment()

In [None]:
# Standard-library, pandas, and openavmkit imports

import os
import pickle
import pandas as pd
from openavmkit.pipeline import (
    NotebookState, 
    set_locality,
    load_settings,
    examine_sup,
    process_sales,
    mark_ss_ids_per_model_group_sup,
    run_sales_scrutiny_per_model_group_sup
)
from openavmkit.cleaning import (
    sup_fill_unknown_values
)
from openavmkit.checkpoint import (
    from_checkpoint,
    delete_checkpoints,
    write_checkpoint
)
from openavmkit.horizontal_equity_study import (
    mark_horizontal_equity_clusters_per_model_group_sup
)

In [None]:
# Guarded initialisation: `nbs` is reset to None only when `inited` is absent
# from globals(), so re-running this cell in a live kernel passes the existing
# state object back to set_locality() instead of a fresh None.
if 'inited' not in globals():
    nbs: NotebookState = None
    inited = True
# Hand the previous state and the chosen locality to set_locality() and keep
# whatever state it returns.
nbs = set_locality(nbs, locality)
# NOTE(review): load_settings() takes no arguments here; presumably it reads
# the settings of the locality just selected -- confirm in openavmkit.pipeline.
settings = load_settings()

In [None]:
# Optional clean slate: only runs when the clear_checkpoints setting is True.
# "2-clean" is the string that starts every checkpoint name used in this
# notebook. NOTE(review): presumably delete_checkpoints matches it as a
# prefix -- confirm in openavmkit.checkpoint.
if clear_checkpoints:
    delete_checkpoints("2-clean")

## 1.1. Load data

In [None]:
# Load the pickled `sales_univ` object. The path is relative to the current
# working directory, so this cell depends on where the notebook is running.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that were generated by this project's own pipeline.
with open("out/sales_univ.pickle", "rb") as file:
    sales_univ = pickle.load(file)

## 1.2. Fill unknowns in data

In [None]:
# Replace sales_univ with the output of sup_fill_unknown_values, configured by
# the loaded settings.
# NOTE(review): which fields are filled, and with what values, is decided
# inside openavmkit.cleaning -- not visible from this notebook.
sales_univ = sup_fill_unknown_values(sales_univ, settings)

In [None]:
# Inspection step: the return value is not assigned, so later cells do not
# depend on it.
# NOTE(review): presumably prints a summary of the data -- confirm in
# openavmkit.pipeline.
examine_sup(sales_univ, settings)

# 2. Enrichment

In [None]:
# Step 00: run mark_horizontal_equity_clusters_per_model_group_sup through the
# checkpoint helper and rebind sales_univ to the result.
# NOTE(review): from_checkpoint presumably reuses a stored result when the
# named checkpoint already exists -- confirm in openavmkit.checkpoint.
sales_univ = from_checkpoint(
    "2-clean-00-horizontal-equity",
    mark_horizontal_equity_clusters_per_model_group_sup,
    {"sup": sales_univ, "settings": settings, "verbose": verbose},
)

## 2.1. Process sales
- Select only valid sales
- Calculate time-adjusted sale prices

In [None]:
# Step 01: run process_sales through the checkpoint helper and rebind
# sales_univ to the result.
# NOTE(review): from_checkpoint presumably reuses a stored result when the
# named checkpoint already exists -- confirm in openavmkit.checkpoint.
sales_univ = from_checkpoint(
    "2-clean-01-process_sales",
    process_sales,
    {"sup": sales_univ, "settings": settings, "verbose": verbose},
)

## 2.2. Sales scrutiny
- Runs a sales-validity heuristic over sales clusters
- Identifies sales that are anomalously high or low for their local cluster and flags them

In [None]:
# Step 02: run mark_ss_ids_per_model_group_sup through the checkpoint helper
# and rebind sales_univ to the result.
# NOTE(review): from_checkpoint presumably reuses a stored result when the
# named checkpoint already exists -- confirm in openavmkit.checkpoint.
sales_univ = from_checkpoint(
    "2-clean-02-mark-ss_ids",
    mark_ss_ids_per_model_group_sup,
    {"sup": sales_univ, "settings": settings, "verbose": verbose},
)

In [None]:
# Step 03: run run_sales_scrutiny_per_model_group_sup through the checkpoint
# helper and rebind sales_univ to the result.
# NOTE(review): from_checkpoint presumably reuses a stored result when the
# named checkpoint already exists -- confirm in openavmkit.checkpoint.
sales_univ = from_checkpoint(
    "2-clean-03-sales-scrutiny",
    run_sales_scrutiny_per_model_group_sup,
    {"sup": sales_univ, "settings": settings, "verbose": verbose},
)

In [None]:
# Write the sales_univ produced by the steps above to the "2-clean-04-out"
# checkpoint.
# NOTE(review): presumably the hand-off point for the next notebook -- confirm.
write_checkpoint(sales_univ, "2-clean-04-out")

In [None]:
# Lift pandas' column display limit so the frame below is rendered in full.
pd.set_option("display.max_columns", None)
df_univ = sales_univ["universe"]
# Last expression of the cell: the universe rows that contain at least one
# missing value in any column.
df_univ[df_univ.isna().any(axis="columns")]