# 0. Settings

In [None]:
# Notebook-wide settings -- edit these before running the cells below.

locality: str = "nc-guilford"  # locality identifier handed to set_locality()
verbose: bool = True  # forwarded as "verbose" to the checkpointed pipeline steps
clear_checkpoints: bool = False  # True -> delete_checkpoints("2-clean") is called first

# 1. Basic setup

In [None]:
# Prepare the notebook environment; this runs before the openavmkit imports.
# NOTE(review): what setup_environment() configures is defined in
# init_notebooks and is not visible from this notebook -- confirm there.
from init_notebooks import setup_environment
setup_environment()

In [None]:
# import a bunch of stuff

import os
import pickle

import pandas as pd

from openavmkit.pipeline import (
    NotebookState,
    set_locality,
    load_settings,
    examine_sup,
    process_sales
)
from openavmkit.cleaning import (
    sup_fill_unknown_values
)
from openavmkit.checkpoint import (
    from_checkpoint,
    delete_checkpoints,
    write_checkpoint
)
from openavmkit.horizontal_equity_study import (
    mark_horizontal_equity_clusters_per_model_group
)
from openavmkit.sales_scrutiny_study import (
    run_sales_scrutiny_per_model_group
)

In [None]:
# First execution in this kernel only: start with an empty notebook state and
# set the `inited` flag, so re-running this cell keeps the existing `nbs`
# instead of resetting it to None.
if "inited" not in globals():
    nbs: NotebookState = None
    inited = True

# Hand the current state and the configured locality to set_locality(), keep
# whatever state it returns, then load the settings used by every later step.
nbs = set_locality(nbs, locality)
settings = load_settings()

In [None]:
# Opt-in reset controlled by the `clear_checkpoints` setting at the top.
# NOTE(review): presumably removes every saved checkpoint whose name starts
# with "2-clean", forcing the from_checkpoint() steps below to recompute --
# confirm against delete_checkpoints().
if clear_checkpoints:
    delete_checkpoints("2-clean")

## 1.1. Load data

In [None]:
# Read the pickled sales/universe data from out/sales_univ.pickle.
# NOTE: pickle.load executes arbitrary code embedded in the file, so only
# load pickles that were produced by a trusted run of this project.
with open("out/sales_univ.pickle", "rb") as file:
    sales_univ = pickle.load(file)

## 1.2. Fill unknowns in data

In [None]:
# Fill unknown values in the loaded data, driven by `settings`; the returned
# object replaces sales_univ.
# NOTE(review): which fields are filled, and with what, is decided inside
# sup_fill_unknown_values -- not visible here.
sales_univ = sup_fill_unknown_values(sales_univ, settings)

In [None]:
# Inspect the filled data. Nothing is assigned, so this call is made only for
# whatever it reports.
# NOTE(review): presumably prints a summary of the data -- confirm in examine_sup.
examine_sup(sales_univ, settings)

# 2. Enrichment

In [None]:
# Work on the "universe" table of the loaded data.
df_universe = sales_univ["universe"]

# Row counts are printed before and after so a change in size is easy to spot.
print(f"BEFORE = {len(df_universe)}")

# Mark horizontal-equity clusters per model group; the call goes through
# from_checkpoint under the name "2-clean-00-horizontal-equity".
df_universe = from_checkpoint(
    "2-clean-00-horizontal-equity",
    mark_horizontal_equity_clusters_per_model_group,
    {
        "df_in": df_universe,
        "settings": settings,
        "verbose": verbose,
    },
)

print(f"AFTER = {len(df_universe)}")

# Store the resulting table back so later steps see it.
sales_univ["universe"] = df_universe

# Process sales
- Selects only valid sales
- Adds new fields for time-adjusted sales

In [None]:
# Select the valid sales and add the time-adjusted sale fields, going through
# from_checkpoint under the name "2-clean-01-process_sales".
#
# The result has to be written back to sales_univ: the following cells read
# sales_univ["sales"] and write sales_univ out, so binding the result only to
# `sup` would leave them working on the unprocessed data.
sales_univ = from_checkpoint("2-clean-01-process_sales", process_sales,
    {
        "sup": sales_univ,
        "settings": settings,
        "verbose": verbose
    }
)

# Keep the previous name bound too, in case anything else still refers to it.
sup = sales_univ

# Sales scrutiny
- Runs the sales scrutiny step on the sales data, per model group

In [None]:
# Run sales scrutiny per model group on the "sales" table, going through
# from_checkpoint under the name "2-clean-02-sales-scrutiny".
#
# The input and the output both use df_sales: the step must receive the sales
# table that was just read (not an unrelated `df`), and its result -- not the
# untouched input -- is what gets stored back into sales_univ.
# Requires run_sales_scrutiny_per_model_group to be imported in the imports cell.
df_sales = sales_univ["sales"]

df_sales = from_checkpoint("2-clean-02-sales-scrutiny", run_sales_scrutiny_per_model_group,
    {
        "df_in": df_sales,
        "settings": settings,
        "verbose": verbose
    }
)

sales_univ["sales"] = df_sales

In [None]:
# Save the cleaned data under the checkpoint name "2-clean-03-out".
# write_checkpoint lives in openavmkit.checkpoint and must be imported in the
# imports cell alongside from_checkpoint / delete_checkpoints.
write_checkpoint(sales_univ, "2-clean-03-out")

In [None]:
# NOTE(review): everything in this cell is disabled scratch code, kept for
# reference only. It depends on `gdf`, which is not defined in any cell above,
# so it cannot be re-enabled as-is.

# from openavmkit.utilities.settings import get_fields_categorical, get_fields_boolean, get_fields_numeric
# from openavmkit.data import boolify_series

# gdf = gdf.rename(columns={"key_primary":"key", "key_secondary":"key2"})
# gdf2 = gdf[~gdf["key"].isin(df["key"])]
# gdf2

# cols = [col for col in df.columns if col in gdf2]

# gdf2 = gdf2[cols]
# gdf2

# df3 = pd.concat([df, gdf2])
# df3

# fields_cat = get_fields_categorical(settings, include_boolean=False)
# fields_bool = get_fields_boolean(settings)
# fields_num = get_fields_numeric(settings, include_boolean=False)

# for col in df3.columns:
#     if col in fields_cat:
#         df3[col] = df3[col].astype("string")
#     elif col in fields_bool:
#         df3[col] = boolify_series(df3[col])
#     elif col in fields_num:
#         df3[col] = df3[col].astype("Float64")

# df3.loc[df3["model_group"].ne("residential_sf"),"model_group"] = None
# df3.to_parquet("universe_merge.parquet")