In [None]:
# =========================
# Environment Setup
# =========================

# Install uv for dependency management
!pip install -q uv
# Retrieve the list of dependencies
!wget https://raw.githubusercontent.com/oliviermeslin/crospint/main/pyproject.toml
# Install dependencies
!uv pip install -r pyproject.toml
# Install crospint
!uv pip install crospint

## What is this notebook about?

This notebook illustrates the _conditional tail removal_ procedure defined in [Meslin, 2026]. 

⚠️⚠️⚠️
This notebook is computationally intensive. Read the section on "How to use this notebook" before running the code.  
⚠️⚠️⚠️

### The CTR procedure

The CTR procedure has four steps:

- A random forest is trained to predict the (log) price per square meter $p_{it}$. The model is written as: $p_{it} = h\left(g\left(x_{i}, y_{i}\right), d_i, t, S_i\right)$ with $p_{it}$ the log-price per square meter, $x_i, y_i$ the coordinates of housing unit $i$, $d_i$ the shortest distance to the sea, $t$ the transaction date and $S_i$ the floor area of the housing unit. The coordinate rotation $g$ is applied to the coordinates before the model is trained.
- Once the random forest is trained, residuals are computed for all transactions as the difference between the (log) price per square meter and the model out-of-bag prediction: $\hat{\epsilon}_{it} = p_{it} - \hat{h}^{\text{OOB}}\left(g\left(x_{i}, y_{i}\right), d_i, t, S_i\right)$
- Statistical tests are conducted to determine which reference distribution best fits the distribution of residuals. In the present case, the distribution of residuals closely matches an asymmetric Laplace distribution.
- Comparing the distribution of residuals with the reference distribution using a quantile-quantile plot makes it possible to identify the thresholds at which the tails of the distribution of residuals start to diverge from the reference distribution.

### How to use this notebook

In this notebook, the model hyperparameters are set to values compatible with proper outlier detection, but they may lead to a long training time, as training a random forest is computationally demanding and resources on Colab are limited. If you just want to run a quick test, I advise you to adjust the hyperparameters so that training is much faster:

- Reduce `model__n_estimators` to 40;
- Increase `model__min_samples_split` to 200;
- Increase `model__min_samples_leaf` to 100.

You can use data on either `houses` or `flats`: modify the line `df_transactions = retrieve_transaction_data("flats")` depending on what you want to look at.


In [None]:
# =========================
# The notebook starts here
# =========================

import polars as pl
from polars import col as c
import numpy as np

# Modules for modelling
from sklearn.ensemble import RandomForestRegressor
from crospint import create_model_pipeline

# Measuring execution time
import time

# Function to model distributions
from scipy.stats import laplace_asymmetric

# Import plotting functions
import matplotlib.pyplot as plt

In [None]:
# Define a function to retrieve the data from S3
def retrieve_transaction_data(type_housing_unit):
    """Load the open-data transaction table for one type of housing unit.

    Parameters
    ----------
    type_housing_unit : str
        Either "houses" or "flats"; selects which remote parquet file is read.

    Returns
    -------
    polars.DataFrame
        Content of the parquet file.

    Raises
    ------
    AssertionError
        If ``type_housing_unit`` is neither "houses" nor "flats".
    """
    allowed_types = ("houses", "flats")
    assert type_housing_unit in allowed_types, "type_housing_unit must be either 'houses' or 'flats'"

    # The housing type is part of the file name on the remote storage
    parquet_url = f"https://minio.lab.sspcloud.fr/oliviermeslin/diffusion/DVF/transaction_data_{type_housing_unit}_open_data.parquet"
    transactions = pl.read_parquet(parquet_url)
    return transactions

In [None]:
# Download data from S3 (flats here; pass "houses" to analyse houses instead)
df_transactions = retrieve_transaction_data("flats")

In [None]:
# Show the first 10 transactions to get a feel for the columns and values
df_transactions.head(10)

In [None]:
# Build the target: natural log of the price per square meter.
# The native polars Expr.log() (base e by default) is used instead of np.log so the
# computation stays inside the polars expression engine rather than going through
# NumPy ufunc dispatch.
df_transactions = (
    df_transactions
    .with_columns(
        log_price_sqm = (c.transaction_amount / c.floor_area).log()
    )
)

In [None]:
# List the available columns (log_price_sqm was added in the previous cell)
df_transactions.columns

In [None]:
# Model hyperparameters, keyed as "<pipeline step>__<parameter>"
params = {
    # Coordinate rotation step
    "coord_rotation__coordinates_names": ("x", "y"),
    "coord_rotation__number_axis": 11,

    # Date conversion step
    "date_conversion__date_name": "transaction_date",
    "date_conversion__reference_date": "2010-01-01",

    # Random forest: size of the forest and of the trees
    # (see "How to use this notebook" for faster settings)
    "model__n_estimators": 120,
    "model__min_samples_split": 20,
    "model__min_samples_leaf": 10,
    "model__max_features": 0.4,

    # Random forest: execution and reproducibility
    "model__n_jobs": -1,
    "model__verbose": 3,
    "model__oob_score": True,  # required: residuals are built from OOB predictions
    "model__random_state": 20230516,
}

# Columns used as predictors, and name of the column to predict
model_features = [
    "floor_area",
    "transaction_date",
    "x",
    "y",
    "seashore_distance",
]
target_variable_name = "log_price_sqm"

In [None]:
# Instantiate the model: the crospint pipeline built around a random forest regressor.
# The forest is created with default settings; hyperparameters are applied afterwards
# through set_params.
model = create_model_pipeline(model=RandomForestRegressor())

In [None]:
# Pass parameters to the model.
# Keys follow the "<step>__<parameter>" convention, so each value is presumably routed
# to the matching pipeline step (coord_rotation, date_conversion, model) — confirm
# against the crospint pipeline definition.
model.set_params(
    **params
)

In [None]:
# Train the model and report the wall-clock training time.
# The target is passed as a flat 1-D numpy array.
start_time = time.perf_counter()
model.fit(
    df_transactions.select(model_features),
    df_transactions.select(target_variable_name).to_numpy().ravel()
)
end_time = time.perf_counter()
# perf_counter() returns seconds: state the unit and round for readability
print(f'Training of the outlier detection model took {end_time - start_time:.1f} seconds')


In [None]:
# Out-of-bag score of the fitted forest (available because oob_score=True was set)
model["model"].oob_score_

In [None]:
# Attach the out-of-bag predictions to the transactions, then derive the residuals
# (observed log price per square meter minus OOB prediction).
# Two with_columns calls are needed: the residual refers to the column added by the first.
df_transactions2 = df_transactions.with_columns(
    pl.Series("price_prediction_oob", model["model"].oob_prediction_)
).with_columns(
    (pl.col(target_variable_name) - pl.col("price_prediction_oob")).alias("residual_oob")
)

In [None]:
# Fit an asymmetric Laplace distribution to the OOB residuals.
# scipy's fit returns the shape parameter (kappa) first, followed by loc and scale.
kappa_la, loc_la, scale_la = laplace_asymmetric.fit(
    df_transactions2["residual_oob"]
)
# Display the estimates in the order: kappa, loc, scale
print(kappa_la, loc_la, scale_la)

In [None]:
# Quantile-quantile plot of the OOB residuals against the fitted asymmetric Laplace
# distribution, used to find appropriate thresholds for cleaning
x_r1 = [k / 10000 for k in range(1, 10000)]  # probability levels 0.0001 ... 0.9999
x_r100 = np.linspace(-4, 4, 10000)           # support of the 45-degree reference line

# Freeze the distribution with the estimated parameters so they are written only once
fitted_la = laplace_asymmetric(kappa_la, loc=loc_la, scale=scale_la)

plt.close()
fig, ax = plt.subplots()
fig.set_figwidth(7)

# x: theoretical quantiles of the fitted distribution, y: empirical quantiles of the residuals
theoretical_quantiles = fitted_la.ppf(x_r1)
empirical_quantiles = np.quantile(df_transactions2["residual_oob"], x_r1)
ax.plot(theoretical_quantiles, empirical_quantiles, "k")

# 45-degree line: where the points would lie if both distributions coincided
ax.plot(x_r100, x_r100, "--")

# Vertical markers at the 1st and 99.9th percentiles of the fitted distribution
ax.axvline(x=fitted_la.ppf(0.01), color='red', linestyle='--')
ax.axvline(x=fitted_la.ppf(0.999), color='green', linestyle='--')

# Aesthetics
ax.set_xlim(-2.5, 2.5)
ax.set_ylim(-4.5, 4)

# Labels are anchored at slightly lower probability levels than the markers so that
# the text sits just to the left of each vertical line
for label, anchor_level in (('P01', 0.009), ('P99.9', 0.9985)):
    ax.annotate(
        label,
        (fitted_la.ppf(anchor_level), 0.9),
        xycoords=('data', 'axes fraction'),
        ha='right',
        va='top',
        rotation=90,
    )

ax.set_xlabel("Quantiles of asymmetric Laplace distribution")
ax.set_ylabel("Quantiles of OOB residuals distribution")

ax.grid(True)
plt.show()

In [None]:
# Compute thresholds: 1st and 99.9th percentiles of the fitted asymmetric Laplace
# distribution (the same levels as the vertical markers on the QQ-plot)
th_low, th_high = laplace_asymmetric.ppf(
    [0.01, 0.999], kappa_la, loc=loc_la, scale=scale_la
)