In [1]:
# Modules for data manipulation
import polars as pl
from polars import col as c
import numpy as np

# Modules for modelling
from sklearn.ensemble import RandomForestRegressor
from crospint import create_model_pipeline

# Measuring execution time
import time

# Function to model distributions
from scipy.stats import laplace_asymmetric

# Import plotting functions
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'crospint'

In [None]:
# Define a function to retrieve the data from S3
def retrieve_transaction_data(type_housing_unit):
    """Download the open-data DVF transaction table for one type of housing unit.

    Parameters
    ----------
    type_housing_unit : str
        Either "houses" or "flats"; it is interpolated into the name of the
        remote parquet file to read.

    Returns
    -------
    polars.DataFrame
        The content of the remote parquet file, as read by ``pl.read_parquet``.

    Raises
    ------
    AssertionError
        If ``type_housing_unit`` is not one of the two accepted values.
    """
    accepted_types = ["houses", "flats"]
    assert type_housing_unit in accepted_types, "type_housing_unit must be either 'houses' or 'flats'"
    parquet_url = f"https://minio.lab.sspcloud.fr/oliviermeslin/diffusion/DVF/transaction_data_{type_housing_unit}_open_data.parquet"
    return pl.read_parquet(parquet_url)

In [None]:
# Fetch the transaction table for flats from the remote parquet file
df_transactions = retrieve_transaction_data(type_housing_unit="flats")

In [None]:
# Preview the first 10 rows of the transaction table
df_transactions.head(10)

In [None]:
# Add the modelling target: the natural logarithm of the transaction amount
# divided by the floor area, stored in the "log_price_sqm" column
df_transactions = df_transactions.with_columns(
    np.log(pl.col("transaction_amount") / pl.col("floor_area")).alias("log_price_sqm")
)

In [None]:
# Define the pipeline hyperparameters. Keys use the "<step>__<parameter>"
# naming expected by model.set_params(). The "coord_rotation" and
# "date_conversion" steps are presumably provided by the crospint pipeline;
# confirm their exact semantics against the crospint documentation.
params = {
    "coord_rotation__coordinates_names": ("x", "y"),
    "coord_rotation__number_axis": 11,
    "date_conversion__date_name": "transaction_date",
    "date_conversion__reference_date": "2010-01-01",
    "model__n_estimators": 120,
    "model__min_samples_split": 20,
    "model__min_samples_leaf": 10,
    "model__n_jobs": -1,
    "model__max_features": 0.4,
    "model__verbose": 3,
    # OOB predictions are needed later to compute out-of-bag residuals
    "model__oob_score": True,
    # Fixed seed so that the forest is reproducible across runs
    "model__random_state": 20230516
}

# Features passed to the pipeline. The transaction date is included because the
# date_conversion step is configured on "transaction_date". The transaction
# amount must NOT be a feature: together with floor_area it determines the
# target log(transaction_amount / floor_area) exactly, which would leak the
# target and make the residuals useless for outlier detection.
model_features = ['floor_area', 'transaction_date', 'x', 'y', 'seashore_distance']
target_variable_name = "log_price_sqm"

In [None]:
# Wrap a default RandomForestRegressor in the crospint modelling pipeline
model = create_model_pipeline(
    model=RandomForestRegressor(),
)

In [None]:
# Apply the hyperparameters defined in `params` to the pipeline steps
model.set_params(**params)

In [None]:
# Fit the pipeline on the selected features and report the elapsed
# time measured with time.perf_counter()
start_time = time.perf_counter()
model.fit(
    df_transactions[model_features],
    # scikit-learn expects a 1-D target array
    df_transactions.select(target_variable_name).to_numpy().ravel(),
)
end_time = time.perf_counter()
print(f'    Training of the outlier detection model took {end_time - start_time}')


In [None]:
# Out-of-bag score of the fitted "model" step (computed because
# "model__oob_score" is set to True in the hyperparameters)
model["model"].oob_score_

In [None]:
# Attach the out-of-bag predictions to the transactions, then derive the
# residual as the observed target minus the OOB prediction. Two separate
# with_columns calls are required because the residual reads the column
# created by the first one.
df_transactions2 = (
    df_transactions
    .with_columns(
        pl.Series("price_prediction_oob", model["model"].oob_prediction_)
    )
    .with_columns(
        (pl.col(target_variable_name) - pl.col("price_prediction_oob")).alias("residual_oob")
    )
)

In [None]:
# Estimate the parameters (shape kappa, location, scale) of an asymmetric
# Laplace distribution from the OOB residuals, and print the estimates
kappa_la, loc_la, scale_la = laplace_asymmetric.fit(
    df_transactions2.get_column("residual_oob")
)
print(kappa_la, loc_la, scale_la)

In [None]:
# Q-Q plot comparing the OOB residuals with the fitted asymmetric Laplace
# distribution, used to choose appropriate thresholds for cleaning
x_r1 = [q / 10000 for q in range(1, 10000)]  # probabilities 0.0001 ... 0.9999
x_r100 = np.linspace(-4, 4, 10000)  # abscissae of the identity reference line

# Distribution frozen at the parameters estimated from our data, so that the
# parameters are not repeated in every quantile computation
fitted_laplace = laplace_asymmetric(kappa_la, loc=loc_la, scale=scale_la)

plt.close()
fig, ax = plt.subplots()
fig.set_figwidth(7)

# x: quantiles of the fitted Laplace distribution
# y: actual quantiles of our data (OOB residuals)
ax.plot(
    fitted_laplace.ppf(x_r1),
    np.quantile(df_transactions2["residual_oob"], x_r1),
    "k",
)

# Identity line: the curve above follows it where the fit is good
ax.plot(x_r100, x_r100, "--")

# Vertical markers at the 1% and 99.9% quantiles of the fitted distribution
ax.axvline(x=fitted_laplace.ppf(0.01), color='red', linestyle='--')
ax.axvline(x=fitted_laplace.ppf(0.999), color='green', linestyle='--')

# Aesthetics
ax.set_xlim(-2.5, 2.5)
ax.set_ylim(-4.5, 4)
# Labels are anchored at a slightly lower probability than the line they
# describe, presumably to keep the text clear of the marker line
ax.annotate(
    'P01',
    (fitted_laplace.ppf(0.009), 0.9),
    xycoords=('data', 'axes fraction'),
    ha='right',
    va='top',
    rotation=90)
ax.annotate(
    'P99.9',
    (fitted_laplace.ppf(0.9985), 0.9),
    xycoords=('data', 'axes fraction'),
    ha='right',
    va='top',
    rotation=90)
ax.set_title("Q-Q plot: OOB residuals vs fitted asymmetric Laplace")
ax.set_xlabel("Quantiles of asymmetric Laplace distribution")
ax.set_ylabel("Quantiles of OOB residuals distribution")

ax.grid(True)
plt.show()

In [None]:
# Cleaning thresholds: the 1% and 99.9% quantiles of the asymmetric Laplace
# distribution frozen at the fitted parameters
fitted_laplace = laplace_asymmetric(kappa_la, loc=loc_la, scale=scale_la)
th_low = fitted_laplace.ppf(0.01)
th_high = fitted_laplace.ppf(0.999)