In [None]:
import pandas as pd
import numpy as np
import glob

# 1. Import & clean
# Load every yearly extract matching the pattern, harmonise the column names
# and stack everything into a single table.
motif = '../data/dvf20*.csv'
fichiers = sorted(glob.glob(motif))
if not fichiers:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which hides the real cause (wrong working directory / missing data).
    raise FileNotFoundError(f"Aucun fichier trouvé pour le motif {motif!r}")

# Alternative spellings seen in some extracts -> canonical column name.
renommages = {'insee_com': 'code_commune_insee', 'codepostal': 'code_postal'}

dfs = []
for f in fichiers:
    # Everything is read as text so that leading zeros in codes survive.
    dftmp = pd.read_csv(f, dtype=str)
    dftmp.columns = [col.strip().replace(" ", "_").replace("-", "_").replace("é", "e").replace("É", "E").lower() for col in dftmp.columns]
    # rename() silently ignores keys that are absent from this file.
    dftmp = dftmp.rename(columns=renommages)
    # Normalising/renaming can yield the same label twice; keep the first
    # occurrence *before* concatenating, because pd.concat cannot align
    # frames whose column labels are not unique.
    dftmp = dftmp.loc[:, ~dftmp.columns.duplicated()]
    dfs.append(dftmp)

df = pd.concat(dfs, ignore_index=True)

# Left-pad codes to 5 characters. Missing codes must stay missing:
# astype(str) alone would turn NaN into the literal key '00nan'.
for col in ['code_commune_insee', 'code_postal']:
    if col in df.columns:
        codes = df[col]
        df[col] = codes.astype(str).str.zfill(5).where(codes.notna())

# Unparseable years become 0 and are excluded later by the year filter.
df['annee'] = pd.to_numeric(df['annee'], errors='coerce').fillna(0).astype(int)

# Counts and shares: a missing value is treated as zero.
for c in ['nb_mutations', 'nbmaisons', 'nbapparts', 'propmaison', 'propappart']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)

# Averages: a missing value is NOT zero. Keeping NaN lets downstream
# dropna() discard these rows instead of learning from fake 0 prices.
for c in ['prixmoyen', 'prixm2moyen', 'surfacemoy']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

colonnes_utiles = [
    'code_commune_insee', 'annee', 'nb_mutations', 'nbmaisons', 'nbapparts',
    'propmaison', 'propappart', 'prixmoyen', 'prixm2moyen', 'surfacemoy'
]
# Keep only the useful columns that actually exist, then drop exact duplicates.
df = df[[col for col in colonnes_utiles if col in df.columns]].drop_duplicates()

# Export the cleaned price table
df.to_csv('df_prix_clean.csv', index=False, encoding='utf-8')
print("Export : df_prix_clean.csv")

In [None]:
# 2. Merge with commune coordinates
df_coord = pd.read_csv('../data/ref_espace_communes.csv', dtype=str)
df_coord.columns = [col.strip().replace(" ", "_").replace("-", "_").replace("é", "e").replace("É", "E").lower() for col in df_coord.columns]

# Locate the INSEE code column. Prefer the canonical name when it is already
# there: renaming another "insee" column onto it would create a duplicate label.
if 'code_commune_insee' not in df_coord.columns:
    colonnes_insee = [col for col in df_coord.columns if "insee" in col]
    if not colonnes_insee:
        raise KeyError("Aucune colonne INSEE trouvée dans '../data/ref_espace_communes.csv'")
    df_coord = df_coord.rename(columns={colonnes_insee[0]: 'code_commune_insee'})

# Left-pad codes to 5 characters while keeping missing codes missing
# (astype(str) alone would turn NaN into the literal key '00nan').
codes_coord = df_coord['code_commune_insee']
df_coord['code_commune_insee'] = codes_coord.astype(str).str.zfill(5).where(codes_coord.notna())

# A referential row without a code cannot identify a commune; dropping it also
# prevents it from being joined onto price rows whose code is missing.
df_coord = df_coord.dropna(subset=['code_commune_insee'])
df_coord = df_coord.drop_duplicates(subset=['code_commune_insee'])

# Left join: every price row is kept; communes absent from the referential get
# NaN coordinates. validate= asserts the referential has one row per code.
df_merged = pd.merge(
    df,
    df_coord[['code_commune_insee', 'latitude', 'longitude']],
    on='code_commune_insee',
    how='left',
    validate='many_to_one',
)
df_merged['latitude'] = pd.to_numeric(df_merged['latitude'], errors='coerce')
df_merged['longitude'] = pd.to_numeric(df_merged['longitude'], errors='coerce')

# Export the merged table
df_merged.to_csv('df_merged_clean.csv', index=False, encoding='utf-8')
print("Export : df_merged_clean.csv")

In [None]:
# Start again from the merged frame, so re-running this cell is idempotent.
# NOTE: this rebinds `df`; re-running the merge cell afterwards would merge
# onto this frame rather than onto the cleaned price table.
df = df_merged.copy()

cles = ['code_commune_insee', 'annee']

# Restrict to 2014-2024 and to rows usable for modelling.
df = df.dropna(subset=['code_commune_insee', 'annee', 'prixm2moyen', 'latitude', 'longitude'])
df = df[(df['annee'] >= 2014) & (df['annee'] <= 2024)]
# A non-positive price per m2 is not a usable observation: it would poison the
# lag features and make any later percentage change divide by zero.
df = df[df['prixm2moyen'] > 0]
# The lag lookup below needs exactly one row per commune and year; when several
# exist, the last one in the current row order is kept.
df = df.drop_duplicates(subset=cles, keep='last').sort_values(cles)

# Add 5 yearly lags of the price per m2.
# Each lag is looked up by (commune, year - lag) instead of shifting rows, so a
# commune with a missing year gets NaN for that lag rather than the value of an
# older year silently sliding into its place.
prix_par_cle = df.set_index(cles)['prixm2moyen']
for lag in range(1, 6):
    cible = pd.MultiIndex.from_arrays([df['code_commune_insee'], df['annee'] - lag])
    df[f'prixm2moyen_lag{lag}'] = prix_par_cle.reindex(cible).to_numpy()

In [None]:
from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Model features: year, coordinates and the five price lags.
colonnes_lags = [f'prixm2moyen_lag{lag}' for lag in range(1, 6)]
features = ['annee', 'latitude', 'longitude'] + colonnes_lags

# Training rows: years up to 2023 with a complete lag history and coordinates.
df_train = df[df['annee'] <= 2023].dropna(subset=colonnes_lags + ['latitude', 'longitude'])
X = df_train[features]
y = df_train['prixm2moyen']

# Hold-out test set: never seen during fitting or early stopping.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, random_state=42)
# Separate validation set carved out of the training part. Early stopping picks
# the iteration that is best on this set, so scoring the model on the same rows
# would give optimistic metrics; the test set above stays untouched instead.
X_fit, X_valid, y_fit, y_valid = train_test_split(X_train, y_train, test_size=0.12, random_state=42)
# NOTE(review): both splits are random across years; a split by year would be a
# stricter check of true forecasting ability - consider it.

# Single source of truth for the patience used by the callback and the message.
patience = 20

model = LGBMRegressor(
    n_estimators=350,
    objective='regression',
    random_state=42,
    verbose=-1
)
model.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    eval_metric='mae',
    callbacks=[early_stopping(patience)]
)

# Readable summary
best_iter = model.best_iteration_
# Scores on the validation set ('valid_0') at the best iteration.
best_mae = model.best_score_['valid_0']['l1']
best_mse = model.best_score_['valid_0']['l2']
# "Global" metrics are computed on the untouched test set.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("\n================ Résumé entraînement LightGBM ================")
print(f"👉 Early stopping : arrêt automatique après {patience} itérations sans amélioration.")
print(f"👉 Meilleure itération atteinte : {best_iter}")
print(f"    - Erreur absolue moyenne (MAE, l1) sur validation : {best_mae:.2f} €")
print(f"    - Erreur quadratique moyenne (MSE, l2) sur validation : {best_mse:.2f}")
print("---------------------------------------------------------------")
print(f"MAE  global : {mae:.2f} €")
print(f"RMSE global : {rmse:.2f} €")
print(f"R²   global : {r2:.3f}")
print("===============================================================\n")

In [None]:
# For each commune, start from the 2020-2024 prices to predict 2025, then roll
# the forecast forward one year at a time.
colonnes_lags = [f'prixm2moyen_lag{lag}' for lag in range(1, 6)]
colonnes_modele = ['annee', 'latitude', 'longitude'] + colonnes_lags

# 2024 rows together with the lag features already stored on `df`, so that
# inference uses exactly the same lag definition as training.
# .copy() makes this an independent frame rather than a slice of `df`.
df_2024 = df.loc[
    df['annee'] == 2024,
    ['code_commune_insee', 'latitude', 'longitude', 'prixm2moyen'] + colonnes_lags
].copy()

# On a 2024 row, lag1..lag5 hold the 2023..2019 prices. Predicting 2025 needs
# the 2024..2020 prices, i.e. the observed 2024 price plus lag1..lag4.
df_pred = df_2024.dropna(subset=['latitude', 'longitude', 'prixm2moyen'] + colonnes_lags[:4]).copy()
df_pred['prixm2moyen_2024'] = df_pred['prixm2moyen'].astype(float)

# Advance the window by one year BEFORE the first prediction. Without this step
# the 2025 forecast would be built from the 2019-2023 history and the observed
# 2024 price would never enter the model.
for lag in range(5, 1, -1):
    df_pred[f'prixm2moyen_lag{lag}'] = df_pred[f'prixm2moyen_lag{lag-1}']
df_pred['prixm2moyen_lag1'] = df_pred['prixm2moyen_2024']

for an in range(2025, 2030):
    # Same columns, in the same order, as the training features.
    X_pred = df_pred.assign(annee=an)[colonnes_modele]
    df_pred[f'prixm2moyen_{an}_pred'] = model.predict(X_pred)
    # Shift the lags for the following year (rolling forecast): the value just
    # predicted becomes lag1, every older lag moves back by one.
    for lag in range(5, 1, -1):
        df_pred[f'prixm2moyen_lag{lag}'] = df_pred[f'prixm2moyen_lag{lag-1}']
    df_pred['prixm2moyen_lag1'] = df_pred[f'prixm2moyen_{an}_pred']

In [None]:
import plotly.express as px
import json
import urllib.request

# Final columns and 2024 -> 2029 variation
colonnes_pred = [f'prixm2moyen_{an}_pred' for an in range(2025, 2030)]
cols_out = ['code_commune_insee', 'latitude', 'longitude', 'prixm2moyen_2024'] + colonnes_pred
# .copy(): the column added below must be written to an independent frame,
# not to a slice of the previous df_pred.
df_pred = df_pred[cols_out].copy()
prix_base = df_pred['prixm2moyen_2024']
# A zero base price makes the ratio +/-inf; turn it into NaN so it cannot
# stretch the colour range or end up as "inf" in the export.
df_pred['variation_%'] = (
    100 * (df_pred['prixm2moyen_2029_pred'] - prix_base) / prix_base
).replace([np.inf, -np.inf], np.nan)

# Map
geojson_url = "https://france-geojson.gregoiredavid.fr/repo/communes.geojson"
# The timeout keeps an unresponsive server from blocking the notebook forever.
with urllib.request.urlopen(geojson_url, timeout=60) as response:
    communes_geojson = json.load(response)

df_pred['code_commune_insee'] = df_pred['code_commune_insee'].astype(str).str.zfill(5)


def prix_fmt(val):
    """Format a price as '€ <rounded integer>', or 'NA' when it is missing."""
    return f"€ {int(round(val))}" if pd.notna(val) else "NA"


# Display-only copy: the formatted hover columns live here under their own
# names, so they are not confused with the numeric columns and never reach the
# exported CSV.
colonnes_prix = ['prixm2moyen_2024'] + colonnes_pred
df_carte = df_pred.copy()
for col in colonnes_prix:
    df_carte[f'{col}_fmt'] = df_carte[col].apply(prix_fmt)
df_carte['variation_fmt'] = df_carte['variation_%'].round(1)
# Order matters: it defines customdata[0..6] used by the hover template.
custom_data = [f'{col}_fmt' for col in colonnes_prix] + ['variation_fmt']

fig = px.choropleth_map(
    df_carte,
    geojson=communes_geojson,
    locations='code_commune_insee',
    featureidkey='properties.code',
    color='variation_%',
    color_continuous_scale="RdYlGn",
    range_color=(df_carte['variation_%'].min(), df_carte['variation_%'].max()),
    center={"lat": 46.6, "lon": 2.6},
    zoom=5,
    opacity=0.80,
    hover_name='code_commune_insee',
    hover_data=None,
    custom_data=custom_data,
    title="Variation % du prix au m² entre 2024 et 2029 (LightGBM, rolling trend historique)"
)
fig.update_layout(
    autosize=False,
    width=1200,
    height=900,
    margin={"r":0,"t":50,"l":0,"b":0},
    legend_title_text='Variation (%)',
    # The colour bar of a continuous scale is titled through the colour axis,
    # not through the legend title.
    coloraxis_colorbar_title_text='Variation (%)',
    font=dict(size=16),
    title_x=0.5,
    updatemenus=[dict(type="buttons", showactive=False,
        buttons=[dict(label="Plein écran", method="relayout", args=[{"width":1800, "height":1000}])])]
)
fig.update_traces(
    hovertemplate=
        "<b>INSEE = %{location}</b><br><br>" +
        "Prix 2024 = %{customdata[0]}<br>" +
        "Prix 2025 = %{customdata[1]}<br>" +
        "Prix 2026 = %{customdata[2]}<br>" +
        "Prix 2027 = %{customdata[3]}<br>" +
        "Prix 2028 = %{customdata[4]}<br>" +
        "Prix 2029 = %{customdata[5]}<br>" +
        "Variation 2024→2029 = %{customdata[6]} %<br>" +
        "<extra></extra>"
)
fig.show()

# Export the forecasts as CSV (numeric columns only)
df_pred.to_csv('predictions_prix_2025_2029_commune_lgbm.csv', index=False, encoding='utf-8')
print("Export : predictions_prix_2025_2029_commune_lgbm.csv")