In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import json
import urllib.request
import glob
import plotly.express as px
import mlflow
import os
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
fichiers = sorted(glob.glob('../data/dvf20*.csv'))
dfs = []
for f in fichiers:
    dftmp = pd.read_csv(f, dtype=str)
    dftmp.columns = [col.strip().replace(" ", "_").replace("-", "_").replace("é", "e").replace("É", "E").lower() for col in dftmp.columns]
    if 'insee_com' in dftmp.columns:
        dftmp = dftmp.rename(columns={'insee_com': 'code_commune_insee'})
    if 'codepostal' in dftmp.columns:
        dftmp = dftmp.rename(columns={'codepostal': 'code_postal'})
    for col in dftmp.columns:
        if col != 'annee' and col.lower() == 'annee':
            dftmp = dftmp.rename(columns={col: 'annee'})
    dfs.append(dftmp)

df = pd.concat(dfs, ignore_index=True)
df = df.loc[:, ~df.columns.duplicated()]
for col in ['code_commune_insee', 'code_postal']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.zfill(5)
if 'annee' in df.columns and df['annee'].ndim == 1:
    df['annee'] = pd.to_numeric(df['annee'], errors='coerce').fillna(0).astype(int)

colonnes_utiles = [
    'code_commune_insee', 'code_postal', 'annee', 'nb_mutations', 'nbmaisons', 'nbapparts',
    'propmaison', 'propappart', 'prixmoyen', 'prixm2moyen', 'surfacemoy'
]
df = df[[col for col in colonnes_utiles if col in df.columns]]
df = df.drop_duplicates()

cols_num = ['nb_mutations', 'nbmaisons', 'nbapparts', 'propmaison', 'propappart',
            'prixmoyen', 'prixm2moyen', 'surfacemoy']
for col in cols_num:
    if col in df.columns and df[col].ndim == 1:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

df.to_csv('df_prix_clean.csv', index=False, encoding='utf-8')


df_prix = pd.read_csv('df_prix_clean.csv', dtype=str)
df_coord = pd.read_csv('../data/ref_espace_communes.csv', dtype=str)

df_prix.columns = [col.strip().replace(" ", "_").replace("-", "_").replace("é", "e").replace("É", "E").lower() for col in df_prix.columns]
df_coord.columns = [col.strip().replace(" ", "_").replace("-", "_").replace("é", "e").replace("É", "E").lower() for col in df_coord.columns]

col_insee_prix = [col for col in df_prix.columns if "insee" in col][0]
col_insee_coord = [col for col in df_coord.columns if "insee" in col][0]
df_prix[col_insee_prix] = df_prix[col_insee_prix].astype(str).str.zfill(5)
df_coord[col_insee_coord] = df_coord[col_insee_coord].astype(str).str.zfill(5)
df_prix = df_prix.drop_duplicates(subset=[col_insee_prix, 'annee'])
df_coord = df_coord.drop_duplicates(subset=[col_insee_coord])

df_merged = pd.merge(
    df_prix, df_coord,
    left_on=col_insee_prix,
    right_on=col_insee_coord,
    how='left',
    suffixes=('', '_coord')
)

if col_insee_coord in df_merged.columns and col_insee_coord != col_insee_prix:
    df_merged = df_merged.drop(columns=[col_insee_coord])
df_merged.to_csv('df_merged_clean.csv', index=False, encoding='utf-8')

In [2]:
df = pd.read_csv('df_merged_clean.csv', dtype=str)
df.columns = [c.lower() for c in df.columns]
for col in ['code_commune_insee', 'latitude', 'longitude']:
    if col in df.columns:
        df[col] = df[col].astype(str)
num_cols = ['annee', 'prixm2moyen', 'latitude', 'longitude']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=['code_commune_insee', 'annee', 'prixm2moyen', 'latitude', 'longitude'])
df = df[(df['annee'] >= 2014) & (df['annee'] <= 2024)]
df = df.sort_values(['code_commune_insee', 'annee'])
df['prixm2moyen_annee_moins1'] = df.groupby('code_commune_insee')['prixm2moyen'].shift(1)
df_train = df[df['annee'] <= 2023].dropna(subset=['prixm2moyen_annee_moins1'])

In [None]:
# features = ['latitude', 'longitude', 'annee', 'prixm2moyen_annee_moins1']
# X = df_train[features]
# y = df_train['prixm2moyen']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# pipe = Pipeline([
#     ('scaler', StandardScaler()),
#     ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
# ])
# pipe.fit(X_train, y_train)
# print("Performance sur test :")
# print("MAE :", np.round(np.abs(pipe.predict(X_test) - y_test).mean(), 2))
# print("R2 :", np.round(pipe.score(X_test, y_test), 3))

In [3]:
EXPERIMENT_NAME="01_oasis_real_estate_RFG"
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
mlflow.lightgbm.autolog()

with mlflow.start_run(experiment_id = experiment.experiment_id):
    test_size = 0.1
    random_state = 42
    
    features = ['latitude', 'longitude', 'annee', 'prixm2moyen_annee_moins1']
    X = df_train[features]
    y = df_train['prixm2moyen']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    n_estimators=200
    n_jobs=-1
    pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs))
    ])
    
    pipe.fit(X_train, y_train)
    print("Performance sur test :")
    print("MAE :", np.round(np.abs(pipe.predict(X_test) - y_test).mean(), 2))
    print("R2 :", np.round(pipe.score(X_test, y_test), 3))


2025/07/31 11:35:02 INFO mlflow.tracking.fluent: Experiment with name '01_oasis_real_estate_RFG' does not exist. Creating a new experiment.


Performance sur test :
MAE : 305.16
R2 : 0.692
🏃 View run stately-calf-866 at: https://oasisorg-oasis-mlflow.hf.space/#/experiments/878796768417794893/runs/acac9a64aa5146d596d51505f7fb6955
🧪 View experiment at: https://oasisorg-oasis-mlflow.hf.space/#/experiments/878796768417794893


In [None]:
df_2024 = df[df['annee'] == 2024][['code_commune_insee', 'latitude', 'longitude', 'prixm2moyen']]
df_2024 = df_2024.dropna(subset=['prixm2moyen', 'latitude', 'longitude'])

latitudes = df_2024['latitude'].astype(float).values
longitudes = df_2024['longitude'].astype(float).values
prixs = df_2024['prixm2moyen'].astype(float).values
codes_insee = df_2024['code_commune_insee'].values

for year in range(2025, 2030):
    X_pred = pd.DataFrame({
        'latitude': latitudes,
        'longitude': longitudes,
        'annee': np.full_like(prixs, year, dtype=float),
        'prixm2moyen_annee_moins1': prixs
    })
    prixs = pipe.predict(X_pred)

df_pred = pd.DataFrame({
    'code_commune_insee': codes_insee,
    'latitude': latitudes,
    'longitude': longitudes,
    'prixm2moyen_2024': df_2024['prixm2moyen'].astype(float).values,
    'prixm2moyen_2029_pred': prixs
})

# 6. Calcul du taux de variation en %
df_pred['variation_%'] = 100 * (df_pred['prixm2moyen_2029_pred'] - df_pred['prixm2moyen_2024']) / df_pred['prixm2moyen_2024']

# 7. Carte choroplèthe interactive
geojson_url = "https://france-geojson.gregoiredavid.fr/repo/communes.geojson"
with urllib.request.urlopen(geojson_url) as response:
    communes_geojson = json.load(response)

df_pred['code_commune_insee'] = df_pred['code_commune_insee'].astype(str).str.zfill(5)

fig = px.choropleth_map(
    df_pred,
    geojson=communes_geojson,
    locations='code_commune_insee',
    featureidkey='properties.code',
    color='variation_%',
    color_continuous_scale="RdYlGn",
    range_color=(df_pred['variation_%'].min(), df_pred['variation_%'].max()),
    center={"lat": 46.6, "lon": 2.6},
    zoom=5,
    opacity=0.75,
    hover_name='code_commune_insee',
    hover_data={
        'prixm2moyen_2024': ':.0f',
        'prixm2moyen_2029_pred': ':.0f',
        'variation_%': ':.1f'
    },
    title="Variation % du prix au m² entre 2024 et 2029 (RandomForest ML)"
)
fig.show()

df_pred.to_csv('predictions_prix_2029_commune.csv', index=False, encoding='utf-8')
print("Pipeline complet terminé. Prédictions exportées dans predictions_prix_2029_commune.csv")