In [160]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [161]:
df = pd.read_csv('../weather/data_risk_weather.csv')

In [162]:
print(df.shape)
df.head()

(29878, 7)


Unnamed: 0,code_commune_INSEE,year,drought_score,heat_score,rainfall_score,extrem_events_score,department
0,1014,2014,1,1,2,1,1
1,1014,2015,2,2,2,1,1
2,1014,2016,1,1,2,1,1
3,1014,2017,1,1,2,1,1
4,1014,2018,1,1,2,1,1


In [163]:
df.isna().sum()

code_commune_INSEE     0
year                   0
drought_score          0
heat_score             0
rainfall_score         0
extrem_events_score    0
department             0
dtype: int64

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29878 entries, 0 to 29877
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   code_commune_INSEE   29878 non-null  int64
 1   year                 29878 non-null  int64
 2   drought_score        29878 non-null  int64
 3   heat_score           29878 non-null  int64
 4   rainfall_score       29878 non-null  int64
 5   extrem_events_score  29878 non-null  int64
 6   department           29878 non-null  int64
dtypes: int64(7)
memory usage: 1.6 MB


In [165]:
df['code_commune_INSEE'].astype(str).str.len().value_counts()

code_commune_INSEE
5    26524
4     3354
Name: count, dtype: int64

In [166]:
df['department'].astype(str).str.len().value_counts()

department
2    26524
1     3354
Name: count, dtype: int64

In [167]:
df['department'] = df['department'].astype(str).str.zfill(2)
df['code_commune_INSEE'] = df['code_commune_INSEE'].astype(str).str.zfill(5)

In [168]:
print(df['code_commune_INSEE'].astype(str).str.len().value_counts())
df['department'].astype(str).str.len().value_counts()

code_commune_INSEE
5    29878
Name: count, dtype: int64


department
2    29878
Name: count, dtype: int64

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29878 entries, 0 to 29877
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   code_commune_INSEE   29878 non-null  object
 1   year                 29878 non-null  int64 
 2   drought_score        29878 non-null  int64 
 3   heat_score           29878 non-null  int64 
 4   rainfall_score       29878 non-null  int64 
 5   extrem_events_score  29878 non-null  int64 
 6   department           29878 non-null  object
dtypes: int64(5), object(2)
memory usage: 1.6+ MB


In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29878 entries, 0 to 29877
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   code_commune_INSEE   29878 non-null  object
 1   year                 29878 non-null  int64 
 2   drought_score        29878 non-null  int64 
 3   heat_score           29878 non-null  int64 
 4   rainfall_score       29878 non-null  int64 
 5   extrem_events_score  29878 non-null  int64 
 6   department           29878 non-null  object
dtypes: int64(5), object(2)
memory usage: 1.6+ MB


In [171]:
df['year'].nunique()

11

In [172]:
df.head()

Unnamed: 0,code_commune_INSEE,year,drought_score,heat_score,rainfall_score,extrem_events_score,department
0,1014,2014,1,1,2,1,1
1,1014,2015,2,2,2,1,1
2,1014,2016,1,1,2,1,1
3,1014,2017,1,1,2,1,1
4,1014,2018,1,1,2,1,1


In [173]:
#prediction sur les anciennes colonnes
for lag in range(1, 4):
    df[f'heat_score_lag{lag}'] = df.groupby('code_commune_INSEE')['heat_score'].shift(lag)

In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29878 entries, 0 to 29877
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code_commune_INSEE   29878 non-null  object 
 1   year                 29878 non-null  int64  
 2   drought_score        29878 non-null  int64  
 3   heat_score           29878 non-null  int64  
 4   rainfall_score       29878 non-null  int64  
 5   extrem_events_score  29878 non-null  int64  
 6   department           29878 non-null  object 
 7   heat_score_lag1      26378 non-null  float64
 8   heat_score_lag2      22964 non-null  float64
 9   heat_score_lag3      19638 non-null  float64
dtypes: float64(3), int64(5), object(2)
memory usage: 2.3+ MB


In [65]:
df.head()

Unnamed: 0,code_commune_INSEE,year,drought_score,heat_score,rainfall_score,extrem_events_score,department,heat_score_lag1,heat_score_lag2,heat_score_lag3
0,1014,2014,1.0,1.0,2.0,1,1,,,
1,1014,2015,2.0,2.0,2.0,1,1,1.0,,
2,1014,2016,1.0,1.0,2.0,1,1,2.0,1.0,
3,1014,2017,1.0,1.0,2.0,1,1,1.0,2.0,1.0
4,1014,2018,1.0,1.0,2.0,1,1,1.0,1.0,2.0


In [156]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29878 entries, 0 to 29877
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   code_commune_INSEE   29878 non-null  object  
 1   year                 29878 non-null  int64   
 2   drought_score        29878 non-null  int64   
 3   heat_score           29878 non-null  category
 4   rainfall_score       29878 non-null  int64   
 5   extrem_events_score  29878 non-null  int64   
 6   department           29878 non-null  object  
 7   heat_score_lag1      26378 non-null  category
 8   heat_score_lag2      22964 non-null  category
 9   heat_score_lag3      19638 non-null  category
dtypes: category(4), int64(4), object(2)
memory usage: 1.5+ MB


In [184]:
df_train = df[df['year'] < 2022].dropna(subset=[f'heat_score_lag{lag}' for lag in range(1, 4)])
df_val = df[(df['year'] >= 2022) & (df['year'] <= 2023)].dropna(subset=[f'heat_score_lag{lag}' for lag in range(1, 4)])
df_test = df[df['year'] == 2024].dropna(subset=[f'heat_score_lag{lag}' for lag in range(1, 4)])

In [176]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29878 entries, 0 to 29877
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code_commune_INSEE   29878 non-null  object 
 1   year                 29878 non-null  int64  
 2   drought_score        29878 non-null  int64  
 3   heat_score           29878 non-null  int64  
 4   rainfall_score       29878 non-null  int64  
 5   extrem_events_score  29878 non-null  int64  
 6   department           29878 non-null  object 
 7   heat_score_lag1      26378 non-null  float64
 8   heat_score_lag2      22964 non-null  float64
 9   heat_score_lag3      19638 non-null  float64
dtypes: float64(3), int64(5), object(2)
memory usage: 2.3+ MB


In [185]:
df_test.isna().sum()

code_commune_INSEE     0
year                   0
drought_score          0
heat_score             0
rainfall_score         0
extrem_events_score    0
department             0
heat_score_lag1        0
heat_score_lag2        0
heat_score_lag3        0
dtype: int64

In [187]:
for i in range(1, 4):
    df_train[f'heat_score_lag{i}'] = df_train[f'heat_score_lag{i}'].astype(int)
    df_val[f'heat_score_lag{i}'] = df_val[f'heat_score_lag{i}'].astype(int)
    df_test[f'heat_score_lag{i}'] = df_test[f'heat_score_lag{i}'].astype(int)

In [188]:
features = ['year'] + [f'heat_score_lag{lag}' for lag in range(1, 4)]

X_train = df_train[features]
y_train = df_train['heat_score']

X_val = df_val[features]
y_val = df_val['heat_score']

X_test = df_test[features]
y_test = df_test['heat_score']

In [189]:
model = LGBMRegressor(
    n_estimators=500,
    objective='regression',
    random_state=42,
    verbose=-1
)
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='mae',
    callbacks=[early_stopping(20)]
)

# Affichage clair
best_iter = model.best_iteration_
best_mae = model.best_score_['valid_0']['l1']
best_mse = model.best_score_['valid_0']['l2']
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("\n================ Résumé entraînement LightGBM ================")
print(f"👉 Early stopping : arrêt automatique après 20 itérations sans amélioration.")
print(f"👉 Meilleure itération atteinte : {best_iter}")
print(f"    - Erreur absolue moyenne (MAE, l1) sur validation : {best_mae:.2f} ")
print(f"    - Erreur quadratique moyenne (MSE, l2) sur validation : {best_mse:.2f}")
print("---------------------------------------------------------------")
print(f"MAE  global : {mae:.2f} ")
print(f"RMSE global : {rmse:.2f} ")
print(f"R²   global : {r2:.3f}")
print('R2 score:', r2)
print("===============================================================\n")

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[12]	valid_0's l1: 0.262798	valid_0's l2: 0.180149

👉 Early stopping : arrêt automatique après 20 itérations sans amélioration.
👉 Meilleure itération atteinte : 12
    - Erreur absolue moyenne (MAE, l1) sur validation : 0.26 
    - Erreur quadratique moyenne (MSE, l2) sur validation : 0.18
---------------------------------------------------------------
MAE  global : 0.26 
RMSE global : 0.42 
R²   global : 0.354
R2 score: 0.3540434126929811



In [191]:
print(y_test)
print(y_pred)

10       2
28       2
56       1
78       2
96       1
        ..
29808    1
29837    1
29855    1
29866    1
29877    1
Name: heat_score, Length: 1802, dtype: int64
[1.08370583 1.08370583 1.08370583 ... 1.10681559 1.10681559 1.10681559]


In [53]:
y_pred.nunique()

AttributeError: 'numpy.ndarray' object has no attribute 'nunique'

In [110]:
df_2024 = df[df['year'] == 2024][['code_commune_INSEE', 'avg_weather_risk_score']]
for lag in range(1, 6):
    df_2024[f'avg_weather_risk_score_lag{lag}'] = (
        df
        .groupby(['code_commune_INSEE'])['avg_weather_risk_score']
        .shift(lag).loc[df['year'] == 2024].values
    )
    

In [111]:
print(df_2024.shape)
df_2024.head()

(141736, 7)


Unnamed: 0,code_commune_INSEE,avg_weather_risk_score,avg_weather_risk_score_lag1,avg_weather_risk_score_lag2,avg_weather_risk_score_lag3,avg_weather_risk_score_lag4,avg_weather_risk_score_lag5
16400,1001,0.22,0.31,1.0,0.33,0.11,0.2
16401,1002,0.22,0.31,1.0,0.33,0.11,0.2
16402,1004,0.22,0.31,1.0,0.33,0.11,0.2
16403,1005,0.22,0.31,1.0,0.33,0.11,0.2
16404,1006,0.22,0.31,1.0,0.33,0.11,0.2


In [114]:
model = LGBMRegressor(
    n_estimators=350,
    objective='regression',
    random_state=42,
    verbose=-1
)


for lag in range(1, 6):
    df_2024[f'avg_weather_risk_score_lag{lag}'] = (
        df
        .groupby(['code_commune_INSEE'])['avg_weather_risk_score']
        .shift(lag)
        .loc[df['year'] == 2024].values
    )

df_pred = df_2024.dropna(subset=[f'avg_weather_risk_score_lag{lag}' for lag in range(1, 6)] + ['code_commune_INSEE']).copy()
df_pred['avg_weather_risk_score_2024'] = df_pred['avg_weather_risk_score'].astype(float)

model = LGBMRegressor(n_estimators=350, 
                      objective='regression', 
                      random_state=42, 
                      verbose=-1)

X_train = df_pred[[f'avg_weather_risk_score_lag{lag}' for lag in range(1, 6)]]
y_train = df_pred['avg_weather_risk_score_2024']
model.fit(X_train, y_train)

for an in range(2025, 2030):
    X_pred = pd.DataFrame({
        'avg_weather_risk_score_lag1': df_pred['avg_weather_risk_score_lag1'],
        'avg_weather_risk_score_lag2': df_pred['avg_weather_risk_score_lag2'],
        'avg_weather_risk_score_lag3': df_pred['avg_weather_risk_score_lag3'],
        'avg_weather_risk_score_lag4': df_pred['avg_weather_risk_score_lag4'],
        'avg_weather_risk_score_lag5': df_pred['avg_weather_risk_score_lag5'],
    })
    df_pred[f'avg_weather_risk_score_{an}_pred'] = model.predict(X_pred)
    # On décale les lags pour l’année suivante (rolling forecast)
    for lag in range(5, 1, -1):
        df_pred[f'avg_weather_risk_score_lag{lag}'] = df_pred[f'avg_weather_risk_score_lag{lag-1}']
    df_pred['avg_weather_risk_score_lag1'] = df_pred[f'avg_weather_risk_score_{an}_pred']

In [115]:
print(df_pred.shape)

(141736, 13)


In [116]:
df_2024[[f'avg_weather_risk_score_lag{lag}' for lag in range(1, 6)]].isna().sum()

avg_weather_risk_score_lag1    0
avg_weather_risk_score_lag2    0
avg_weather_risk_score_lag3    0
avg_weather_risk_score_lag4    0
avg_weather_risk_score_lag5    0
dtype: int64

In [117]:
print(df_pred.shape)
df_pred.head()

(141736, 13)


Unnamed: 0,code_commune_INSEE,avg_weather_risk_score,avg_weather_risk_score_lag1,avg_weather_risk_score_lag2,avg_weather_risk_score_lag3,avg_weather_risk_score_lag4,avg_weather_risk_score_lag5,avg_weather_risk_score_2024,avg_weather_risk_score_2025_pred,avg_weather_risk_score_2026_pred,avg_weather_risk_score_2027_pred,avg_weather_risk_score_2028_pred,avg_weather_risk_score_2029_pred
16400,1001,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16401,1002,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16402,1004,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16403,1005,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16404,1006,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384


In [None]:
df_pred['code_commune_INSEE'] = df_pred['code_commune_INSEE'].astype(str).str.zfill(5)
df_pred.head()

Unnamed: 0,code_commune_INSEE,avg_weather_risk_score,avg_weather_risk_score_lag1,avg_weather_risk_score_lag2,avg_weather_risk_score_lag3,avg_weather_risk_score_lag4,avg_weather_risk_score_lag5,avg_weather_risk_score_2024,avg_weather_risk_score_2025_pred,avg_weather_risk_score_2026_pred,avg_weather_risk_score_2027_pred,avg_weather_risk_score_2028_pred,avg_weather_risk_score_2029_pred
16400,1001,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16401,1002,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16402,1004,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16403,1005,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384
16404,1006,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384


In [120]:
import geopandas as gpd

gdf = gpd.read_file(
    "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements.geojson"
)

print(gdf.columns)

Index(['code', 'nom', 'geometry'], dtype='object')


In [122]:
df_pred['code_department'] = df_pred['code_commune_INSEE'].str[:2]
print(df_pred.shape)
df_pred.head()

(141736, 14)


Unnamed: 0,code_commune_INSEE,avg_weather_risk_score,avg_weather_risk_score_lag1,avg_weather_risk_score_lag2,avg_weather_risk_score_lag3,avg_weather_risk_score_lag4,avg_weather_risk_score_lag5,avg_weather_risk_score_2024,avg_weather_risk_score_2025_pred,avg_weather_risk_score_2026_pred,avg_weather_risk_score_2027_pred,avg_weather_risk_score_2028_pred,avg_weather_risk_score_2029_pred,code_department
16400,1001,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1
16401,1002,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1
16402,1004,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1
16403,1005,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1
16404,1006,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1


In [125]:
gdf_merged = (
    df_pred
    .merge(
        gdf, 
        left_on='code_department', 
        right_on='code', 
        how='inner'
        )
    .drop(columns='code')
)
print(gdf_merged.shape)
gdf_merged.head()

(141736, 16)


Unnamed: 0,code_commune_INSEE,avg_weather_risk_score,avg_weather_risk_score_lag1,avg_weather_risk_score_lag2,avg_weather_risk_score_lag3,avg_weather_risk_score_lag4,avg_weather_risk_score_lag5,avg_weather_risk_score_2024,avg_weather_risk_score_2025_pred,avg_weather_risk_score_2026_pred,avg_weather_risk_score_2027_pred,avg_weather_risk_score_2028_pred,avg_weather_risk_score_2029_pred,code_department,nom,geometry
0,1001,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1,Ain,"POLYGON ((4.78021 46.17668, 4.78024 46.18905, ..."
1,1002,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1,Ain,"POLYGON ((4.78021 46.17668, 4.78024 46.18905, ..."
2,1004,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1,Ain,"POLYGON ((4.78021 46.17668, 4.78024 46.18905, ..."
3,1005,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1,Ain,"POLYGON ((4.78021 46.17668, 4.78024 46.18905, ..."
4,1006,0.22,0.350384,0.259918,0.820079,0.469925,0.219948,0.22,0.219948,0.469925,0.820079,0.259918,0.350384,1,Ain,"POLYGON ((4.78021 46.17668, 4.78024 46.18905, ..."


In [None]:
import pandas as pd
import plotly.express as px

# Étape 1 : préparer un DataFrame long avec les prédictions
cols_pred = [f'avg_weather_risk_score_{year}_pred' for year in range(2025, 2030)]

df_long = gdf_merged.melt(
    id_vars=['code_department', 'nom', 'geometry'],
    value_vars=cols_pred,
    var_name='year',
    value_name='avg_weather_risk_score_pred'
)

# Étape 2 : nettoyer la colonne année (ex : "avg_weather_risk_score_2025_pred" → 2025)
df_long['year'] = df_long['year'].str.extract(r'(\d{4})').astype(int)

# Étape 3 : agréger au niveau du département (moyenne des communes)
gdf_plot = df_long.groupby(['code_department', 'nom', 'year']).agg({
    'avg_weather_risk_score_pred': 'mean'
}).reset_index()

# Étape 4 : fusionner avec le GeoJSON de départements (gdf = GeoDataFrame des départements)
gdf_plot = gdf_plot.merge(gdf[['code', 'geometry']], left_on='code_department', right_on='code')

# Étape 5 : afficher la carte interactive
fig = px.choropleth(
    gdf_plot,
    geojson=gdf,
    locations='code_department',
    featureidkey='properties.code',
    color='avg_weather_risk_score_pred',
    animation_frame='year',
    color_continuous_scale='Reds',
    range_color=(0, 3),
    labels={'avg_weather_risk_score_pred': 'Predicted Risk Score'},
    title='Predicted Weather Risk Score by Department (2025–2029)'
)

fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()
