# **Preprocessing : Données Météo & Liste des Stations**

**Objectif**

Ce notebook vise à construire une version cohérente des features météo, alignée avec le fichier train.

L’objectif est de produire un dataset df_train_meteo.csv sans valeurs manquantes, agrégé par semaine et par région, prêt à être utilisé dans les modèles.

In [120]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot') 

from pathlib import Path

import sys
sys.path.append("../src")
import unicodedata
from preprocessing import normalize_region_name, split_meteo_files

import warnings 
warnings.filterwarnings ('ignore') 

NameError: name 'Path' is not defined

In [212]:
# Definition des chemins 
ROOT = Path("..").resolve()
METEO_PATH = ROOT / "data" / "raw" / "DonneesMeteorologiques" / "DonneesMeteorologiques"
STATIONS_PATH = ROOT / "data" / "raw" / "ListedesStationsMeteo.csv"
OUTPUT_PATH = ROOT / "data" / "processed"

print("Meteo path:", METEO_PATH)
print("Stations path:", STATIONS_PATH)

Meteo path: /Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/DonneesMeteorologiques/DonneesMeteorologiques
Stations path: /Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/ListedesStationsMeteo.csv


In [213]:
df_train = pd.read_csv(ROOT / "data" / "raw" / "train.csv")
df_test  = pd.read_csv(ROOT / "data" / "raw" / "test.csv")  

In [122]:
files = sorted(METEO_PATH.glob("synop.*.csv"))

print(f"Nombre total de fichiers météo : {len(files)}")
files[:5]

Nombre total de fichiers météo : 154


[PosixPath('/Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/DonneesMeteorologiques/DonneesMeteorologiques/synop.200401.csv'),
 PosixPath('/Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/DonneesMeteorologiques/DonneesMeteorologiques/synop.200402.csv'),
 PosixPath('/Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/DonneesMeteorologiques/DonneesMeteorologiques/synop.200403.csv'),
 PosixPath('/Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/DonneesMeteorologiques/DonneesMeteorologiques/synop.200404.csv'),
 PosixPath('/Users/anastasiia/Documents/mosef/ml_app_project/Projet-Kaggle/data/raw/DonneesMeteorologiques/DonneesMeteorologiques/synop.200405.csv')]

# Les synop : séparation pour le train/test & concaténation

In [133]:
# PREPROCESS Fonction de split (normalement est rajoutée dans preprocessing mais les chemins fonct pas)
def split_meteo_files(files, start_year, end_year):
    selected_files = []
    
    for file in files:
        year = int(file.stem.split(".")[1][:4])
        
        if start_year <= year <= end_year:
            selected_files.append(file)
    
    return selected_files

In [134]:
# Train (2004-2011) & test (2012-2013)
meteo_train_files = split_meteo_files(files, 2004, 2011)
meteo_test_files  = split_meteo_files(files, 2012, 2013)

print(f"Fichiers météo TRAIN (2004–2011) : {len(meteo_train_files)}")
print(f"Fichiers météo TEST  (2012–2013) : {len(meteo_test_files)}")

Fichiers météo TRAIN (2004–2011) : 96
Fichiers météo TEST  (2012–2013) : 24


In [141]:
# PREPROCESS Chargement des données météo + concaténation + mq en nan
def read_and_concat_synop(files):
    df = pd.concat(
        [pd.read_csv(f, sep=";", decimal=",") for f in files],
        ignore_index=True
    )
    return df.replace("mq", np.nan)

In [142]:
df_synop = pd.concat([
    read_and_concat_synop(meteo_train_files),
    read_and_concat_synop(meteo_test_files)
])

  return df.replace("mq", np.nan)
  return df.replace("mq", np.nan)


In [None]:
# Conversion de la colonne date en datetime
df_synop["date"] = pd.to_datetime(
    df_synop["date"],
    format="%Y%m%d%H%M%S",
    errors="coerce"
)

df_synop["year"] = df_synop["date"].dt.year
df_synop["month"] = df_synop["date"].dt.month

In [146]:
df_synop.head()

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4,Unnamed: 59,year,month
0,7005,2004-01-01,102160,-350,8,170,5.1,273.25,272.25,93,...,,,,,,,,,2004,1
1,7015,2004-01-01,102190,-290,8,160,3.1,272.95,271.95,92,...,,,,,,,,,2004,1
2,7020,2004-01-01,101760,-380,6,180,13.9,279.95,275.45,73,...,,,,,,,,,2004,1
3,7027,2004-01-01,102080,-370,6,170,5.1,273.75,272.85,94,...,7500.0,,,,,,,,2004,1
4,7037,2004-01-01,102240,-310,6,180,4.1,272.75,271.65,91,...,,,,,,,,,2004,1


In [145]:
df_synop.tail()

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4,Unnamed: 59,year,month
339650,81401,2013-12-31 21:00:00,100930,,,70.0,1.0,299.85,297.15,85,...,,,,,,,,,2013,12
339651,81405,2013-12-31 21:00:00,100940,,,90.0,2.6,299.85,295.45,77,...,,,,,,,,,2013,12
339652,81408,2013-12-31 21:00:00,100900,,,60.0,2.6,301.75,296.25,72,...,,,,,,,,,2013,12
339653,81415,2013-12-31 21:00:00,100900,,,,,298.95,296.85,88,...,,,,,,,,,2013,12
339654,89642,2013-12-31 21:00:00,99150,110.0,3.0,80.0,10.8,277.15,275.05,86,...,,,,,,,,,2013,12


# Prep Liste des Stations

In [168]:
df_stations = pd.read_csv(STATIONS_PATH, sep=";").rename(columns={"ID": "numer_sta"})

In [169]:
df_stations.sample(5)

Unnamed: 0,numer_sta,Nom,Latitude,Longitude,Altitude
59,81408,SAINT GEORGES,3.890667,-51.804667,6
15,7222,NANTES-BOUGUENAIS,47.15,-1.608833,26
24,7471,LE PUY-LOUDES,45.0745,3.764,833
10,7149,ORLY,48.716833,2.384333,89
28,7558,MILLAU,44.1185,3.0195,712


# Mapping à la main : 

In [170]:
# PREPROCESS
def normalize_city_name(text):
    if not isinstance(text, str):
        return None
    text = unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("utf-8")
    text = text.upper().replace("-", " ").replace("'", "").strip()
    return " ".join(text.split())

In [171]:
df_stations["city"] = df_stations["Nom"].apply(normalize_city_name)

In [172]:
df_stations.head()

Unnamed: 0,numer_sta,Nom,Latitude,Longitude,Altitude,city
0,7005,ABBEVILLE,50.136,1.834,69,ABBEVILLE
1,7015,LILLE-LESQUIN,50.57,3.0975,47,LILLE LESQUIN
2,7020,PTE DE LA HAGUE,49.725167,-1.939833,6,PTE DE LA HAGUE
3,7027,CAEN-CARPIQUET,49.18,-0.456167,67,CAEN CARPIQUET
4,7037,ROUEN-BOOS,49.383,1.181667,151,ROUEN BOOS


In [173]:
city_to_region = {
    "ABBEVILLE": "PICARDIE",
    "LILLE LESQUIN": "NORD-PAS-DE-CALAIS",
    "PTE DE LA HAGUE": "BASSE-NORMANDIE",
    "CAEN CARPIQUET": "BASSE-NORMANDIE",
    "ROUEN BOOS": "HAUTE-NORMANDIE",
    "REIMS PRUNAY": "CHAMPAGNE-ARDENNE",
    "BREST GUIPAVAS": "BRETAGNE",
    "PLOUMANACH": "BRETAGNE",
    "RENNES ST JACQUES": "BRETAGNE",
    "ALENCON": "BASSE-NORMANDIE",
    "ORLY": "ILE-DE-FRANCE",
    "TROYES BARBEREY": "CHAMPAGNE-ARDENNE",
    "NANCY OCHEY": "LORRAINE",
    "STRASBOURG ENTZHEIM": "ALSACE",
    "BELLE ILE LE TALUT": "BRETAGNE",
    "NANTES BOUGUENAIS": "PAYS-DE-LA-LOIRE",
    "TOURS": "CENTRE",
    "BOURGES": "CENTRE",
    "DIJON LONGVIC": "BOURGOGNE",
    "POITIERS BIARD": "POITOU-CHARENTES",
    "BALE MULHOUSE": "ALSACE",
    "PTE DE CHASSIRON": "POITOU-CHARENTES",
}

In [176]:
df_stations["region_name"] = df_stations["city"].map(city_to_region)

In [177]:
print("Stations sans region :", df_stations["region_name"].isna().sum(), "/", len(df_stations))
df_stations[df_stations["region_name"].isna()][["Nom","city"]]

Stations sans region : 40 / 62


Unnamed: 0,Nom,city
22,LIMOGES-BELLEGARDE,LIMOGES BELLEGARDE
23,CLERMONT-FD,CLERMONT FD
24,LE PUY-LOUDES,LE PUY LOUDES
25,LYON-ST EXUPERY,LYON ST EXUPERY
26,BORDEAUX-MERIGNAC,BORDEAUX MERIGNAC
27,GOURDON,GOURDON
28,MILLAU,MILLAU
29,MONTELIMAR,MONTELIMAR
30,EMBRUN,EMBRUN
31,MONT-DE-MARSAN,MONT DE MARSAN


In [178]:
city_to_region.update({
    "PLOUMANAC H": "BRETAGNE",
    "LIMOGES BELLEGARDE": "LIMOUSIN",
    "CLERMONT FD": "AUVERGNE",
    "LE PUY LOUDES": "AUVERGNE",
    "LYON ST EXUPERY": "RHONE-ALPES",
    "BORDEAUX MERIGNAC": "AQUITAINE",
    "GOURDON": "MIDI-PYRENEES",
    "MILLAU": "MIDI-PYRENEES",
    "MONTELIMAR": "RHONE-ALPES",
    "EMBRUN": "PROVENCE-ALPES-COTE-D-AZUR",
    "MONT DE MARSAN": "AQUITAINE",
    "TARBES OSSUN": "MIDI-PYRENEES",
    "ST GIRONS": "MIDI-PYRENEES",
    "TOULOUSE BLAGNAC": "MIDI-PYRENEES",
    "MONTPELLIER": "LANGUEDOC-ROUSSILLON",
    "MARIGNANE": "PROVENCE-ALPES-COTE-D-AZUR",
    "CAP CEPET": "PROVENCE-ALPES-COTE-D-AZUR",
    "NICE": "PROVENCE-ALPES-COTE-D-AZUR",
    "PLOUMANAC H": "BRETAGNE",
    "PERPIGNAN": "LANGUEDOC-ROUSSILLON",
    "AJACCIO": "CORSE",
    "BASTIA": "CORSE",

    
    "GILLOT AEROPORT": "REUNION",
    "ST PIERRE": "REUNION",
    "PAMANDZI": "MAYOTTE",

   
    "LE RAIZET AERO": "GUADELOUPE",
    "LA DESIRADE METEO": "GUADELOUPE",
    "ST BARTHELEMY METEO": "SAINT-BARTHELEMY",
    "TRINITE CARAVEL": "MARTINIQUE",
    "LAMENTIN AERO": "MARTINIQUE",

    
    "CAYENNE MATOURY": "GUYANE",
    "SAINT LAURENT": "GUYANE",
    "SAINT GEORGES": "GUYANE",
    "MARIPASOULA": "GUYANE",

    
    "GLORIEUSES": "TAAF",
    "JUAN DE NOVA": "TAAF",
    "EUROPA": "TAAF",
    "TROMELIN": "TAAF",
    "NOUVELLE AMSTERDAM": "TAAF",
    "CROZET": "TAAF",
    "KERGUELEN": "TAAF",
    "DUMONT DURVILLE": "TAAF",
})

In [179]:
df_stations["region_name"] = df_stations["city"].map(city_to_region)

In [180]:
print("Stations with no region :", df_stations["region_name"].isna().sum(), "/", len(df_stations))
df_stations[df_stations["region_name"].isna()][["Nom","city"]]

Stations with no region : 0 / 62


Unnamed: 0,Nom,city


In [181]:
# Final check
print(
    "Stations with no region :",
    df_stations["region_name"].isna().sum(),
    "/",
    len(df_stations)
)

df_stations[df_stations["region_name"].isna()][["Nom", "city"]]

Stations with no region : 0 / 62


Unnamed: 0,Nom,city


In [182]:
df_stations.sample(5)

Unnamed: 0,numer_sta,Nom,Latitude,Longitude,Altitude,city,region_name,region_normalized
41,7790,BASTIA,42.540667,9.485167,10,BASTIA,CORSE,
31,7607,MONT-DE-MARSAN,43.909833,-0.500167,59,MONT DE MARSAN,AQUITAINE,
18,7280,DIJON-LONGVIC,47.267833,5.088333,219,DIJON LONGVIC,BOURGOGNE,BOURGOGNE
15,7222,NANTES-BOUGUENAIS,47.15,-1.608833,26,NANTES BOUGUENAIS,PAYS-DE-LA-LOIRE,PAYS-DE-LA-LOIRE
10,7149,ORLY,48.716833,2.384333,89,ORLY,ILE-DE-FRANCE,ILE-DE-FRANCE


## Merge Stations & Synop par région

In [184]:
df_synop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1701094 entries, 0 to 1701093
Data columns (total 63 columns):
 #   Column       Dtype         
---  ------       -----         
 0   numer_sta    int64         
 1   date         datetime64[ns]
 2   pmer         object        
 3   tend         object        
 4   cod_tend     object        
 5   dd           object        
 6   ff           object        
 7   t            object        
 8   td           object        
 9   u            object        
 10  vv           object        
 11  ww           object        
 12  w1           object        
 13  w2           object        
 14  n            object        
 15  nbas         object        
 16  hbas         object        
 17  cl           object        
 18  cm           object        
 19  ch           object        
 20  pres         object        
 21  niv_bar      object        
 22  geop         object        
 23  tend24       object        
 24  tn12         object     

In [183]:
df_synop = df_synop.merge(
    df_stations[["numer_sta", "region_name"]],
    on="numer_sta",
    how="left"
)

## Keeping only 5 features

In [229]:
meteo_features = ["t","pmer","ff","rr24","u"]
for feature in meteo_features:
    if feature in df_synop.columns:
        df_synop[feature] = pd.to_numeric(df_synop[feature], errors="coerce")

## Aggregation mensuelle par région

In [230]:
meteo_monthly = (
    df_synop
    .groupby(["region_name","year","month"], as_index=False)
    .agg({
        "t":"mean",
        "pmer":"mean",
        "ff":"mean",
        "rr24":"sum",
        "u":"mean"
    })
)

## Filtrage des train/test par régions et semaines

In [215]:
# Data Transformation : extraire year et week_num de la colonne week 
df_train['week'] = df_train['week'].astype(str) 
df_train['year'] = df_train['week'].str[:4].astype(int) 
df_train['week_num'] = df_train['week'].str[4:].astype(int)

In [216]:
df_train.head()

Unnamed: 0,Id,week,region_code,region_name,TauxGrippe,year,week_num,date,month
0,5523,201152,42,ALSACE,66,2011,52,2011-12-26,12
1,5524,201152,72,AQUITAINE,24,2011,52,2011-12-26,12
2,5525,201152,83,AUVERGNE,91,2011,52,2011-12-26,12
3,5526,201152,25,BASSE-NORMANDIE,49,2011,52,2011-12-26,12
4,5527,201152,26,BOURGOGNE,33,2011,52,2011-12-26,12


In [217]:
# Data Transformation : extraire year et week_num de la colonne week 
df_test['week'] = df_test['week'].astype(str) 
df_test['year'] = df_test['week'].str[:4].astype(int) 
df_test['week_num'] = df_test['week'].str[4:].astype(int)

In [218]:
df_test.head()

Unnamed: 0,Id,week,region_code,region_name,year,week_num,date,month
0,3235,201352,42,ALSACE,2013,52,2013-12-23,12
1,3236,201352,72,AQUITAINE,2013,52,2013-12-23,12
2,3237,201352,83,AUVERGNE,2013,52,2013-12-23,12
3,3238,201352,25,BASSE-NORMANDIE,2013,52,2013-12-23,12
4,3239,201352,26,BOURGOGNE,2013,52,2013-12-23,12


In [221]:
meteo_train = meteo_monthly.merge(
    df_train[["region_name", "year", "month"]].drop_duplicates(),
    on=["region_name", "year", "month"],
    how="inner"
)

meteo_test = meteo_monthly.merge(
    df_test[["region_name", "year", "month"]].drop_duplicates(),
    on=["region_name", "year", "month"],
    how="inner"
)

In [223]:
df_train_meteo = df_train.merge(
    meteo_train,
    on=["region_name", "year", "month"],
    how="left"
)

df_test_meteo = df_test.merge(
    meteo_test,
    on=["region_name", "year", "month"],
    how="left"
)

In [225]:
print("TRAIN FINAL SHAPE:", df_train_meteo.shape)
print("TEST FINAL SHAPE :", df_test_meteo.shape)

df_train_meteo.info()

TRAIN FINAL SHAPE: (9196, 14)
TEST FINAL SHAPE : (2288, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9196 entries, 0 to 9195
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Id           9196 non-null   int64         
 1   week         9196 non-null   object        
 2   region_code  9196 non-null   int64         
 3   region_name  9196 non-null   object        
 4   TauxGrippe   9196 non-null   int64         
 5   year         9196 non-null   int64         
 6   week_num     9196 non-null   int64         
 7   date         9196 non-null   datetime64[ns]
 8   month        9196 non-null   int32         
 9   t            8778 non-null   float64       
 10  pmer         8778 non-null   float64       
 11  ff           8778 non-null   float64       
 12  rr24         8778 non-null   float64       
 13  u            8778 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int32(1), 

In [226]:
df_train_meteo.shape

(9196, 14)

## Imputation des valeurs manquantes après pour éviter data leakage (on ne prend pas des informations d'avenir)

In [228]:
df_train_meteo.isna().sum()

Id               0
week             0
region_code      0
region_name      0
TauxGrippe       0
year             0
week_num         0
date             0
month            0
t              418
pmer           418
ff             418
rr24           418
u              418
dtype: int64

In [232]:
medians = df_train_meteo[meteo_features].median()
print("Medians used for imputation:")
print(medians)

Medians used for imputation:
t          285.485666
pmer    101715.249560
ff           3.635223
rr24        75.200000
u           77.341730
dtype: float64


In [236]:
df_train_meteo[meteo_features] = df_train_meteo[meteo_features].fillna(medians)
df_test_meteo[meteo_features]  = df_test_meteo[meteo_features].fillna(medians)

In [237]:
print("NaN after imputation (train):")
print(df_train_meteo[meteo_features].isna().sum())

print("NaN after imputation (test):")
print(df_test_meteo[meteo_features].isna().sum())

NaN after imputation (train):
t       0
pmer    0
ff      0
rr24    0
u       0
dtype: int64
NaN after imputation (test):
t       0
pmer    0
ff      0
rr24    0
u       0
dtype: int64


In [239]:
df_train_meteo.to_csv("train_meteo.csv", index=False)
df_test_meteo.to_csv("test_meteo.csv", index=False)