In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Weather

Sources des données : https://www.ncei.noaa.gov/cdo-web/customoptions pour les observations historiques et https://www.infoclimat.fr/previsions-meteo/details/STA/07156/paris-montsouris.html pour les prévisions.

PRCP = précipitations (en pouces)

SNWD = épaisseur de neige (snow depth), en pouces

TMAX = température maximale (en degrés Fahrenheit)

NAME = nom de la station

In [None]:
# Load the raw daily weather export and keep only the Paris-Montsouris station.
# .copy() detaches the filtered rows from the raw frame so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
weather = pd.read_csv("data/raw_data/weather.csv")
weather = weather.loc[weather["NAME"] == "PARIS MONTSOURIS, FR"].copy()

# A missing snow depth is recorded as 0.
weather["SNWD"] = weather["SNWD"].fillna(0.)
weather["DATE"] = pd.to_datetime(weather["DATE"])

# Fill the NaNs of TMAX from the values of TMIN and TAVG.
# First fallback: treat TAVG as the midpoint of TMIN and TMAX,
# i.e. TMAX = 2 * TAVG - TMIN.
weather["TMAX"] = weather["TMAX"].fillna(2 * weather["TAVG"] - weather["TMIN"])
# Second fallback, for rows where the first one is still NaN: scale TAVG by
# the mean TMAX/TAVG ratio observed on the rows where both are known
# (Series.mean skips NaN).
tmax_over_tavg = (weather["TMAX"] / weather["TAVG"]).mean()
weather["TMAX"] = weather["TMAX"].fillna(weather["TAVG"] * tmax_over_tavg)

# Keep only the columns used downstream: NAME, DATE, PRCP, SNWD, TMAX.
weather = weather.drop(columns=["STATION", "LATITUDE", "LONGITUDE", "ELEVATION", "PRCP_ATTRIBUTES", "SNWD_ATTRIBUTES", "TAVG_ATTRIBUTES", "TMAX_ATTRIBUTES", "TMIN_ATTRIBUTES", "TAVG", "TMIN"])
weather

Unnamed: 0,NAME,DATE,PRCP,SNWD,TMAX
0,"PARIS MONTSOURIS, FR",2010-01-01,0.01,0.0,36.0
1,"PARIS MONTSOURIS, FR",2010-01-02,0.00,0.0,38.0
2,"PARIS MONTSOURIS, FR",2010-01-03,0.00,0.0,40.0
3,"PARIS MONTSOURIS, FR",2010-01-04,0.00,0.0,33.0
4,"PARIS MONTSOURIS, FR",2010-01-05,0.00,0.0,33.0
...,...,...,...,...,...
5405,"PARIS MONTSOURIS, FR",2024-11-23,0.01,0.4,50.0
5406,"PARIS MONTSOURIS, FR",2024-11-24,0.00,0.0,63.0
5407,"PARIS MONTSOURIS, FR",2024-11-25,0.02,0.0,63.0
5408,"PARIS MONTSOURIS, FR",2024-11-26,0.44,0.0,51.0


In [9]:
# Hand-entered forecast for the Paris-Montsouris station, laid out with the
# same columns as the cleaned historical weather table.
# Each tuple is (day, PRCP, SNWD, TMAX).
forecast_values = [
    ("2024-12-06", 0.08, 0., 54.),
    ("2024-12-07", 0.08, 0., 52.),
    ("2024-12-08", 0.04, 0., 41.),
    ("2024-12-09", 0.03, 0., 43.),
    ("2024-12-10", 0., 0., 43.),
]
forecast_rows = [
    ["PARIS MONTSOURIS, FR", pd.to_datetime(day), prcp, snwd, tmax]
    for day, prcp, snwd, tmax in forecast_values
]
weather_forcast = pd.DataFrame(
    forecast_rows,
    columns=["NAME", "DATE", "PRCP", "SNWD", "TMAX"],
)
weather_forcast

Unnamed: 0,NAME,DATE,PRCP,SNWD,TMAX
0,"PARIS MONTSOURIS, FR",2024-12-06,0.08,0.0,54.0
1,"PARIS MONTSOURIS, FR",2024-12-07,0.08,0.0,52.0
2,"PARIS MONTSOURIS, FR",2024-12-08,0.04,0.0,41.0
3,"PARIS MONTSOURIS, FR",2024-12-09,0.03,0.0,43.0
4,"PARIS MONTSOURIS, FR",2024-12-10,0.0,0.0,43.0


In [10]:
# Persist the forecast table; the default integer index is written as the
# first (unnamed) CSV column.
weather_forcast.to_csv(path_or_buf="data/weather_forcast.csv")

# Holidays

In [82]:
# Read the school-holiday calendar and parse its "date" column into
# datetimes so it can later be joined on date.
holidays = pd.read_csv("data/raw_data/holidays.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
holidays

Unnamed: 0,date,vacances_zone_a,vacances_zone_b,vacances_zone_c,nom_vacances
0,1990-01-01,False,False,False,
1,1990-01-02,False,False,False,
2,1990-01-03,False,False,False,
3,1990-01-04,False,False,False,
4,1990-01-05,False,False,False,
...,...,...,...,...,...
13509,2026-12-27,False,False,False,
13510,2026-12-28,False,False,False,
13511,2026-12-29,False,False,False,
13512,2026-12-30,False,False,False,


In [83]:
# Load the bank-holiday reference file and index it by date.
bank_holidays = pd.read_csv("data/raw_data/om-referentiel-jours-feries.csv", sep = ";")
bank_holidays["date"] = pd.to_datetime(bank_holidays["date"])
bank_holidays = bank_holidays.sort_values("date").set_index("date")

# Add all dates: expand the holiday names onto a continuous daily calendar.
# Days that are not bank holidays get NaN as their name.
all_days = pd.date_range("2003-01-01", "2024-12-20", freq="D")
holiday_names = bank_holidays["férié"].reindex(all_days)

# 0-1 format: "ferie" is 1 when the day has a holiday name, 0 otherwise;
# the name itself is kept in "nom_jour_ferie". Columns are built by name so
# the result does not depend on the column order or count of the raw CSV.
bank_holidays = pd.DataFrame(
    {
        "ferie": holiday_names.notna().astype(int),
        "nom_jour_ferie": holiday_names,
    }
)
bank_holidays.index.name = "date"

bank_holidays

Unnamed: 0_level_0,ferie,nom_jour_ferie
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2003-01-01,1,1er janvier
2003-01-02,0,
2003-01-03,0,
2003-01-04,0,
2003-01-05,0,
...,...,...
2024-12-16,0,
2024-12-17,0,
2024-12-18,0,
2024-12-19,0,


In [84]:
# Group school holidays and bank holidays in a single date-indexed table.
# The inner join keeps only the dates present in both frames.
# NOTE(review): this cell overwrites `holidays`, so it is not idempotent;
# re-run the cell that loads holidays.csv before re-running this one.
zone_columns = ["vacances_zone_a", "vacances_zone_b", "vacances_zone_c"]
holidays = pd.merge(holidays, bank_holidays.reset_index(), on="date", how="inner")
holidays = holidays.set_index("date")
# Convert only the zone flags from True/False to 1/0. An explicit cast avoids
# the frame-wide replace(), which relied on implicit dtype downcasting.
# Assumes the three zone columns contain no missing values - TODO confirm.
holidays = holidays.astype({column: int for column in zone_columns})
holidays

Unnamed: 0_level_0,vacances_zone_a,vacances_zone_b,vacances_zone_c,nom_vacances,ferie,nom_jour_ferie
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003-01-01,1,1,1,Vacances de Noël,1,1er janvier
2003-01-02,1,1,1,Vacances de Noël,0,
2003-01-03,1,1,1,Vacances de Noël,0,
2003-01-04,1,1,1,Vacances de Noël,0,
2003-01-05,1,1,1,Vacances de Noël,0,
...,...,...,...,...,...,...
2024-12-16,0,0,0,,0,
2024-12-17,0,0,0,,0,
2024-12-18,0,0,0,,0,
2024-12-19,0,0,0,,0,


In [85]:
# Persist the combined holiday table; the "date" index is written as the
# first CSV column.
holidays.to_csv(path_or_buf="data/holidays_bank_holidays.csv")