# Implémentation d'un Modèle CatBoost pour l'Optimisation des Ressources d'un Système de Vélos Partagés

Ce projet vise à développer un modèle prédictif multi-cibles pour anticiper en temps réel l'offre et la demande dans les stations de vélos en libre-service (ex: Vélib'). En exploitant des données OpenData (Paris) incluant des variables temporelles, météorologiques et l'historique d'utilisation, l'objectif principal est de prédire simultanément le nombre de vélos disponibles et le nombre de bornettes libres par station.

In [223]:
import pandas as pd
import warnings
import os
import json

# Ignore every warning from here on: no category or module filter is given,
# so deprecation notices from the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

%matplotlib inline

In [224]:
# Root of the raw data, with one sub-folder per source format.
data_folder = '../data/'

parquet_folder = data_folder + 'parquet/'
json_folder = data_folder + 'json/'
meteo_folder = data_folder + 'meteo/'

# Bare file names (not full paths) found in each folder; os.listdir makes
# no promise about their order.
parquet_file_names = os.listdir(parquet_folder)
json_file_names = os.listdir(json_folder)
meteo_file_names = os.listdir(meteo_folder)

print("parquet file : ", len(parquet_file_names))
print("json file : ", len(json_file_names))
# Count the weather files themselves, not the JSON snapshots.
print("meteo file : ", len(meteo_file_names))

parquet file :  1414
json file :  110
meteo file :  110


In [225]:
# Read the first listed parquet file as a sample and show its columns.
parquet_data = pd.read_parquet(os.path.join(parquet_folder, parquet_file_names[0]))
parquet_data.columns

Index(['stationcode', 'name', 'is_installed', 'capacity', 'numdocksavailable',
       'numbikesavailable', 'mechanical', 'ebike', 'is_renting',
       'is_returning', 'duedate', 'coordonnees_geo',
       'nom_arrondissement_communes', 'code_insee_commune'],
      dtype='object')

In [226]:
def format_geo_coordinates(x):
    """Convert a coordinate mapping into a ``[lat, lon]`` list of floats.

    Args:
        x: Object indexable by the keys ``'lat'`` and ``'lon'``; both values
            must be accepted by ``float()``.

    Returns:
        A two-element list holding the latitude, then the longitude.
    """
    latitude = float(x['lat'])
    longitude = float(x['lon'])
    return [latitude, longitude]

# Replace each raw coordinate mapping with a plain [lat, lon] list of floats,
# then preview the first five rows.
parquet_data['coordonnees_geo'] = parquet_data['coordonnees_geo'].map(format_geo_coordinates)
parquet_data.head(5)

Unnamed: 0,stationcode,name,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,coordonnees_geo,nom_arrondissement_communes,code_insee_commune
0,16107,Benjamin Godard - Victor Hugo,OUI,35,30,5,2,3,OUI,OUI,2024-10-21T12:49:53+00:00,"[48.865983, 2.275725]",Paris,75056
1,9020,Toudouze - Clauzel,OUI,21,18,2,0,2,OUI,OUI,2024-10-21T12:46:01+00:00,"[48.87929591733507, 2.3373600840568547]",Paris,75056
2,14111,Cassini - Denfert-Rochereau,OUI,25,21,1,1,0,OUI,OUI,2024-10-21T12:47:41+00:00,"[48.837525839067, 2.3360354080796]",Paris,75056
3,13007,Le Brun - Gobelins,OUI,48,40,4,1,3,OUI,OUI,2024-10-21T12:49:56+00:00,"[48.835092787824, 2.3534681351338]",Paris,75056
4,5110,Lacépède - Monge,OUI,23,6,17,7,10,OUI,OUI,2024-10-21T12:48:48+00:00,"[48.84389286531899, 2.3519663885235786]",Paris,75056


In [227]:
def load_and_format_json(json_file_path):
    """Load one JSON export and return its records as a DataFrame.

    The file must contain an object with a ``'records'`` list whose items
    each carry a ``'fields'`` mapping; every mapping becomes one row.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per record and one column per
        key found in the ``'fields'`` mappings.

    Raises:
        KeyError: If ``'records'`` or a record's ``'fields'`` key is missing.
    """
    # Decode explicitly as UTF-8: station names contain accented characters
    # and the platform default encoding is not UTF-8 everywhere.
    with open(json_file_path, encoding='utf-8') as json_file:
        payload = json.load(json_file)
    return pd.DataFrame([record['fields'] for record in payload['records']])

# Parse the first listed JSON file as a sample and show its columns.
json_data = load_and_format_json(os.path.join(json_folder, json_file_names[0]))
json_data.columns

Index(['name', 'stationcode', 'ebike', 'mechanical', 'coordonnees_geo',
       'duedate', 'numbikesavailable', 'numdocksavailable', 'capacity',
       'is_renting', 'is_installed', 'nom_arrondissement_communes',
       'is_returning', 'code_insee_commune'],
      dtype='object')

In [228]:
# Drop the first name from each list: index 0 of both lists was already
# read into parquet_data and json_data earlier in the notebook.
# NOTE(review): pop() mutates the lists in place, so re-running this cell
# removes one more file name each time.
parquet_file_names.pop(0)
json_file_names.pop(0)

'velib_2025_12_17T12_00_02_956z.json'

In [229]:
# Row counts of the two sample frames loaded so far.
print("parquet dataframe length: ", parquet_data.shape[0])
print("json dataframe length: ", json_data.shape[0])

parquet dataframe length:  10
json dataframe length:  1503


In [230]:
# Stack the two sample frames (parquet rows first) into the working dataset
# and show its row count.
data = pd.concat((parquet_data, json_data))
data.shape[0]

1513

In [231]:
# Remaining file names to load: parquet names first, then JSON names.
all_files = [*parquet_file_names, *json_file_names]
len(all_files)

1522

In [234]:
# Read every remaining file and append its rows to `data`.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop recopies the accumulated frame for every file.
# NOTE(review): the result starts from the current `data`, so re-running
# this cell appends the same files a second time.
frames = [data]
for file in all_files:
    if file.endswith('.parquet'):
        parquet = pd.read_parquet(parquet_folder + file)
        if parquet.empty:
            # Nothing to add, and there are no coordinates to convert.
            continue
        parquet['coordonnees_geo'] = parquet['coordonnees_geo'].apply(format_geo_coordinates)
        frames.append(parquet)
    elif file.endswith('.json'):
        frames.append(load_and_format_json(json_folder + file))

data = pd.concat(frames)

len(data)

183930

In [235]:
# Load every weather CSV and stack them with a single concat.
# The extension filter applies to all files, including the first one, and
# meteo_file_names is left untouched so re-running this cell always reads
# the same set of files.
meteo_frames = [
    pd.read_csv(meteo_folder + file)
    for file in meteo_file_names
    if file.endswith('.csv')
]
meteo = pd.concat(meteo_frames)

len(meteo)

3192

In [236]:
# Give the weather timestamp column the same name as the station data's
# timestamp column, then show the resulting columns.
meteo = meteo.rename(columns={'date': 'duedate'})
meteo.columns

Index(['duedate', 'temperature_2m', 'precipitation', 'rain', 'snowfall',
       'relative_humidity_2m'],
      dtype='object')

In [237]:
# Parse both timestamp columns so they can be truncated to the hour.
data['duedate'] = pd.to_datetime(data['duedate'])
meteo['duedate'] = pd.to_datetime(meteo['duedate'])

# Join key: the timestamp rendered as text with minutes and seconds zeroed.
# NOTE(review): each key is built from the timestamp as parsed; confirm the
# station and weather sources are expressed in the same time zone.
hour_key_format = '%Y-%m-%d %H:00:00'
data['date_only'] = data['duedate'].dt.strftime(hour_key_format)
meteo['date_only'] = meteo['duedate'].dt.strftime(hour_key_format)

# Keep a single weather row per hour (the first one met): a repeated hour
# in the weather table would otherwise duplicate every station row of that
# hour in the merge.
meteo_hourly = meteo.drop_duplicates(subset='date_only')

# merge() defaults to an inner join, so station rows whose hour has no
# weather row are dropped.
data_with_meteo = data.merge(meteo_hourly, on='date_only', suffixes=('_data', '_meteo'))

# Remove the helper key and the weather timestamp, and restore the station
# timestamp's original column name.
data_with_meteo.drop(['date_only', 'duedate_meteo'], axis=1, inplace=True)
data_with_meteo.rename(columns={'duedate_data': 'duedate'}, inplace=True)

# Write next to the raw data folders, reusing the shared data root.
data_with_meteo.to_parquet(data_folder + 'data_with_meteo.parquet', index=False)