<h1 style="text-align: center;">Forecasting traffic in Paris</h1>

---

In [6]:
import numpy as np
import pandas as pd
from prophet import Prophet
import sys


sys.path.append('../module')

from dataprep import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


##### Setting parameters

In [13]:
date_debut = '2023-12-08 00:00:00'
date_fin = '2023-12-12 23:00:00'

In [7]:
df_convention_raw = load_traffic_data(arc='convention')
df_champs_raw = load_traffic_data(arc='champs')
df_sts_raw = load_traffic_data(arc='sts')

df_convention_clean = traiter_donnees(df=df_convention_raw, arc='convention')
df_champs_clean = traiter_donnees(df_champs_raw, arc='champs')
df_sts_clean = traiter_donnees(df_sts_raw, arc='sts')

loading data for convention [...]
loading data for champs [...]
loading data for sts [...]


#### Filling missing value with **RandomForestRegressor**

In [8]:
df_sts_clean = encode_categorical(df_sts_clean)
df_champs_clean = encode_categorical(df_champs_clean)
df_convention_clean = encode_categorical(df_champs_clean)

df_champs_full, _ = impute_missing_values(df_champs_clean, verbose=True)
df_convention_full, _ = impute_missing_values(df_convention_clean)
df_sts_full, _ = impute_missing_values(df_sts_clean)

8454.069595446746
62253.401494549675
13122.411200477152
10442.496585128052
3623.673423436121


In [10]:
df_champs_full.head()

Unnamed: 0,libelle,taux_occupation,etat_arc,debit_horaire
2022-11-01 04:00:00,0.0,8.89223,1.0,672.0
2022-11-01 05:00:00,0.0,7.12889,1.0,513.0
2022-11-01 06:00:00,0.0,6.225,1.0,494.0
2022-11-01 07:00:00,0.0,5.80723,1.0,513.0
2022-11-01 08:00:00,0.0,5.57111,1.0,484.0


### External sources

In [6]:
df_worksites_raw = load_worksites()
df_worksites_clean = clean_worksites_data(df=df_worksites_raw)
df_worksites_clean

loading data for worksites [...]


Unnamed: 0,date_debut,date_fin,niveau_perturbation,voie,impact_circulation
35,2022-11-03,2023-04-14,2.0,Avenue des Champs Elysées,RESTREINTE
89,2023-04-03,2023-06-16,,rue de la Convention,RESTREINTE
97,2021-07-12,2023-05-31,2.0,40 rue des Sts-Pères,RESTREINTE
159,2022-07-04,2023-05-12,1.0,Rue de la Convention,RESTREINTE
188,2022-06-13,2023-12-01,2.0,Avenue des Champs Elysées,RESTREINTE
190,2023-02-27,2023-04-28,2.0,Avenue des Champs Elysées,RESTREINTE


### Running predictions

In [59]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = df_convention_full.index
df_prophet['y'] = df_convention_full['debit_horaire'].values

m = Prophet()
m.fit(df_prophet)
future = m.make_future_dataframe(periods=120, freq='H')

forecast = m.predict(future)

df_prediction_conv_deb = forecast.loc[forecast['ds']>= date_debut]
df_prediction_conv_deb.rename(columns={'yhat':'debit_horaire', 'ds':'datetime'}, inplace=True)
df_prediction_conv_deb = df_prediction_conv_deb[['debit_horaire', 'datetime']]

17:57:55 - cmdstanpy - INFO - Chain [1] start processing
17:57:56 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction_conv_deb.rename(columns={'yhat':'debit_horaire', 'ds':'datetime'}, inplace=True)


In [62]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = df_convention_full.index
df_prophet['y'] = df_convention_full['taux_occupation'].values

m = Prophet()
m.fit(df_prophet)
future = m.make_future_dataframe(periods=120, freq='H')

forecast = m.predict(future)

df_prediction_conv_taux = forecast.loc[forecast['ds']>= date_debut]
df_prediction_conv_taux.rename(columns={'yhat':'taux_occupation', 'ds':'datetime'}, inplace=True,)
df_prediction_conv_taux = df_prediction_conv_taux[['taux_occupation']]

17:58:16 - cmdstanpy - INFO - Chain [1] start processing
17:58:17 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction_conv_taux.rename(columns={'yhat':'taux_occupation', 'ds':'datetime'}, inplace=True,)


In [94]:
df_convention = pd.concat([df_prediction_conv_deb, df_prediction_conv_taux], axis=1)
df_convention['arc']='Convention'
df_convention = df_convention.head(120)

In [64]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = df_sts_full.index
df_prophet['y'] = df_sts_full['debit_horaire'].values

m = Prophet()
m.fit(df_prophet)
future = m.make_future_dataframe(periods=120, freq='H')

forecast = m.predict(future)

df_prediction_sts_debit = forecast.loc[forecast['ds']>= date_debut]
df_prediction_sts_debit.rename(columns={'yhat':'debit_horaire', 'ds':'datetime'}, inplace=True)
df_prediction_sts_debit = df_prediction_sts_debit[['debit_horaire', 'datetime']]


17:58:51 - cmdstanpy - INFO - Chain [1] start processing
17:58:52 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction_sts_debit.rename(columns={'yhat':'debit_horaire', 'ds':'datetime'}, inplace=True)


In [65]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = df_sts_full.index
df_prophet['y'] = df_sts_full['taux_occupation'].values

m = Prophet()
m.fit(df_prophet)
future = m.make_future_dataframe(periods=120, freq='H')

forecast = m.predict(future)

df_prediction_sts_taux = forecast.loc[forecast['ds']>= date_debut]
df_prediction_sts_taux.rename(columns={'yhat':'taux_occupation'}, inplace=True,)
df_prediction_sts_taux = df_prediction_sts_taux['taux_occupation']

17:59:01 - cmdstanpy - INFO - Chain [1] start processing
17:59:03 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction_sts_taux.rename(columns={'yhat':'taux_occupation'}, inplace=True,)


In [93]:
df_sts = pd.concat([df_prediction_sts_debit, df_prediction_sts_taux], axis=1)
df_sts['arc'] = 'Saint-Pères'
df_sts = df_sts.head(120)

In [68]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = df_champs_full.index
df_prophet['y'] = df_champs_full['debit_horaire'].values

m = Prophet()
m.fit(df_prophet)
future = m.make_future_dataframe(periods=120, freq='H')

forecast = m.predict(future)

df_prediction_champs_debit = forecast.loc[forecast['ds']>= date_debut]
df_prediction_champs_debit.rename(columns={'yhat':'debit_horaire', 'ds':'datetime'}, inplace=True)
df_prediction_champs_debit = df_prediction_champs_debit[['debit_horaire', 'datetime']]

17:59:44 - cmdstanpy - INFO - Chain [1] start processing
17:59:44 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction_champs_debit.rename(columns={'yhat':'debit_horaire', 'ds':'datetime'}, inplace=True)


In [70]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = df_champs_full.index
df_prophet['y'] = df_champs_full['taux_occupation'].values

m = Prophet()
m.fit(df_prophet)
future = m.make_future_dataframe(periods=120, freq='H')

forecast = m.predict(future)

df_prediction_champs_taux = forecast.loc[forecast['ds']>= date_debut]
df_prediction_champs_taux.rename(columns={'yhat':'taux_occupation'}, inplace=True,)
df_prediction_champs_taux = df_prediction_champs_taux['taux_occupation']

18:00:31 - cmdstanpy - INFO - Chain [1] start processing
18:00:32 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction_champs_taux.rename(columns={'yhat':'taux_occupation'}, inplace=True,)


In [92]:
df_champs = pd.concat([df_prediction_sts_debit, df_prediction_sts_taux], axis=1)
df_champs['arc'] = 'Champs-Elysées'
df_champs = df_champs.head(120)

In [95]:
df_predictions = pd.concat([df_champs, df_sts, df_convention], axis=0)

df_predictions.head()

Unnamed: 0,debit_horaire,datetime,taux_occupation,arc
9644,427.846555,2023-12-08 00:00:00,6.246563,Champs-Elysées
9645,381.808007,2023-12-08 01:00:00,5.552406,Champs-Elysées
9646,321.087564,2023-12-08 02:00:00,4.708809,Champs-Elysées
9647,245.417773,2023-12-08 03:00:00,3.78617,Champs-Elysées
9648,177.382058,2023-12-08 04:00:00,3.02464,Champs-Elysées


### Export and check output

In [96]:
from datetime import datetime
from pathlib import Path
def test_format_and_export_output(output_df, output_directory, name_of_the_group):
    output_columns = {"arc": object, "datetime": object, "debit_horaire": float, "taux_occupation": float}
# 1. Check relevant columns are in output dataframe 
    assert sorted(list(output_df.columns)) == list(        
    output_columns.keys()   ), "Some columns are missing or unnecessary columns are in output"
    #2. Check types    
    for col, col_type in output_columns.items():
        assert output_df[col].dtype == col_type, f"Column {col} does not have type {col_type}"
    # 3. Check datetime string has right format    
    try:
        output_df.datetime.apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M"))
    except ValueError as e:
        raise e
    # 4. Check `arc` columns has right values
    assert sorted(list(output_df["arc"].unique())) == [        
        "Champs-Elysées",
           "Convention",
           "Saint-Pères",
           ], "Output does not have expected unique values for column `arc`"
    # 5. Check dataframe has right number of rows
    assert output_df.shape[0] == 360, f"Expected number of rows is 360, output has {output_df.shape[0]}"
    # 6. Export output    
    output_path = Path(output_directory) / f"output_{name_of_the_group}.csv"
    print(f"[SAVE OUTPUT] Saving output here: {output_path}")  
    output_df[output_columns.keys()].to_csv(output_path, sep=";")

test_format_and_export_output(df_predictions, "../../datathon_bcg/", "ColissiMONSTRE")


AssertionError: Column datetime does not have type <class 'object'>

In [104]:
df_predictions['datetime'] = pd.to_datetime(df_predictions['datetime'], format=r'%Y-%m-%d %H:%M').dt.floor('min')

In [109]:
df_predictions['datetime'] = df_predictions['datetime'].astype(str)

In [116]:
df_predictions['datetime'] = (df_predictions['datetime'])]

KeyError: 'key of type tuple not found and not a MultiIndex'

In [115]:
df_predictions

Unnamed: 0,debit_horaire,datetime,taux_occupation,arc
9644,427.846555,2023-12-08 00:00:00,6.246563,Champs-Elysées
9645,381.808007,2023-12-08 01:00:00,5.552406,Champs-Elysées
9646,321.087564,2023-12-08 02:00:00,4.708809,Champs-Elysées
9647,245.417773,2023-12-08 03:00:00,3.786170,Champs-Elysées
9648,177.382058,2023-12-08 04:00:00,3.024640,Champs-Elysées
...,...,...,...,...
9759,1358.353982,,36.183168,Convention
9760,1339.582191,,34.792718,Convention
9761,1264.628612,,32.111628,Convention
9762,1165.773798,,29.006364,Convention


In [100]:
df_predictions.to_csv('output_ColissiMONSTRE.csv', sep=';')

In [105]:
df_predictions_final = df_predictions.head(360)

In [99]:
df_predictions.head()

Unnamed: 0,debit_horaire,datetime,taux_occupation,arc
9644,427.846555,2023-12-08 00:00:00,6.246563,Champs-Elysées
9645,381.808007,2023-12-08 01:00:00,5.552406,Champs-Elysées
9646,321.087564,2023-12-08 02:00:00,4.708809,Champs-Elysées
9647,245.417773,2023-12-08 03:00:00,3.78617,Champs-Elysées
9648,177.382058,2023-12-08 04:00:00,3.02464,Champs-Elysées


In [98]:
df_predictions_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 360 entries, 9644 to 9761
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   debit_horaire    360 non-null    float64       
 1   datetime         360 non-null    datetime64[ns]
 2   taux_occupation  360 non-null    float64       
 3   arc              360 non-null    object        
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 14.1+ KB
