In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [None]:
# Load the raw traffic-count CSV, keep a single road segment, index it by
# count timestamp (UTC) and save the filtered result next to the input file.

filepath = 'data/comptages-routiers-permanents-convention.csv'

# Keyword looked up in the (lower-cased) file path -> (upstream node, downstream node)
# of the segment to keep. Order matters: the first keyword found in the path wins.
SEGMENT_NODES = {
    "washington": ('Av_Champs_Elysees-Washington', 'Av_Champs_Elysees-Berri'),
    "convention": ('Convention-Blomet', 'Lecourbe-Convention'),
    "st_antoine": ('Bastille-St_Antoine', 'St_Antoine-Jacques_Coeur'),
}

df = pd.read_csv(filepath, delimiter=';')

# Keep only rows whose upstream AND downstream node labels match the segment.
# If no keyword matches the path, the data is left unfiltered.
for keyword, (upstream_node, downstream_node) in SEGMENT_NODES.items():
    if keyword in filepath.lower():
        on_segment = (df['Libelle noeud amont'] == upstream_node) & (df['Libelle noeud aval'] == downstream_node)
        # .copy() detaches the filtered rows from the raw frame, so the column
        # assignment below does not raise SettingWithCopyWarning.
        df = df[on_segment].copy()
        break

# Convert "Date et heure de comptage" to timezone-aware (UTC) datetimes;
# values that cannot be parsed become NaT because of errors='coerce'.
df['Date et heure de comptage'] = pd.to_datetime(df['Date et heure de comptage'], errors='coerce', utc=True)

# Move the timestamps into the index and order rows chronologically.
# Note: the timestamp column no longer exists as a regular column after this step.
df = df.set_index('Date et heure de comptage').sort_index()

# Save file into {filename}-filtered.csv
df.to_csv(filepath.replace(".csv", "-filtered.csv"), sep=";", index=True)

print(f"Number of rows: {df.shape[0]}")

In [None]:
# Plot a histogram over the dates, weighted by the hourly flow, so that each
# bar shows the total flow recorded in its date bin.
# Rows without a usable timestamp or flow value are excluded first: a NaT date
# can make the bin range undefined, and a NaN weight turns the total of its
# bin into NaN (the bar then silently disappears).
hist_data = df.loc[df.index.notna(), 'Débit horaire'].dropna()

plt.figure(figsize=(12, 6))
plt.hist(hist_data.index, bins=50, weights=hist_data, color='skyblue', edgecolor='black')
plt.title('Histogramme du débit horaire en fonction de la date')
plt.xlabel('Date')
plt.ylabel('Débit total')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Select the part of df that covers the chosen period (one day, UTC bounds inclusive)
sept_debit_df = df.loc['2024-01-14 00:00:00+00:00':'2024-01-14 23:00:00+00:00']

# Draw the hourly flow for that period on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    sept_debit_df.index,
    sept_debit_df['Débit horaire'],  # replace "Débit horaire" with the exact column name if it differs
    label='Débit horaire',
    color='blue',
)

# Show only the time of day on the x axis, with one major tick per hour
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax.xaxis.set_major_locator(mdates.HourLocator(interval=1))

# Tilt the tick labels so they stay readable
plt.xticks(rotation=45, fontsize=10)

# Title, axis labels, legend and grid
ax.set_title('Évolution du débit horaire')
ax.set_xlabel('Heure')
ax.set_ylabel('Débit')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Preview the first five rows of the filtered, time-indexed data
df.head(5)

In [None]:
# Print a concise summary of the DataFrame: index type, column names,
# non-null counts, dtypes and memory usage
df.info()

In [None]:
# Count the missing (NaN/NaT) entries of every column
df.isna().sum()

In [None]:
# Convert date columns to timezone-aware (UTC) datetimes.
# 'Date et heure de comptage' is no longer a column: set_index moved it into the
# index when the data was loaded, so df['Date et heure de comptage'] raises
# KeyError. It is therefore (re)converted on the index, which is a no-op when the
# index already holds UTC datetimes.
df.index = pd.to_datetime(df.index, utc=True)
df['Date debut dispo data'] = pd.to_datetime(df['Date debut dispo data'], utc=True)
df['Date fin dispo data'] = pd.to_datetime(df['Date fin dispo data'], utc=True)

In [None]:
# Extract calendar features from the 'Date et heure de comptage' timestamps.
# The timestamps live in the index (set_index consumed the column when the data
# was loaded), so they are read from df.index; df['Date et heure de comptage']
# would raise KeyError here.
count_times = df.index
df['Year'] = count_times.year
df['Month'] = count_times.month
df['Day'] = count_times.day
df['Hour'] = count_times.hour
df['DayOfWeek'] = count_times.dayofweek  # Monday=0 ... Sunday=6

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max)
# for the numerical columns
df.describe()

In [None]:
# Distribution of 'Débit horaire': 30-bin histogram with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Débit horaire'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Débit horaire')
ax.set_xlabel('Débit horaire')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of "Taux d'occupation": 30-bin histogram with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["Taux d'occupation"], kde=True, bins=30, ax=ax)
ax.set_title("Distribution of Taux d'occupation")
ax.set_xlabel("Taux d'occupation")
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Pairwise correlations between the numerical columns, shown as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Check for duplicate rows.
# DataFrame.duplicated() compares column values only and ignores the index.
# The count timestamp is stored in the index, so it is turned back into a
# column for the comparison; otherwise two different hours with identical
# measurements could be reported as duplicates of each other.
is_duplicate = df.reset_index().duplicated().to_numpy()  # positional mask, no index alignment needed
duplicate_rows = df[is_duplicate]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")

In [None]:
# Build the cleaned frame: keep only rows where both target variables are present
target_columns = ['Débit horaire', "Taux d'occupation"]
df_clean = df.dropna(subset=target_columns)

In [None]:
# Mean 'Débit horaire' for each hour of the day (seaborn aggregates rows sharing the same 'Hour')
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df_clean, x='Hour', y='Débit horaire', estimator='mean', ax=ax)
ax.set_title('Average Débit horaire by Hour')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Débit horaire')
plt.show()

In [None]:
# Mean "Taux d'occupation" for each hour of the day (seaborn aggregates rows sharing the same 'Hour')
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df_clean, x='Hour', y="Taux d'occupation", estimator='mean', ax=ax)
ax.set_title("Average Taux d'occupation by Hour")
ax.set_xlabel('Hour of the Day')
ax.set_ylabel("Average Taux d'occupation")
plt.show()

In [None]:
# Compare 'Débit horaire' across the categories of 'Etat trafic' (one bar per category)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_clean, x='Etat trafic', y='Débit horaire', ax=ax)
ax.set_title('Average Débit horaire by Etat trafic')
ax.set_xlabel('Etat trafic')
ax.set_ylabel('Average Débit horaire')
plt.show()

In [None]:
# Compare "Taux d'occupation" across the categories of 'Etat trafic' (one bar per category)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_clean, x='Etat trafic', y="Taux d'occupation", ax=ax)
ax.set_title("Average Taux d'occupation by Etat trafic")
ax.set_xlabel('Etat trafic')
ax.set_ylabel("Average Taux d'occupation")
plt.show()