In [None]:
%pip install pandas numpy matplotlib seaborn autogluon

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from autogluon.timeseries import TimeSeriesDataFrame

In [None]:
import data_loader

# Name of the street / counting site analysed in this notebook.
# It is defined once here because the AutoGluon preparation cell further down
# builds the series identifier from `street`; without this assignment that cell
# raises NameError on a fresh kernel (Restart & Run All).
street = "washington"

# Load the raw observations for the selected street.
df = data_loader.load_data(street)

In [None]:
# Print a concise summary of the freshly loaded DataFrame (output of DataFrame.info())
df.info()

In [None]:
# Check whether the 'Date et heure de comptage' timestamps form a gap-free hourly series
print("Checking if all the 'Date et heure de comptage' are every 1 hour")

# Every hourly timestamp expected between the first and the last observation
observed = df['Date et heure de comptage']
full_range = pd.date_range(start=observed.min(), end=observed.max(), freq='h')

# Expected timestamps that never occur in the data
missing_dates = full_range.difference(observed)

# Report the outcome
if not missing_dates.empty:
    print(f"Missing dates ({len(missing_dates)}):")
    print(missing_dates)
else:
    print("No missing dates!")

In [None]:
# Visualise, hour by hour, whether an observation exists in `df`
# (1 = timestamp present, 0 = timestamp missing).
ts_col = 'Date et heure de comptage'

# Complete hourly grid spanning the observed period
full_range = pd.date_range(start=df[ts_col].min(), end=df[ts_col].max(), freq='h')

# One row per expected hour, flagged 1/0 for present/missing, indexed by timestamp
full_df = (
    pd.DataFrame({ts_col: full_range})
    .assign(**{'Data Present': lambda grid: grid[ts_col].isin(df[ts_col]).astype(int)})
    .set_index(ts_col)
)

# Step plot of the availability flag over time
fig, ax = plt.subplots(figsize=(15, 5))
full_df['Data Present'].plot(ax=ax, drawstyle='steps-post', color='blue')
ax.set_xlabel('Date and Time')
ax.set_ylabel('Data Present (1) or Missing (0)')
ax.set_title('Data Availability Over Time')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Put `df` on a complete hourly grid so that missing hours become explicit rows.
ts_col = 'Date et heure de comptage'

# Complete hourly grid spanning the observed period
full_range = pd.date_range(start=df[ts_col].min(), end=df[ts_col].max(), freq='h')

# Index by timestamp, align to the full grid, then turn the index back into a
# regular column carrying the original column name.
# Hours absent from the data are left as NaN in every other column; no filling
# is applied here.
df = (
    df.set_index(ts_col)
    .reindex(full_range)
    .rename_axis(ts_col)
    .reset_index()
)

In [None]:
# Count the null entries per column of `df`
# (after the hourly reindex above, rows added for absent hours show up here)
df.isnull().sum()

In [None]:
# Derive calendar features from the 'Date et heure de comptage' column.
# Each pair is (new column name, attribute of the `.dt` accessor).
timestamps = df['Date et heure de comptage']
calendar_features = [
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('DayOfWeek', 'dayofweek'),
]
for feature_name, dt_attribute in calendar_features:
    df[feature_name] = getattr(timestamps.dt, dt_attribute)

In [None]:
# Display summary statistics of `df` as produced by DataFrame.describe()
df.describe()

In [None]:
# Histogram (30 bins) of 'Débit horaire' with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Débit horaire', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Débit horaire')
ax.set_xlabel('Débit horaire')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram (30 bins) of "Taux d'occupation" with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Taux d'occupation", bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Taux d'occupation")
ax.set_xlabel("Taux d'occupation")
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns of `df`
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Mean 'Débit horaire' for each hour of the day
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df, x='Hour', y='Débit horaire', estimator='mean', ax=ax)
ax.set_title('Average Débit horaire by Hour')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Débit horaire')
plt.show()

In [None]:
# Mean "Taux d'occupation" for each hour of the day
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df, x='Hour', y="Taux d'occupation", estimator='mean', ax=ax)
ax.set_title("Average Taux d'occupation by Hour")
ax.set_xlabel('Hour of the Day')
ax.set_ylabel("Average Taux d'occupation")
plt.show()

In [None]:
# Bar chart of 'Débit horaire' grouped by the 'Etat trafic' category
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Etat trafic', y='Débit horaire', ax=ax)
ax.set_title('Average Débit horaire by Etat trafic')
ax.set_xlabel('Etat trafic')
ax.set_ylabel('Average Débit horaire')
plt.show()

In [None]:
# Bar chart of "Taux d'occupation" grouped by the 'Etat trafic' category
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Etat trafic', y="Taux d'occupation", ax=ax)
ax.set_title("Average Taux d'occupation by Etat trafic")
ax.set_xlabel('Etat trafic')
ax.set_ylabel("Average Taux d'occupation")
plt.show()

## Training Chronos with AutoGluon

In [None]:
# Prepare `df` for conversion to a TimeSeriesDataFrame: add an `item_id` column
# and expose the timestamp and the value to forecast as `timestamp` and `target`.
ts_col = "Date et heure de comptage"

# A single series is modelled, so every row gets the same identifier.
# NOTE: `street` must already be defined by an earlier cell of the notebook.
df['item_id'] = 'series_' + street

# Drop the timezone information from the timestamps (tz_localize(None))
df[ts_col] = df[ts_col].dt.tz_localize(None)

# Rename the timestamp column to "timestamp" and "Débit horaire" to "target"
df.rename(columns={ts_col: "timestamp", "Débit horaire": "target"}, inplace=True)

In [None]:
# Convert the prepared pandas frame into AutoGluon's TimeSeriesDataFrame,
# using `item_id` as the series identifier and `timestamp` as the time column.
ts_df = TimeSeriesDataFrame.from_data_frame(df, id_column='item_id', timestamp_column='timestamp')

In [None]:
# Chronological train/test split: the test set is everything after a cutoff
# placed one month and twenty days before the last timestamp in `df`.
last_timestamp = df["timestamp"].max()
cutoff_date = last_timestamp - pd.DateOffset(months=1) - pd.DateOffset(days=20)
print(f"Test set cutoff date: {cutoff_date}")

# Timestamp level of the (item_id, timestamp) index, reused for both masks
timestamp_level = ts_df.index.get_level_values("timestamp")

# Rows up to and including the cutoff go to training; later rows go to testing
train_data = ts_df.loc[timestamp_level <= cutoff_date]
test_data = ts_df.loc[timestamp_level > cutoff_date]

In [None]:
import torch

# Pick the torch device: Apple's MPS backend when torch reports it available,
# otherwise the CPU.
mps_available = torch.backends.mps.is_available()
device = torch.device("mps" if mps_available else "cpu")
print("MPS device is available." if mps_available else "MPS device is not available; using CPU.")

In [None]:
from autogluon.timeseries import TimeSeriesPredictor

# Forecast horizon in time steps: 5 days of hourly observations (24 * 5 = 120).
prediction_length = 24 * 5

# Initialize the predictor (verbosity=3 requests detailed logging).
predictor = TimeSeriesPredictor(
    prediction_length=prediction_length,
    verbosity=3,
)

# Chronos-Bolt (tiny) checkpoint with fine-tuning enabled.
chronos_hyperparameters = {
    "model_path": "amazon/chronos-bolt-tiny",
    "fine_tune": True,
}

# AutoGluon's Chronos model chooses its hardware through the `device`
# hyperparameter; the previously passed "use_mps" key is not one of its
# documented hyperparameters, so the MPS request never reached the model.
# Forward the device detected in the cell above only when it is MPS, so that
# on every other machine AutoGluon keeps its own default device selection.
# NOTE(review): confirm that fine-tuning on MPS works with the installed
# AutoGluon / torch versions.
if device.type == "mps":
    chronos_hyperparameters["device"] = "mps"

# Train the predictor with fine-tuning on the training split.
predictor.fit(
    train_data=train_data,
    hyperparameters={"Chronos": chronos_hyperparameters},
)

In [None]:
# Forecast from the end of the training split, using only its last 100 rows as context.
# NOTE(review): the 100-row context is shorter than the 120-step horizon — confirm
# this truncation is intentional.
recent_context = train_data.iloc[-100:]
predictions = predictor.predict(recent_context)

In [None]:
# Display the forecast table returned by predictor.predict()
predictions

In [None]:
# Evaluate the fitted predictor on the held-out split (rows after `cutoff_date`)
# and print whatever predictor.evaluate() returns
performance = predictor.evaluate(test_data)
print(performance)

In [None]:
# Display the forecast table again (`predictions` has not been reassigned since the predict cell)
predictions

In [None]:
# Display the held-out split (rows with timestamp after `cutoff_date`)
test_data

In [None]:
# Print the train/test cutoff timestamp for reference
print(cutoff_date)

In [None]:
# Overlay the forecast on the held-out observations.
# `plt` comes from the import cell at the top of the notebook.
fig, ax = plt.subplots(figsize=(15, 5))

# (frame, column to plot, legend label, line style) for each curve:
# actual values as a solid line, mean forecast as a dashed line.
curves = [
    (test_data, 'target', 'Actual Values', '-'),
    (predictions, 'mean', 'Predicted Values', '--'),
]
for frame, column, label, linestyle in curves:
    ax.plot(frame.index.get_level_values('timestamp'),
            frame[column],
            label=label,
            linestyle=linestyle)

# Labels, legend and layout
ax.set_xlabel('Date and Time')
ax.set_ylabel('Target Value')
ax.set_title('Predictions vs. Actual Values')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()