In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb

# Load the encoder, scaler, and model
encoder = joblib.load('encoder.joblib')
scaler = joblib.load('scaler.joblib')
model = joblib.load('best_xgboost_model_gridsearch.joblib')

# Load the dataset
df_raw = pd.read_csv('../../Train.csv')
df_raw = df_raw.tail(10)
# Convert the 'date_time' column to datetime and sort the dataset
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'])
df_raw.sort_values('date_time', inplace=True)

# Extracting non-numeric columns
non_numeric_cols = ['is_holiday', 'weather_type', 'weather_description']

# Group by 'date_time' and aggregate: mean for numeric columns, mode for non-numeric columns
agg_funcs = {col: 'mean' for col in df_raw.columns if col not in non_numeric_cols}
agg_funcs.update({col: lambda x: x.mode()[0] if not x.mode().empty else np.nan for col in non_numeric_cols})

df_aggregated = df_raw.groupby('date_time').agg(agg_funcs)


In [13]:
df_aggregated

Unnamed: 0_level_0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,traffic_volume,is_holiday,weather_type,weather_description
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-05-17 20:00:00,2017-05-17 20:00:00,173.666667,86.0,1.0,326.0,4.0,4.0,288.89,0.0,0.0,90.0,2733.0,,Mist,heavy intensity rain
2017-05-17 21:00:00,2017-05-17 21:00:00,122.666667,85.0,1.0,328.0,6.666667,6.666667,287.88,0.0,0.0,90.0,2348.0,,Mist,light rain
2017-05-17 22:00:00,2017-05-17 22:00:00,109.5,70.0,1.0,24.0,1.5,1.5,286.95,0.0,0.0,90.0,2194.0,,Mist,heavy intensity rain
2017-05-17 23:00:00,2017-05-17 23:00:00,184.5,64.5,1.0,34.5,7.0,7.0,285.75,0.0,0.0,90.0,1328.0,,Mist,heavy intensity rain


In [14]:
# One-hot encode categorical features
encoded_data = encoder.fit_transform(df_aggregated[non_numeric_cols])

df_encode = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())




In [15]:
# Reset index of df_encode
df_encode.index = df_aggregated.index

In [16]:
df_encode.columns

Index(['is_holiday_None', 'weather_type_Mist',
       'weather_description_heavy intensity rain',
       'weather_description_light rain'],
      dtype='object')

In [17]:
# Concatenate with df_aggregated
df = pd.concat([df_aggregated, df_encode], axis=1)

# Add hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
df = df.drop(columns=non_numeric_cols)

In [18]:
df.shape

(4, 17)

In [19]:
# Feature engineering: create lagged and rolling features
target = 'traffic_volume'
for i in range(1, 4):
    df[f'traffic_volume_lag_{i}'] = df[target].shift(i)
df['traffic_volume_rolling_mean'] = df[target].rolling(window=3).mean().shift(1)
df['traffic_volume_rolling_std'] = df[target].rolling(window=3).std().shift(1)

# Remove rows with NaN values resulting from lagged features
df.dropna(inplace=True)

# Split the dataset into features and the target
X = df.drop(target, axis=1)
y = df[target]

# Save 'date_time' for later use
date_time = df['date_time']

# Drop 'date_time' column before scaling
df = df.drop(columns=['date_time'])

In [20]:
df.shape

(1, 21)

In [21]:
scaler = joblib.load('scaler2.joblib')
# Check categories in encoder
print(scaler.get_feature_names_out())

['air_pollution_index' 'humidity' 'wind_speed' 'wind_direction'
 'visibility_in_miles' 'dew_point' 'temperature' 'rain_p_h' 'snow_p_h'
 'clouds_all' 'traffic_volume' 'is_holiday_Christmas Day'
 'is_holiday_Columbus Day' 'is_holiday_Independence Day'
 'is_holiday_Labor Day' 'is_holiday_Martin Luther King Jr Day'
 'is_holiday_Memorial Day' 'is_holiday_New Years Day' 'is_holiday_None'
 'is_holiday_State Fair' 'is_holiday_Thanksgiving Day'
 'is_holiday_Veterans Day' 'is_holiday_Washingtons Birthday'
 'weather_type_Clear' 'weather_type_Clouds' 'weather_type_Drizzle'
 'weather_type_Fog' 'weather_type_Haze' 'weather_type_Mist'
 'weather_type_Rain' 'weather_type_Snow' 'weather_type_Squall'
 'weather_type_Thunderstorm' 'weather_description_SQUALLS'
 'weather_description_Sky is Clear' 'weather_description_broken clouds'
 'weather_description_drizzle' 'weather_description_few clouds'
 'weather_description_fog' 'weather_description_freezing rain'
 'weather_description_haze' 'weather_description_he

In [22]:
# Scale the numerical features
df_scaled = scaler.transform(df)  # Use the previously loaded scaler

# Convert scaled data back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=[col for col in df.columns if col != 'date_time'])
df_scaled['date_time'] = date_time.values

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- is_holiday_Christmas Day
- is_holiday_Columbus Day
- is_holiday_Independence Day
- is_holiday_Labor Day
- is_holiday_Martin Luther King Jr Day
- ...


In [None]:
df_scaled.shape

In [None]:
X = df_scaled.drop(columns=['date_time'])

In [None]:
X = X.drop(target, axis=1)

In [None]:
df = pd.concat([date_time, df], axis=1)

In [None]:
X

In [None]:
# Function to update the lagged features with the new prediction
def update_lagged_features(df, new_prediction, max_lags=3):
    for i in range(max_lags-1, 0, -1):
        df[f'traffic_volume_lag_{i+1}'] = df[f'traffic_volume_lag_{i}']
    df['traffic_volume_lag_1'] = new_prediction

# Initialize DataFrame for dynamic forecasting
df_dynamic_forecast = X.copy()
df_dynamic_forecast['forecasted_traffic_volume'] = np.nan

In [None]:
df_dynamic_forecast

In [None]:
# Number of steps to forecast
forecast_steps = 2000

for i in range(forecast_steps):
    # Predict the traffic volume for the next time step
    current_prediction = model.predict(df_dynamic_forecast.iloc[i:i+1].drop(columns=['forecasted_traffic_volume']))[0]
    df_dynamic_forecast.at[df_dynamic_forecast.index[i], 'forecasted_traffic_volume'] = current_prediction

    # Update lagged features with the new prediction for the next iteration
    if i + 1 < forecast_steps:
        update_lagged_features(df_dynamic_forecast.iloc[i + 1], current_prediction)

In [None]:
# Initialize the DataFrame
test_date_times = df['date_time'].reset_index(drop=True)

df_result = pd.DataFrame({
    'date_time': test_date_times,
    'actual_traffic_volume': y.reset_index(drop=True),
    'lag_1': np.nan,
    'lag_2': np.nan,
    'lag_3': np.nan,
    'forecasted_traffic_volume': df_dynamic_forecast['forecasted_traffic_volume'].reset_index(drop=True)
})

# Set the initial lagged values from the historical data
df_result.loc[0, 'lag_1'] = df.loc[df.index[-1], 'traffic_volume']
df_result.loc[0, 'lag_2'] = df.loc[df.index[-2], 'traffic_volume']
df_result.loc[0, 'lag_3'] = df.loc[df.index[-3], 'traffic_volume']


# Update the lagged values with the forecasted values in each step
for i in range(1, len(df_result)):
    df_result.loc[i, 'lag_1'] = df_result.loc[i - 1, 'forecasted_traffic_volume']
    df_result.loc[i, 'lag_2'] = df_result.loc[i - 1, 'lag_1']
    df_result.loc[i, 'lag_3'] = df_result.loc[i - 1, 'lag_2']

# Display the DataFrame
print(df_result.head(forecast_steps))
