In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb

# Load the encoder, scaler, and model
encoder = joblib.load('encoder.joblib')
scaler = joblib.load('scaler.joblib')
model = joblib.load('best_xgboost_model_gridsearch.joblib')

# Load the dataset
df_raw = pd.read_csv('../../Train.csv')
#df_raw = df_raw.tail(6)
# Convert the 'date_time' column to datetime and sort the dataset
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'])
df_raw.sort_values('date_time', inplace=True)

# Extracting non-numeric columns
non_numeric_cols = ['is_holiday', 'weather_type', 'weather_description']

# Group by 'date_time' and aggregate: mean for numeric columns, mode for non-numeric columns
agg_funcs = {col: 'mean' for col in df_raw.columns if col not in non_numeric_cols}
agg_funcs.update({col: lambda x: x.mode()[0] if not x.mode().empty else np.nan for col in non_numeric_cols})

df_aggregated = df_raw.groupby('date_time').agg(agg_funcs)


In [2]:
df_aggregated

Unnamed: 0_level_0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,traffic_volume,is_holiday,weather_type,weather_description
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-10-02 09:00:00,2012-10-02 09:00:00,121.000000,89.0,2.0,329.0,1.000000,1.000000,288.28,0.0,0.0,40.0,5545.0,,Clouds,scattered clouds
2012-10-02 10:00:00,2012-10-02 10:00:00,178.000000,67.0,3.0,330.0,1.000000,1.000000,289.36,0.0,0.0,75.0,4516.0,,Clouds,broken clouds
2012-10-02 11:00:00,2012-10-02 11:00:00,113.000000,66.0,3.0,329.0,2.000000,2.000000,289.58,0.0,0.0,90.0,4767.0,,Clouds,overcast clouds
2012-10-02 12:00:00,2012-10-02 12:00:00,20.000000,66.0,3.0,329.0,5.000000,5.000000,290.13,0.0,0.0,90.0,5026.0,,Clouds,overcast clouds
2012-10-02 13:00:00,2012-10-02 13:00:00,281.000000,65.0,3.0,329.0,7.000000,7.000000,291.14,0.0,0.0,75.0,4918.0,,Clouds,broken clouds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-05-17 19:00:00,2017-05-17 19:00:00,112.333333,84.0,1.0,321.0,5.000000,5.000000,290.56,0.0,0.0,90.0,3136.0,,Drizzle,light intensity drizzle
2017-05-17 20:00:00,2017-05-17 20:00:00,173.666667,86.0,1.0,326.0,4.000000,4.000000,288.89,0.0,0.0,90.0,2733.0,,Mist,heavy intensity rain
2017-05-17 21:00:00,2017-05-17 21:00:00,122.666667,85.0,1.0,328.0,6.666667,6.666667,287.88,0.0,0.0,90.0,2348.0,,Mist,light rain
2017-05-17 22:00:00,2017-05-17 22:00:00,109.500000,70.0,1.0,24.0,1.500000,1.500000,286.95,0.0,0.0,90.0,2194.0,,Mist,heavy intensity rain


In [3]:
# One-hot encode categorical features
encoded_data = encoder.fit_transform(df_aggregated[non_numeric_cols])

df_encode = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())




In [4]:
# Reset index of df_encode
df_encode.index = df_aggregated.index

In [5]:
df_encode.columns

Index(['is_holiday_Christmas Day', 'is_holiday_Columbus Day',
       'is_holiday_Independence Day', 'is_holiday_Labor Day',
       'is_holiday_Martin Luther King Jr Day', 'is_holiday_Memorial Day',
       'is_holiday_New Years Day', 'is_holiday_None', 'is_holiday_State Fair',
       'is_holiday_Thanksgiving Day', 'is_holiday_Veterans Day',
       'is_holiday_Washingtons Birthday', 'weather_type_Clear',
       'weather_type_Clouds', 'weather_type_Drizzle', 'weather_type_Fog',
       'weather_type_Haze', 'weather_type_Mist', 'weather_type_Rain',
       'weather_type_Snow', 'weather_type_Squall', 'weather_type_Thunderstorm',
       'weather_description_SQUALLS', 'weather_description_Sky is Clear',
       'weather_description_broken clouds', 'weather_description_drizzle',
       'weather_description_few clouds', 'weather_description_fog',
       'weather_description_freezing rain', 'weather_description_haze',
       'weather_description_heavy intensity drizzle',
       'weather_description

In [6]:
# Concatenate with df_aggregated
df = pd.concat([df_aggregated, df_encode], axis=1)

# Add hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
df = df.drop(columns=non_numeric_cols)

In [7]:
df.shape

(28589, 65)

In [8]:
# Feature engineering: create lagged and rolling features
target = 'traffic_volume'
for i in range(1, 4):
    df[f'traffic_volume_lag_{i}'] = df[target].shift(i)
df['traffic_volume_rolling_mean'] = df[target].rolling(window=3).mean().shift(1)
df['traffic_volume_rolling_std'] = df[target].rolling(window=3).std().shift(1)

In [9]:
df

Unnamed: 0_level_0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,weather_description_thunderstorm with heavy rain,weather_description_thunderstorm with light drizzle,weather_description_thunderstorm with light rain,weather_description_very heavy rain,hour,traffic_volume_lag_1,traffic_volume_lag_2,traffic_volume_lag_3,traffic_volume_rolling_mean,traffic_volume_rolling_std
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-02 09:00:00,2012-10-02 09:00:00,121.000000,89.0,2.0,329.0,1.000000,1.000000,288.28,0.0,0.0,...,0.0,0.0,0.0,0.0,9,,,,,
2012-10-02 10:00:00,2012-10-02 10:00:00,178.000000,67.0,3.0,330.0,1.000000,1.000000,289.36,0.0,0.0,...,0.0,0.0,0.0,0.0,10,5545.0,,,,
2012-10-02 11:00:00,2012-10-02 11:00:00,113.000000,66.0,3.0,329.0,2.000000,2.000000,289.58,0.0,0.0,...,0.0,0.0,0.0,0.0,11,4516.0,5545.0,,,
2012-10-02 12:00:00,2012-10-02 12:00:00,20.000000,66.0,3.0,329.0,5.000000,5.000000,290.13,0.0,0.0,...,0.0,0.0,0.0,0.0,12,4767.0,4516.0,5545.0,4942.666667,536.520581
2012-10-02 13:00:00,2012-10-02 13:00:00,281.000000,65.0,3.0,329.0,7.000000,7.000000,291.14,0.0,0.0,...,0.0,0.0,0.0,0.0,13,5026.0,4767.0,4516.0,4769.666667,255.010457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-05-17 19:00:00,2017-05-17 19:00:00,112.333333,84.0,1.0,321.0,5.000000,5.000000,290.56,0.0,0.0,...,0.0,0.0,0.0,0.0,19,4864.0,5763.0,6426.0,5684.333333,783.965773
2017-05-17 20:00:00,2017-05-17 20:00:00,173.666667,86.0,1.0,326.0,4.000000,4.000000,288.89,0.0,0.0,...,0.0,0.0,0.0,0.0,20,3136.0,4864.0,5763.0,4587.666667,1335.122591
2017-05-17 21:00:00,2017-05-17 21:00:00,122.666667,85.0,1.0,328.0,6.666667,6.666667,287.88,0.0,0.0,...,0.0,0.0,0.0,0.0,21,2733.0,3136.0,4864.0,3577.666667,1132.074350
2017-05-17 22:00:00,2017-05-17 22:00:00,109.500000,70.0,1.0,24.0,1.500000,1.500000,286.95,0.0,0.0,...,0.0,0.0,0.0,0.0,22,2348.0,2733.0,3136.0,2739.000000,394.034262


In [8]:


# Remove rows with NaN values resulting from lagged features
df.dropna(inplace=True)

# Split the dataset into features and the target
X = df.drop(target, axis=1)
y = df[target]

# Save 'date_time' for later use
date_time = df['date_time']

# Drop 'date_time' column before scaling
df = df.drop(columns=['date_time'])

In [9]:
df.shape

(28586, 69)

In [10]:
scaler = joblib.load('scaler2.joblib')
# Check categories in encoder
print(scaler.get_feature_names_out())

['air_pollution_index' 'humidity' 'wind_speed' 'wind_direction'
 'visibility_in_miles' 'dew_point' 'temperature' 'rain_p_h' 'snow_p_h'
 'clouds_all' 'traffic_volume' 'is_holiday_Christmas Day'
 'is_holiday_Columbus Day' 'is_holiday_Independence Day'
 'is_holiday_Labor Day' 'is_holiday_Martin Luther King Jr Day'
 'is_holiday_Memorial Day' 'is_holiday_New Years Day' 'is_holiday_None'
 'is_holiday_State Fair' 'is_holiday_Thanksgiving Day'
 'is_holiday_Veterans Day' 'is_holiday_Washingtons Birthday'
 'weather_type_Clear' 'weather_type_Clouds' 'weather_type_Drizzle'
 'weather_type_Fog' 'weather_type_Haze' 'weather_type_Mist'
 'weather_type_Rain' 'weather_type_Snow' 'weather_type_Squall'
 'weather_type_Thunderstorm' 'weather_description_SQUALLS'
 'weather_description_Sky is Clear' 'weather_description_broken clouds'
 'weather_description_drizzle' 'weather_description_few clouds'
 'weather_description_fog' 'weather_description_freezing rain'
 'weather_description_haze' 'weather_description_he

In [11]:
# Scale the numerical features
df_scaled = scaler.transform(df)  # Use the previously loaded scaler

# Convert scaled data back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=[col for col in df.columns if col != 'date_time'])
df_scaled['date_time'] = date_time.values

In [12]:
df_scaled.shape

(28586, 70)

In [13]:
X = df_scaled.drop(columns=['date_time'])

In [14]:
X = X.drop(target, axis=1)

In [15]:
df = pd.concat([date_time, df], axis=1)

In [16]:
df

Unnamed: 0_level_0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,weather_description_thunderstorm with heavy rain,weather_description_thunderstorm with light drizzle,weather_description_thunderstorm with light rain,weather_description_very heavy rain,hour,traffic_volume_lag_1,traffic_volume_lag_2,traffic_volume_lag_3,traffic_volume_rolling_mean,traffic_volume_rolling_std
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-02 12:00:00,2012-10-02 12:00:00,20.000000,66.0,3.0,329.0,5.000000,5.000000,290.13,0.0,0.0,...,0.0,0.0,0.0,0.0,12,4767.0,4516.0,5545.0,4942.666667,536.520581
2012-10-02 13:00:00,2012-10-02 13:00:00,281.000000,65.0,3.0,329.0,7.000000,7.000000,291.14,0.0,0.0,...,0.0,0.0,0.0,0.0,13,5026.0,4767.0,4516.0,4769.666667,255.010457
2012-10-02 14:00:00,2012-10-02 14:00:00,23.000000,65.0,3.0,328.0,6.000000,6.000000,291.72,0.0,0.0,...,0.0,0.0,0.0,0.0,14,4918.0,5026.0,4767.0,4903.666667,130.093556
2012-10-02 15:00:00,2012-10-02 15:00:00,184.000000,64.0,3.0,328.0,7.000000,7.000000,293.17,0.0,0.0,...,0.0,0.0,0.0,0.0,15,5181.0,4918.0,5026.0,5041.666667,132.198084
2012-10-02 16:00:00,2012-10-02 16:00:00,167.000000,64.0,3.0,327.0,7.000000,7.000000,293.86,0.0,0.0,...,0.0,0.0,0.0,0.0,16,5584.0,5181.0,4918.0,5227.666667,335.443488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-05-17 19:00:00,2017-05-17 19:00:00,112.333333,84.0,1.0,321.0,5.000000,5.000000,290.56,0.0,0.0,...,0.0,0.0,0.0,0.0,19,4864.0,5763.0,6426.0,5684.333333,783.965773
2017-05-17 20:00:00,2017-05-17 20:00:00,173.666667,86.0,1.0,326.0,4.000000,4.000000,288.89,0.0,0.0,...,0.0,0.0,0.0,0.0,20,3136.0,4864.0,5763.0,4587.666667,1335.122591
2017-05-17 21:00:00,2017-05-17 21:00:00,122.666667,85.0,1.0,328.0,6.666667,6.666667,287.88,0.0,0.0,...,0.0,0.0,0.0,0.0,21,2733.0,3136.0,4864.0,3577.666667,1132.074350
2017-05-17 22:00:00,2017-05-17 22:00:00,109.500000,70.0,1.0,24.0,1.500000,1.500000,286.95,0.0,0.0,...,0.0,0.0,0.0,0.0,22,2348.0,2733.0,3136.0,2739.000000,394.034262


In [17]:
# Function to update the lagged features with the new prediction
def update_lagged_features(df, new_prediction, max_lags=3):
    for i in range(max_lags-1, 0, -1):
        df[f'traffic_volume_lag_{i+1}'] = df[f'traffic_volume_lag_{i}']
    df['traffic_volume_lag_1'] = new_prediction

# Initialize DataFrame for dynamic forecasting
df_dynamic_forecast = X.copy()
df_dynamic_forecast['forecasted_traffic_volume'] = np.nan

In [18]:
df_dynamic_forecast=df_dynamic_forecast.tail(10)

In [19]:
df_dynamic_forecast

Unnamed: 0,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,...,weather_description_thunderstorm with light drizzle,weather_description_thunderstorm with light rain,weather_description_very heavy rain,hour,traffic_volume_lag_1,traffic_volume_lag_2,traffic_volume_lag_3,traffic_volume_rolling_mean,traffic_volume_rolling_std,forecasted_traffic_volume
28576,-0.433287,-0.177709,-0.190144,1.436436,-1.609063,-1.609063,0.889263,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,0.357029,0.765184,0.866182,0.703241,0.830092,-0.823108,
28577,-0.086039,-0.533796,-0.675389,1.375541,0.811913,0.811913,0.927236,-0.007519,-0.024626,-0.140999,...,-0.005915,-0.013227,-0.016731,0.500661,0.964249,0.765142,0.866109,0.922856,-0.762361,
28578,1.271951,-0.533796,-1.160634,1.279124,0.206669,0.206669,0.898577,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,0.644292,1.167839,0.964207,0.765071,1.030097,-0.405282,
28579,0.899899,-0.622817,-0.675389,1.365392,0.004921,0.004921,0.851291,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,0.787924,1.588088,1.167796,0.964132,1.322686,0.004418,
28580,0.103088,-0.207383,-0.675389,1.388227,-0.398575,-0.398575,0.798274,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,0.931556,1.254804,1.588044,1.167717,1.425995,-0.333379,
28581,-0.528367,0.771856,-1.160634,1.233453,0.004921,0.004921,0.741674,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,1.075187,0.802886,1.254761,1.587958,1.296233,0.270885,
28582,0.232273,0.890552,-1.160634,1.284199,-0.398575,-0.398575,0.622025,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,1.218819,-0.065763,0.802843,1.254681,0.708195,1.242763,
28583,-0.400216,0.831204,-1.160634,1.304497,0.677414,0.677414,0.549663,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,1.362451,-0.268348,-0.065804,0.802772,0.166627,0.88472,
28584,-0.563505,-0.059013,-1.160634,-1.780842,-1.407315,-1.407315,0.483033,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,1.506082,-0.461883,-0.268388,-0.065859,-0.283071,-0.416698,
28585,0.366625,-0.385426,-1.160634,-1.674276,0.811913,0.811913,0.397058,-0.007519,-0.024626,1.150636,...,-0.005915,-0.013227,-0.016731,1.649714,-0.539298,-0.461923,-0.268439,-0.451439,-0.621963,


In [20]:
# Function to append new row with updated lagged features
def append_new_row(df, new_prediction, max_lags=3):
    new_row = df.iloc[-1].copy()  # Copy the last row to use as a base for the new row
    for i in range(max_lags-1, 0, -1):
        new_row[f'traffic_volume_lag_{i+1}'] = new_row[f'traffic_volume_lag_{i}']
    new_row['traffic_volume_lag_1'] = new_prediction
    new_row['forecasted_traffic_volume'] = np.nan  # Reset the forecasted value
    return df.append(new_row, ignore_index=True)

# Initialize DataFrame for dynamic forecasting
df_dynamic_forecast = X.copy()
df_dynamic_forecast['forecasted_traffic_volume'] = np.nan

# Number of steps to forecast
forecast_steps = 720

for i in range(forecast_steps):
    # Predict the traffic volume for the next time step
    current_prediction = model.predict(df_dynamic_forecast.iloc[i:i+1].drop(columns=['forecasted_traffic_volume']))[0]
    df_dynamic_forecast.at[df_dynamic_forecast.index[i], 'forecasted_traffic_volume'] = current_prediction

    # Append a new row with updated lagged features for the next prediction, if not at the last step
    if i + 1 < forecast_steps:
        df_dynamic_forecast = append_new_row(df_dynamic_forecast, current_prediction)


  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.append(new_row, ignore_index=True)
  return df.a

In [21]:
# Initialize the DataFrame
test_date_times = df['date_time'].reset_index(drop=True)

df_result = pd.DataFrame({
    'date_time': test_date_times,
    'actual_traffic_volume': y.reset_index(drop=True),
    'lag_1': np.nan,
    'lag_2': np.nan,
    'lag_3': np.nan,
    'forecasted_traffic_volume': df_dynamic_forecast['forecasted_traffic_volume'].reset_index(drop=True)
})

# Set the initial lagged values from the historical data
df_result.loc[0, 'lag_1'] = df.loc[df.index[-1], 'traffic_volume']
df_result.loc[0, 'lag_2'] = df.loc[df.index[-2], 'traffic_volume']
df_result.loc[0, 'lag_3'] = df.loc[df.index[-3], 'traffic_volume']


# Update the lagged values with the forecasted values in each step
for i in range(1, len(df_result)):
    df_result.loc[i, 'lag_1'] = df_result.loc[i - 1, 'forecasted_traffic_volume']
    df_result.loc[i, 'lag_2'] = df_result.loc[i - 1, 'lag_1']
    df_result.loc[i, 'lag_3'] = df_result.loc[i - 1, 'lag_2']

# Display the DataFrame
print(df_result.head(forecast_steps))


              date_time  actual_traffic_volume        lag_1        lag_2  \
0   2012-10-02 12:00:00                 5026.0  1328.000000  2194.000000   
1   2012-10-02 13:00:00                 4918.0  4976.855957  1328.000000   
2   2012-10-02 14:00:00                 5181.0  5063.290527  4976.855957   
3   2012-10-02 15:00:00                 5584.0  5193.843750  5063.290527   
4   2012-10-02 16:00:00                 6015.0  5636.869629  5193.843750   
..                  ...                    ...          ...          ...   
715 2012-11-03 03:00:00                  343.0   513.710022   680.264587   
716 2012-11-03 04:00:00                  383.0   376.779419   513.710022   
717 2012-11-03 05:00:00                  703.0   384.547302   376.779419   
718 2012-11-03 06:00:00                 1229.0  1117.305298   384.547302   
719 2012-11-03 07:00:00                 2057.0  1327.210571  1117.305298   

           lag_3  forecasted_traffic_volume  
0    2348.000000                4976.8559