In [16]:
# %% imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# SARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [17]:
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,02/10/2012 09:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,02/10/2012 10:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,02/10/2012 11:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,02/10/2012 12:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,02/10/2012 13:00,4918


Dataset Description – Metro Interstate Traffic Volume

This dataset contains hourly traffic volume records from a highway station, combined with weather and holiday information. Each row represents a traffic observation at a given date and time.

🔑 Features

holiday: US holiday name if applicable (e.g., Columbus Day), otherwise None.

temp: Temperature in Kelvin (convertible to °C for interpretability).

rain_1h: Amount of rain in the past hour (mm). Mostly zero, but sometimes >0.

snow_1h: Amount of snow in the past hour (mm). Mostly zero.

clouds_all: Cloudiness percentage (0 = clear sky, 100 = fully cloudy).

weather_main: General weather condition (e.g., Clouds, Clear, Rain, Snow, Mist, Haze, Thunderstorm).

weather_description: More detailed description of weather (e.g., “few clouds”, “light rain”).

date_time: Timestamp of the observation (hourly frequency).

traffic_volume (target variable): Number of cars that passed the station during that hour.

⚠️ Observed Patterns & Quirks

Holiday entries

Named holidays appear in the holiday column.

Some holidays appear multiple times with different weather records (e.g., Labor Day with Rain and Thunderstorm).

Weather conditions

Weather descriptions sometimes vary in casing (sky is clear vs Sky is Clear).

Overlapping conditions exist (e.g., Rain + Thunderstorm).

Traffic volume distribution

Traffic ranges from very low (~0–500) to very high (>7000 in the full dataset).

Peak traffic is usually on workdays during rush hours.

Holidays and severe weather tend to lower traffic.

Temperature (Kelvin)

Winter values ≈ 260–275 K (~ -13°C to 2°C).

Summer values ≈ 290–300 K (~ 17°C to 27°C).

Matches seasonal expectations.

Date range & frequency

Data spans 2012 to 2018.

Frequency: hourly observations.

🧠 Intuition for Modeling

Time series cycles: Daily and weekly patterns strongly influence traffic (rush hour, weekends).

Weather: Rain, snow, and cloudy days tend to lower traffic.

Holidays: Reduce commuting traffic but may increase leisure travel traffic.

Duplicates with different weather: Not errors, but extra context.

📌 In short:
This dataset is like a logbook of city traffic, with each row saying:
"On this date and time, it was X°C, weather was Y, holiday was Z, and N cars passed."

✅ With this understanding, we can now preprocess the data:

Convert temp to °C

Clean holiday column (fill missing with "No Holiday")

Standardize weather text

Clip extreme rain/snow outliers

Extract datetime features (hour, day of week, month, etc.)

Set date_time as index for time series modeling

In [18]:
df['holiday'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 48204 entries, 0 to 48203
Series name: holiday
Non-Null Count  Dtype 
--------------  ----- 
61 non-null     object
dtypes: object(1)
memory usage: 376.7+ KB


In [19]:
# df['date_time'] = pd.to_datetime(df['date_time'], dayfirst=True, errors='coerce')

In [20]:
df.index

RangeIndex(start=0, stop=48204, step=1)

In [21]:
df.columns

Index(['holiday', 'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_main',
       'weather_description', 'date_time', 'traffic_volume'],
      dtype='object')

In [22]:
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,02/10/2012 09:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,02/10/2012 10:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,02/10/2012 11:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,02/10/2012 12:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,02/10/2012 13:00,4918


In [23]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB
None


In [24]:
df.describe()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume
count,48204.0,48204.0,48204.0,48204.0,48204.0
mean,281.20587,0.334264,0.000222,49.362231,3259.818355
std,13.338232,44.789133,0.008168,39.01575,1986.86067
min,0.0,0.0,0.0,0.0,0.0
25%,272.16,0.0,0.0,1.0,1193.0
50%,282.45,0.0,0.0,64.0,3380.0
75%,291.806,0.0,0.0,90.0,4933.0
max,310.07,9831.3,0.51,100.0,7280.0


### **Preprocessing**

In [25]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess traffic dataset step by step."""
    
    # 1. Parse datetime
    df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
    
    # 2. Handle temperature (convert from Kelvin to Celsius)
    df['temp_C'] = df['temp'] - 273.15
    
    # 3. Cap extreme outliers for rain_1h (values > 50mm/hr are unrealistic)
    df['rain_1h'] = np.where(df['rain_1h'] > 50, 50, df['rain_1h'])
    
    # 4. Snow stays as is (values are very small, no extreme issue)
    
    # 5. Ensure clouds_all is within 0–100 range
    df['clouds_all'] = df['clouds_all'].clip(0, 100)
    
    # 6. Extract datetime features
    df['hour'] = df['date_time'].dt.hour
    df['dayofweek'] = df['date_time'].dt.dayofweek  # 0=Mon, 6=Sun
    df['month'] = df['date_time'].dt.month
    df['year'] = df['date_time'].dt.year
    df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)
    
    # 7. Optionally set datetime index (useful for time series models)
    df = df.set_index('date_time').sort_index()
    
    return df


### **Encoding**

In [32]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def encode_cyclical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Encode cyclical time features like hour and dayofweek using sine/cosine."""
    
    # Hour of day (24-hour cycle)
    if 'hour' in df.columns:
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    # Day of week (7-day cycle)
    if 'dayofweek' in df.columns:
        df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
        df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    
    return df


def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """OneHotEncode categorical features."""
    
    # Handle missing holiday values
    if 'holiday' in df.columns:
        df['holiday'] = df['holiday'].fillna('None')
    
    # Detect all categorical (object or category) columns automatically
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    if categorical_cols:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded = encoder.fit_transform(df[categorical_cols])
        
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(categorical_cols),
            index=df.index
        )
        
        # Drop original categorical columns & add encoded ones
        df = df.drop(columns=categorical_cols)
        df = pd.concat([df, encoded_df], axis=1)
    
    return df


def preprocess_pipeline(df: pd.DataFrame):
    """Run full preprocessing: cyclical, categorical, scaling."""
    
    # Step 1: Cyclical encoding
    df = encode_cyclical_features(df)
    
    # Step 2: Categorical encoding
    df = encode_categoricals(df)
    
    # Step 3: Separate target
    y = df['traffic_volume']
    X = df.drop(columns=['traffic_volume'])
    
    # Step 4: Scale features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y, scaler, X.columns


***calling the fxns***

In [None]:
# # 1. Basic preprocessing (temperature, rain, datetime, etc.)
# df_clean = preprocess(df)

# # 2. Encode cyclical time features
# df_clean = encode_cyclical_features(df_clean)

# # 3. Encode categorical features (holiday, weather_main)
# df_clean = encode_categoricals(df_clean)

# df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume,temp_C,hour,dayofweek,month,year,is_weekend
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-02-10 09:00:00,5545,15.13,9.0,4.0,2.0,2012.0,0
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-02-10 10:00:00,4516,16.21,10.0,4.0,2.0,2012.0,0
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-02-10 11:00:00,4767,16.43,11.0,4.0,2.0,2012.0,0
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-02-10 12:00:00,5026,16.98,12.0,4.0,2.0,2012.0,0
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-02-10 13:00:00,4918,17.99,13.0,4.0,2.0,2012.0,0
