In [1]:
# If the libraries are not yet installed, they can be installed using conda commands similar to the below
# %conda install numpy
# %conda install pandas
# %conda install pandasql 
# %conda install openpyxl

# Something like the following may also work if the above does not
# import sys
# !conda install --yes --prefix {sys.prefix} numpy
# !conda install --yes --prefix {sys.prefix} pandas
# !conda install --yes --prefix {sys.prefix} pandasql
# !conda install --yes --prefix {sys.prefix} openpyxl

# To install a specific version, add the version to the install command
# E.g., %conda install numpy=1.20.3

# If all else fails, use pip or follow additional advice such as found at
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

# If you plan to use pip (especially if you are not working within a specified conda environment), 
# the pip commands might look like:
# pip install numpy
# pip install pandas
# pip install pandasql
# pip install openpyxl

# To install a specific version, add the version to the pip install command
# E.g., pip install numpy==1.20.3

In [2]:
# Load necessary libraries
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
import pandasql as psql
import glob
import time
import os

In [3]:
# Create intermediate and processed directories for later use.
# os.makedirs(..., exist_ok=True) also creates any missing parent folders and
# does nothing when the directory already exists, so this cell can be re-run safely.
for output_dir in ("Data/Unseen Sensor/Intermediate", "Data/Unseen Sensor/Processed"):
    os.makedirs(output_dir, exist_ok=True)

# Functions to Read in Raw Data and Impute with Linear Interpolation

In [4]:
def full_time(df, interval="15min", min_date=None, max_date=None):
    """Build a gap-free frame of timestamps with calendar helper columns.

    The range runs from ``min_date`` to ``max_date`` in steps of ``interval``;
    whichever bound is not supplied falls back to the earliest / latest value
    of ``df["timestamp"]``. The result is meant to be compared against real
    data to determine which timestamps have missing observations.

    Returns a DataFrame with the columns ``timestamp``, ``date``,
    ``day_of_week`` and ``day_of_year``.
    """
    # Fall back to the data's own extremes for any bound that was not supplied
    range_start = min_date if min_date else df["timestamp"].min()
    range_end = max_date if max_date else df["timestamp"].max()

    # One row per step of the requested frequency (typically 15min or 5min)
    stamps = pd.date_range(start=range_start, end=range_end, freq=interval)
    date_df = pd.DataFrame({"timestamp": stamps})

    # The first 10 characters of the timestamp text are the YYYY-MM-DD date
    day_text = date_df["timestamp"].astype("string").str[:10]
    date_df["date"] = pd.to_datetime(day_text)

    # Derive day of week and day of year from the date column
    calendar = date_df["date"].dt
    date_df["day_of_week"] = calendar.dayofweek
    date_df["day_of_year"] = calendar.dayofyear

    return date_df

In [5]:
def read_highways_england(fname, min_date=None, max_date=None, interval="15min"):
    """Read one highway sensor csv file, fill gaps in its time series and interpolate.

    Parameters
    ----------
    fname : str
        Path of the raw csv file. It must contain the columns "Site Name",
        "Report Date", "Time Period Ending", "Time Interval", "Avg mph" and
        "Total Volume".
    min_date, max_date : str or timestamp, optional
        Bounds of the full timestamp range. When omitted, the earliest / latest
        timestamp found in the file is used (see ``full_time``).
    interval : str, default "15min"
        Spacing of the full timestamp range that the data is joined onto.

    Returns
    -------
    pandas.DataFrame
        One row per timestamp of the full range with the columns site_name,
        day_of_week, date, day_of_year, timestamp, interval_of_day, avg_mph,
        total_volume, missing_speed and missing_volume.
    """

    # Read file into Pandas df
    raw = pd.read_csv(fname)

    # Grab relevant columns; take an explicit copy so the column assignments
    # below do not write to a slice of the full frame (SettingWithCopyWarning)
    raw = raw[["Site Name", "Report Date", "Time Period Ending", "Time Interval", "Avg mph", "Total Volume"]].copy()

    # Re-format date field and cast to string
    raw["Date"] = pd.to_datetime(raw["Report Date"], format='%d/%m/%Y 00:00:00').astype("string")

    # Grab the timestamp of the time-period in the hour
    raw["Time Period Ending"] = raw["Time Period Ending"].astype("string")

    # Create a true timestamp which includes both date and hour and minutes
    raw["Timestamp"] = pd.to_datetime(raw["Date"] + " " + raw["Time Period Ending"])

    # Subset columns and rename to include _ to make columns easier to work with
    observed = raw[["Site Name", "Timestamp", "Time Interval", "Avg mph", "Total Volume"]]\
    .rename(columns={"Site Name": "site_name",
                     "Timestamp":"timestamp",
                     "Time Interval": "interval_of_day",
                     "Avg mph": "avg_mph",
                     "Total Volume": "total_volume"})

    # Compute dates for left join
    dates = full_time(observed, interval=interval, min_date=min_date, max_date=max_date)

    # Merge full date list with actual data; timestamps without an observation
    # get nulls in every data column
    df = dates.merge(observed, how="left", on="timestamp")

    # Fill the site name on the gap rows. Nulls are dropped before picking the
    # name: after the left join the first row can itself be a gap row, in which
    # case the first unique value would be NaN and nothing would get filled.
    known_sites = df["site_name"].dropna().unique()
    if len(known_sites) > 0:
        df["site_name"] = df["site_name"].fillna(known_sites[0])

    # Use pandasql to impute the 'interval_of_day' field: the 0-based rank of
    # each timestamp within its date replaces the value read from the file
    interval_of_day_impute = """
    SELECT site_name,
           day_of_week,
           date(date) AS date,
           day_of_year,
           timestamp,
           DENSE_RANK() OVER (PARTITION BY DATE ORDER BY timestamp) - 1 AS interval_of_day,
           avg_mph,
           total_volume
    FROM df
    """
    # The query reads the local variable named `df`, which is found via locals()
    df = psql.sqldf(interval_of_day_impute, locals())

    # Create field with T/F if speed data is missing
    df["missing_speed"] = df["avg_mph"].isnull()

    # Create field with T/F if volume data is missing
    df["missing_volume"] = df["total_volume"].isnull()

    # Use linear interpolation to fill in nulls (flags above were captured first)
    df["avg_mph"] = df["avg_mph"].interpolate()
    df["total_volume"] = df["total_volume"].interpolate()

    return df

# Read in Raw Data, Impute, and Write to New Directory

In [6]:
# Bounds of the full timestamp range used when imputing the 2019 data.
# Both data resolutions share the same final timestamp of the year.
max_date = "2019-12-31 23:59:00"

# First timestamp of the year for the 15-min data
min_date_15 = "2019-01-01 00:14:00"

# First timestamp of the year for the 5-min data
min_date_5 = "2019-01-01 00:04:00"

In [7]:
# Initialize a counter for the number of raw files handled by the processing loop
c = 0

In [8]:
# Read in all raw highways england data files, impute, and write to the intermediate directory of the data folder

# Reset the counter here so that re-running this cell on its own does not double count
c = 0

# sorted() makes the processing order deterministic (glob returns files in arbitrary order)
for fname in sorted(glob.glob("Data/Unseen Sensor/Raw/*.csv")):
    print("Reading {}".format(fname))

    # Take the file name without directory or extension via os.path, so it is
    # extracted correctly whichever path separator glob returns
    stem = os.path.splitext(os.path.basename(fname))[0]
    fname_new = "Data/Unseen Sensor/Intermediate/{}_Intermediate.csv".format(stem)

    # Impute over the full 15-min timestamp range of the year
    df = read_highways_england(fname, min_date_15, max_date)

    df.to_csv(fname_new, index=False)

    c += 1

Reading Data/Unseen Sensor/Raw/A19-9336-1_Northbound_2019.csv
Reading Data/Unseen Sensor/Raw/A66-9521-1_Westbound.csv
Reading Data/Unseen Sensor/Raw/M40-7048-2_Southbound.csv
Reading Data/Unseen Sensor/Raw/M62-2056A_Eastbound.csv


In [9]:
# Check the counter: it is incremented once per raw csv file processed by the loop
c

4