<a href="https://colab.research.google.com/github/nikolasleeb/INFO523_FinalProject/blob/main/ConsumptionData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
import statsmodels.api as sm
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
import glob, os

# Attach Data

In [2]:
'''Load the raw consumption workbook from Google Drive (Colab) or from a local file.'''

# Set to True when running in Colab with the workbook stored in Google Drive.
# Leave False to read the workbook from the notebook's working directory.
USE_COLAB_DRIVE = False
DATA_FILENAME = 'ConsumptionData.xlsx'

if USE_COLAB_DRIVE:
    # Imported lazily: google.colab is only importable inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')

    # Search the mounted drive recursively for the workbook.
    matches = glob.glob(f'/content/drive/MyDrive/**/{DATA_FILENAME}', recursive=True)
    print("Found paths:", matches)
    if not matches:
        # Fail with an actionable message instead of an IndexError on matches[0].
        raise FileNotFoundError(
            f"{DATA_FILENAME} was not found anywhere under /content/drive/MyDrive"
        )
    path = matches[0]
else:
    # Local copy of the workbook, used for demonstration purposes.
    path = DATA_FILENAME

# Skip the first 2 rows of the sheet before parsing.
consumption = pd.read_excel(path, engine='openpyxl', skiprows=2)
consumption.head()

Unnamed: 0,Date,U.S. Product Supplied of Finished Motor Gasoline (Thousand Barrels),East Coast (PADD 1) Product Supplied of Finished Motor Gasoline (Thousand Barrels),Midwest (PADD 2) Product Supplied of Finished Motor Gasoline (Thousand Barrels),Gulf Coast (PADD 3) Product Supplied of Finished Motor Gasoline (Thousand Barrels),Rocky Mountain (PADD 4) Product Supplied of Finished Motor Gasoline (Thousand Barrels),West Coast (PADD 5) Product Supplied of Finished Motor Gasoline (Thousand Barrels)
0,1945-01-15,40310,,,,,
1,1945-02-15,38690,,,,,
2,1945-03-15,42511,,,,,
3,1945-04-15,45351,,,,,
4,1945-05-15,47515,,,,,


In [3]:
# Shared colour palette for every figure in this notebook, keyed by data series.
colors = dict(consumption="#9BC6E3")

In [4]:
''' Count the missing (null / NaN) entries in each column '''
consumption.isna().sum()

Date                                                                                        0
U.S. Product Supplied of Finished Motor Gasoline (Thousand Barrels)                         0
East Coast (PADD 1) Product Supplied of Finished Motor Gasoline (Thousand Barrels)        432
Midwest (PADD 2) Product Supplied of Finished Motor Gasoline (Thousand Barrels)           432
Gulf Coast (PADD 3) Product Supplied of Finished Motor Gasoline (Thousand Barrels)        432
Rocky Mountain (PADD 4) Product Supplied of Finished Motor Gasoline (Thousand Barrels)    432
West Coast (PADD 5) Product Supplied of Finished Motor Gasoline (Thousand Barrels)        432
dtype: int64

# Cleaning and Transforming

In [5]:
''' Verify the columns, datatypes, and number of entries'''
# info() prints its report directly: the index range plus, for each column,
# the non-null count and dtype.
consumption.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 967 entries, 0 to 966
Data columns (total 7 columns):
 #   Column                                                                                  Non-Null Count  Dtype         
---  ------                                                                                  --------------  -----         
 0   Date                                                                                    967 non-null    datetime64[ns]
 1   U.S. Product Supplied of Finished Motor Gasoline (Thousand Barrels)                     967 non-null    int64         
 2   East Coast (PADD 1) Product Supplied of Finished Motor Gasoline (Thousand Barrels)      535 non-null    float64       
 3   Midwest (PADD 2) Product Supplied of Finished Motor Gasoline (Thousand Barrels)         535 non-null    float64       
 4   Gulf Coast (PADD 3) Product Supplied of Finished Motor Gasoline (Thousand Barrels)      535 non-null    float64       
 5   Rocky Mountain (PADD 4)

In [6]:
''' Data Cleaning for Consumption Data '''

RAW_COLUMN = "Gasoline_Consumption_Thousand_Barrels"
GALLONS_COLUMN = "Gasoline_Consumption(Millions of Gallons)"

# Inclusive analysis window: January 1995 through December 2021.
WINDOW_START = "1995-01-01"
WINDOW_END = "2021-12-31"

# Guard against re-running this cell on an already-cleaned frame: without it the
# converted column would be renamed back to "thousand barrels" and converted a
# second time, silently shrinking every value.
if GALLONS_COLUMN not in consumption.columns:
    # Keep only the first two columns (date and U.S. total) and rename them.
    # .copy() gives an independent frame so the assignments below do not write
    # into a slice of the raw data.
    cleaned = consumption.iloc[:, :2].copy()
    cleaned.columns = ["Date", RAW_COLUMN]

    # Normalise every date to the first day of its month.
    cleaned["Date"] = pd.to_datetime(cleaned["Date"]).dt.to_period("M").dt.to_timestamp()

    # Keep only the rows inside the analysis window.
    in_window = (cleaned["Date"] >= WINDOW_START) & (cleaned["Date"] <= WINDOW_END)
    cleaned = cleaned.loc[in_window].copy()

    # thousand barrels -> barrels (x1000) -> gallons (x42) -> millions of gallons (/1e6)
    cleaned[GALLONS_COLUMN] = cleaned[RAW_COLUMN] * 1000 * 42 / 1_000_000

    # Drop the old column; the original row index is kept.
    consumption = cleaned.drop(columns=[RAW_COLUMN])

# Preview
consumption.head(10)

Unnamed: 0,Date,Gasoline_Consumption(Millions of Gallons)
600,1995-01-01,9326.478
601,1995-02-01,8797.362
602,1995-03-01,10139.346
603,1995-04-01,9640.68
604,1995-05-01,10277.652
605,1995-06-01,10357.62
606,1995-07-01,10269.588
607,1995-08-01,10659.642
608,1995-09-01,9810.36
609,1995-10-01,10130.694


# Summary Stats

In [7]:
# Descriptive statistics (count, mean, min, quartiles, max, std) for each column.
consumption.describe()

Unnamed: 0,Date,Gasoline_Consumption(Millions of Gallons)
count,324,324.0
mean,2008-06-16 02:04:26.666666752,11247.496296
min,1995-01-01 00:00:00,7390.866
25%,2001-09-23 12:00:00,10751.517
50%,2008-06-16 00:00:00,11363.016
75%,2015-03-08 18:00:00,11816.847
max,2021-12-01 00:00:00,12803.448
std,,803.788992


In [8]:
''' Skewness and kurtosis of the monthly consumption values '''

monthly_gallons = consumption["Gasoline_Consumption(Millions of Gallons)"]

# nan_policy='omit' makes both statistics ignore missing values.
cons_skew = skew(monthly_gallons, nan_policy='omit')
# kurtosis() is called with scipy's defaults, i.e. Fisher's (excess) definition.
cons_kurt = kurtosis(monthly_gallons, nan_policy='omit')

# Report both shape statistics on one line
print(f"Gasoline Consumption — Skewness: {cons_skew:.2f}, Kurtosis: {cons_kurt:.2f}")

Gasoline Consumption — Skewness: -0.84, Kurtosis: 1.19


In [9]:
''' View consumption date range and total months of data '''

# The values printed here should agree with the inclusive window applied in the
# cleaning cell (1995-01-01 through 2021-12-31).
# NOTE(review): earlier comments here said Apr 1994 - Dec 2024 "to match Prices
# data"; confirm which window the Prices dataset actually uses.
first_month = consumption["Date"].min()
last_month = consumption["Date"].max()

print("Earliest date:", first_month)
print("Latest date:", last_month)
print("Total months of data:", len(consumption))  # one row per month

Earliest date: 1995-01-01 00:00:00
Latest date: 2021-12-01 00:00:00
Total months of data: 324


In [10]:
''' Report the mean, standard deviation, and coefficient of variation of monthly consumption '''

monthly_gallons = consumption["Gasoline_Consumption(Millions of Gallons)"]

mean = monthly_gallons.mean()
std = monthly_gallons.std()

# Coefficient of variation (%): the standard deviation expressed as a percentage
# of the mean, i.e. how consistent the data is relative to its average level.
cv = (std / mean) * 100

print(f"Average monthly consumption: {mean:,.2f} million gallons")
print(f"Standard deviation: {std:,.2f} million gallons")
print(f"Coefficient of variation: {cv:.2f}%")

Average monthly consumption: 11,247.50 million gallons
Standard deviation: 803.79 million gallons
Coefficient of variation: 7.15%


In [11]:
''' Histogram of the distribution of monthly gasoline consumption '''

gallons_col = "Gasoline_Consumption(Millions of Gallons)"

# Build the histogram
figc1 = px.histogram(
    consumption,
    x=gallons_col,
    nbins=25,
    title="Distribution of Monthly Gasoline Consumption",
    labels={gallons_col: "Millions of Gallons"},
    color_discrete_sequence=[colors["consumption"]],
)

# Layout options collected in one place, then applied in a single call
histogram_layout = dict(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Millions of Gallons",
    yaxis_title="Frequency",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=700,
    width=1100,
)
figc1.update_layout(**histogram_layout)

# Solid bars with a thin black outline
figc1.update_traces(marker_line_color="black", marker_line_width=1, opacity=1)

figc1.show()

In [12]:
''' Outlier Detection '''

Z_THRESHOLD = 3  # rows with |Z| above this are flagged as potential outliers
gallons = consumption["Gasoline_Consumption(Millions of Gallons)"]

# Z-score of each month relative to the series mean and standard deviation.
# assign() returns a new frame, so nothing is written into a filtered slice and
# re-running the cell just recomputes the same column.
consumption = consumption.assign(
    Z_Consumption=(gallons - gallons.mean()) / gallons.std()
)

# Flag potential outliers (|Z| > 3)
consumption_outliers = consumption[np.abs(consumption["Z_Consumption"]) > Z_THRESHOLD]

print(f"Detected {len(consumption_outliers)} potential outliers in gasoline consumption.")
display(consumption_outliers[["Date", "Gasoline_Consumption(Millions of Gallons)"]])

# Visual outlier detection with a box plot. The figure height is set once, in
# update_layout; the earlier height=900 argument was always overridden by it.
figc2 = px.box(
    consumption,
    y="Gasoline_Consumption(Millions of Gallons)",
    title="Outlier Detection: U.S. Monthly Gasoline Consumption",
    points="all",
    width=1100,
    color_discrete_sequence=[colors["consumption"]]
)
figc2.update_layout(template="plotly_white", title_x=0.5, height=500)
figc2.show()

Detected 2 potential outliers in gasoline consumption.


Unnamed: 0,Date,Gasoline_Consumption(Millions of Gallons)
601,1995-02-01,8797.362
903,2020-04-01,7390.866


# Visualizations

In [13]:
''' Line chart of monthly consumption across the whole period '''

gallons_col = "Gasoline_Consumption(Millions of Gallons)"
axis_labels = {
    "Date": "Date",
    gallons_col: "Millions of Gallons",
}

# Build the time-series line
figc3 = px.line(
    consumption,
    x="Date",
    y=gallons_col,
    title="U.S. Monthly Gasoline Consumption (1995-2021)",
    labels=axis_labels,
    color_discrete_sequence=[colors["consumption"]],
)

# Layout options collected in one place, then applied in a single call
line_layout = dict(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Date",
    yaxis_title="Millions of Gallons",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=700,
    width=1200,
)
figc3.update_layout(**line_layout)

figc3.show()

# Export

In [14]:
# Output locations for the cleaned dataset
csv_path = "cleanedconsumption.csv"
pickle_path = "cleanedconsumption.pkl"

# CSV copy, written without the row index
consumption.to_csv(csv_path, index=False)

# Optional pickle copy for faster loading
consumption.to_pickle(pickle_path)