# Libraries

In [41]:
import pandas as pd
import numpy as np

# Import

In [42]:
fueleconomy = pd.read_pickle("../FuelEconomy/cleanedfueleconomy.pkl")
registrations = pd.read_pickle("../VehicleRegistrations/cleanedregistrations.pkl")
consumption = pd.read_pickle("../FuelConsumption/cleanedconsumption.pkl")

In [43]:
''' Drop Unneeded Columns and Rename Needed Columns if Needed'''
fueleconomy = fueleconomy.iloc[:, :6]
registrations = registrations.iloc[:, :4]
consumption = consumption.iloc[:, :2]
consumption.columns = ["Date", "Gasoline_Consumption(Millions of Gallons)"]

In [44]:
consumption.info()
print("\n")
fueleconomy.info()
print("\n")
registrations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 324 entries, 600 to 923
Data columns (total 2 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   Date                                       324 non-null    datetime64[ns]
 1   Gasoline_Consumption(Millions of Gallons)  324 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 7.6 KB


<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, 7 to 33
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Year          27 non-null     datetime64[ns]
 1   Car           27 non-null     float64       
 2   Car SUV       27 non-null     float64       
 3   Pickup truck  27 non-null     float64       
 4   Van           27 non-null     float64       
 5   Truck SUV     27 non-null     float64       
dtypes: datetime64[ns](1)

In [45]:
# Convert 'Date' to Year and extract the year
consumption['Year'] = consumption['Date'].dt.year

# Check which years are present and how many rows per year
print(sorted(consumption['Year'].unique()))
print(consumption['Year'].value_counts().sort_index())

# Group by Year and sum the gasoline consumption
annual_consumption = (
    consumption
    .groupby('Year', as_index=False)['Gasoline_Consumption(Millions of Gallons)']
    .sum()
    .rename(
        columns={
            'Gasoline_Consumption(Millions of Gallons)':
            'AnnualConsumption (Millions of Gallons)'
        }
    )
)

[np.int32(1995), np.int32(1996), np.int32(1997), np.int32(1998), np.int32(1999), np.int32(2000), np.int32(2001), np.int32(2002), np.int32(2003), np.int32(2004), np.int32(2005), np.int32(2006), np.int32(2007), np.int32(2008), np.int32(2009), np.int32(2010), np.int32(2011), np.int32(2012), np.int32(2013), np.int32(2014), np.int32(2015), np.int32(2016), np.int32(2017), np.int32(2018), np.int32(2019), np.int32(2020), np.int32(2021)]
Year
1995    12
1996    12
1997    12
1998    12
1999    12
2000    12
2001    12
2002    12
2003    12
2004    12
2005    12
2006    12
2007    12
2008    12
2009    12
2010    12
2011    12
2012    12
2013    12
2014    12
2015    12
2016    12
2017    12
2018    12
2019    12
2020    12
2021    12
Name: count, dtype: int64


In [46]:
consumption.info()
print("\n")
fueleconomy.info()
print("\n")
registrations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 324 entries, 600 to 923
Data columns (total 3 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   Date                                       324 non-null    datetime64[ns]
 1   Gasoline_Consumption(Millions of Gallons)  324 non-null    float64       
 2   Year                                       324 non-null    int32         
dtypes: datetime64[ns](1), float64(1), int32(1)
memory usage: 8.9 KB


<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, 7 to 33
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Year          27 non-null     datetime64[ns]
 1   Car           27 non-null     float64       
 2   Car SUV       27 non-null     float64       
 3   Pickup truck  27 non-null     float64       
 4   Van           27 non-null     f

# Merge

# in progress 

In [47]:
''' Merge consumption and prices data on Date '''

# Normalize and align Date columns just in case
consumption["Year"] = pd.to_datetime(consumption["Year"]).dt.tz_localize(None).dt.normalize()
fueleconomy["Year"] = pd.to_datetime(fueleconomy["Year"]).dt.tz_localize(None).dt.normalize()
registrations["Year"] = pd.to_datetime(registrations["Year"]).dt.tz_localize(None).dt.normalize()

# Merge consumption and fueleconomy data
merged_df = pd.merge(consumption, fueleconomy, on="Year", how="inner")
merged_df = pd.merge(merged_df, registrations, on="Year", how="inner")

# ------^  edited ^------#




'''
# Keep only relevant columns
merged_df = merged_df[[
    "Date",
    "GasolinePrices",
    "DieselPrices",
    "Gasoline_Consumption(Millions of Gallons)"
]]

# Rename for clarity
merged_df.rename(columns={
    "Dollars_Per_Gallon": "Nominal_Price_USD",
    "price_adjusted": "Inflation_Adjusted_Price_USD"
}, inplace=True)

# Round values for cleaner display
merged_df["Nominal_Price_USD"] = merged_df["Nominal_Price_USD"].round(3)
merged_df["Inflation_Adjusted_Price_USD"] = merged_df["Inflation_Adjusted_Price_USD"].round(3)

# Add time breakdowns
merged_df["Year"] = merged_df["Date"].dt.year
merged_df["Month"] = merged_df["Date"].dt.month_name()

'''

# Verify output
print(merged_df.tail(10))

Empty DataFrame
Columns: [Date, Gasoline_Consumption(Millions of Gallons), Year, Car, Car SUV, Pickup truck, Van, Truck SUV, Automobile_Registrations, Motorcycle_Registrations, Licensed_Drivers]
Index: []
