<a href="https://colab.research.google.com/github/nikolasleeb/INFO523_FinalProject/blob/main/FuelEconomy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [60]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
import glob, os

# Attach Data

In [61]:
'''Connects to Google Drive to load in data'''

# NOTE(review): the Google Drive path below is wrapped in a triple-quoted string, so it is
# inert; only the local read_excel further down actually runs. The title string above
# describes the disabled path.
'''
# Connect to and mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Find the dataset in google drive and assign path to variable called 'Match'
matches = glob.glob('/content/drive/MyDrive/**/FuelEconomy.xlsx', recursive=True)
print("Found paths:", matches)

# Using 'Match' load in the dataset using the path
path = matches[0]
fueleconomy = pd.read_excel(path, engine='openpyxl')
fueleconomy.head()
'''

# '''
# Load dataset from local file for demonstration purposes
# (relative path: FuelEconomy.xlsx is looked up in the current working directory)
fueleconomy = pd.read_excel('FuelEconomy.xlsx', engine='openpyxl')
fueleconomy.head()
# '''


Unnamed: 0,"Fuel economy, mpg",1980,1985,1990,1991,1992,1993,1994,1995,1996,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Car,20.0121,23.01593,23.33429,23.42825,23.06995,23.45914,23.273,23.44355,23.33458,...,27.59971,28.35773,28.38531,29.00986,29.2051,30.18222,30.76299,30.87589,31.73063,31.72793
1,Car SUV,14.57638,20.08032,18.84529,18.20849,17.84349,17.0292,18.03998,17.8118,18.43161,...,23.29446,24.3448,24.43912,25.13993,26.2064,26.14284,27.32779,27.4895,28.37686,30.50092
2,Pickup truck,16.51884,18.20137,17.41153,18.18688,17.48087,17.58531,17.44093,16.89032,17.14796,...,17.20749,17.47393,18.0377,18.80861,18.92237,18.91763,19.10974,19.02718,19.19349,19.39958
3,Van,14.13642,16.54583,17.84376,17.91037,17.92191,18.20216,17.83757,18.078,18.33961,...,21.26883,21.05797,21.26902,21.78246,21.66192,22.23448,22.75932,22.42918,23.35398,26.20616
4,Truck SUV,13.18631,16.53857,16.43622,16.73325,16.20331,16.29119,16.00656,15.99832,16.22709,...,20.00681,20.82639,21.59169,21.94241,22.21006,22.3386,23.12897,23.48195,23.7501,23.99702


In [62]:
# Shared colour used by every fuel-economy histogram and box plot
colors = dict(fueleconomy="#C5947C")

In [63]:
''' Count missing values in every column '''
fueleconomy.isna().sum()

Fuel economy, mpg    0
1980                 0
1985                 0
1990                 0
1991                 0
1992                 0
1993                 0
1994                 0
1995                 0
1996                 0
1997                 0
1998                 0
1999                 0
2000                 0
2001                 0
2002                 0
2003                 0
2004                 0
2005                 0
2006                 0
2007                 0
2008                 0
2009                 0
2010                 0
2011                 0
2012                 0
2013                 0
2014                 0
2015                 0
2016                 0
2017                 0
2018                 0
2019                 0
2020                 0
2021                 0
dtype: int64

# Cleaning and Transforming

In [64]:
''' Reshape the table so that each year is a row and each vehicle type is a column '''

# Name of the first column; its values become the column headers after the transpose
id_col = fueleconomy.columns[0]

# Round the numeric values to two decimals before reshaping
fueleconomy = fueleconomy.round(decimals=2)

fueleconomy = (
    fueleconomy
    .set_index(id_col)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Year'})
)

fueleconomy.head()

"Fuel economy, mpg",Year,Car,Car SUV,Pickup truck,Van,Truck SUV
0,1980,20.01,14.58,16.52,14.14,13.19
1,1985,23.02,20.08,18.2,16.55,16.54
2,1990,23.33,18.85,17.41,17.84,16.44
3,1991,23.43,18.21,18.19,17.91,16.73
4,1992,23.07,17.84,17.48,17.92,16.2


In [65]:
''' Verify the columns, datatypes, and number of entries '''
# Sanity check on the transposed frame: info() prints column names, non-null counts and dtypes
fueleconomy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          34 non-null     int64  
 1   Car           34 non-null     float64
 2   Car SUV       34 non-null     float64
 3   Pickup truck  34 non-null     float64
 4   Van           34 non-null     float64
 5   Truck SUV     34 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 1.7 KB


In [66]:
''' Data Cleaning for fueleconomy Data '''

# Convert the Year column to datetime; each year is parsed with the '%Y' format
fueleconomy["Year"] = pd.to_datetime(fueleconomy["Year"], format='%Y')

# Filter to keep data where the year is between 1995 and 2021 (inclusive).
# .copy() makes the result an independent frame, so cells that later add columns to
# `fueleconomy` do not write into a slice of the unfiltered data (SettingWithCopyWarning).
fueleconomy = fueleconomy[
    (fueleconomy["Year"].dt.year >= 1995) &
    (fueleconomy["Year"].dt.year <= 2021)
].copy()

# Preview the first 10 rows
fueleconomy.head(10)

"Fuel economy, mpg",Year,Car,Car SUV,Pickup truck,Van,Truck SUV
7,1995-01-01,23.44,17.81,16.89,18.08,16.0
8,1996-01-01,23.33,18.43,17.15,18.34,16.23
9,1997-01-01,23.37,19.23,16.84,18.18,16.13
10,1998-01-01,23.37,18.24,17.0,18.7,16.16
11,1999-01-01,23.0,18.51,16.29,18.28,16.07
12,2000-01-01,22.91,17.89,16.65,18.61,16.01
13,2001-01-01,23.05,18.83,15.95,18.04,16.41
14,2002-01-01,23.08,19.3,15.75,18.7,16.31
15,2003-01-01,23.28,19.91,16.08,18.98,16.42
16,2004-01-01,23.14,19.97,15.74,19.16,16.47


# Summary Stats

In [67]:
# Descriptive statistics for every column of the filtered frame
fueleconomy.describe()

"Fuel economy, mpg",Year,Car,Car SUV,Pickup truck,Van,Truck SUV
count,27,27.0,27.0,27.0,27.0,27.0
mean,2008-01-01 08:53:20,26.011481,22.313333,17.261852,20.31037,19.038148
min,1995-01-01 00:00:00,22.91,17.81,15.74,18.04,16.0
25%,2001-07-02 12:00:00,23.315,19.265,16.23,18.7,16.36
50%,2008-01-01 00:00:00,24.27,21.19,16.9,19.82,18.19
75%,2014-07-02 12:00:00,28.7,24.79,18.425,21.465,21.765
max,2021-01-01 00:00:00,31.73,30.5,19.4,26.21,24.0
std,,3.176431,3.643223,1.211652,1.975641,2.878855


In [68]:
''' Report the skewness and kurtosis of each vehicle type's fuel economy '''

# Vehicle-type columns to summarise
columns_to_analyze = ['Car', 'Car SUV', 'Pickup truck', 'Van', 'Truck SUV']

for col in columns_to_analyze:
    # Warn and move on when a requested column is absent
    if col not in fueleconomy.columns:
        print(f"Warning: Column '{col}' not found in the DataFrame.")
        continue

    # NaN values, if any, are ignored by both statistics
    col_skew = skew(fueleconomy[col], nan_policy='omit')
    col_kurt = kurtosis(fueleconomy[col], nan_policy='omit')
    print(f"{col} — Skewness: {col_skew:.2f}, Kurtosis: {col_kurt:.2f}")

Car — Skewness: 0.59, Kurtosis: -1.22
Car SUV — Skewness: 0.56, Kurtosis: -0.81
Pickup truck — Skewness: 0.52, Kurtosis: -1.12
Van — Skewness: 1.04, Kurtosis: 0.96
Truck SUV — Skewness: 0.41, Kurtosis: -1.37


In [69]:
''' View fueleconomy date range and total years of data '''

print("Earliest date:", fueleconomy["Year"].min())
print("Latest date:", fueleconomy["Year"].max())
# Each row is one year of data, so the row count is a number of years (the label used to say months)
print("Total years of data:", len(fueleconomy))

Earliest date: 1995-01-01 00:00:00
Latest date: 2021-01-01 00:00:00
Total months of data: 27


# Cars Summary

In [70]:
''' Summarise Car fuel economy: mean, standard deviation, and coefficient of variation '''

mean = fueleconomy["Car"].mean()
std = fueleconomy["Car"].std()
# Coefficient of variation: standard deviation as a percentage of the mean
cv = 100 * (std / mean)

print(
    f"Average fueleconomy for Cars: {mean:,.2f}",
    f"Standard deviation: {std:,.2f}",
    f"Coefficient of variation: {cv:.2f}% \n",
    sep="\n",
)

Average fueleconomy for Cars: 26.01
Standard deviation: 3.18
Coefficient of variation: 12.21% 



In [71]:
''' Histogram of the distribution of Car fuel economy '''

# Build the histogram from the Car column
figfe1 = px.histogram(
    data_frame=fueleconomy,
    x="Car",
    title="Distribution of Cars Fuel Economy",
    nbins=15,
    color_discrete_sequence=[colors["fueleconomy"]],
)

# Layout: white template, centred title, light-gray grid on both axes
figfe1.update_layout(
    height=500,
    hovermode="x unified",
    template="plotly_white",
    title_x=0.5,
    xaxis={"showgrid": True, "gridcolor": "lightgray"},
    yaxis={"showgrid": True, "gridcolor": "lightgray"},
    xaxis_title="MPG of Cars",
    yaxis_title="Frequency",
)

# Fully opaque bars with a thin black outline
figfe1.update_traces(
    opacity=1,
    marker={"line": {"color": "black", "width": 1}},
)

figfe1.show()

In [72]:
''' Outlier Detection for Car Fuel Economy using Z-Score Method and Box Plot '''

# Vehicle-type column analysed in this cell
col_name = "Car"

# Z-score of every row: distance from the column mean in units of its standard deviation.
# Held in a standalone Series rather than written into `fueleconomy`, so the shared frame is
# not mutated by this cell and no scratch column leaks into later steps.
z_scores = (
    (fueleconomy[col_name] - fueleconomy[col_name].mean())
    / fueleconomy[col_name].std()
)

# Flag potential outliers (|Z| > 3); the scores are attached only to the frame being displayed
fueleconomy_outliers = fueleconomy.assign(Z_fueleconomy=z_scores)[z_scores.abs() > 3]

print(f"Detected {len(fueleconomy_outliers)} potential outliers in '{col_name}' fueleconomy.")
display(fueleconomy_outliers[["Year", col_name, "Z_fueleconomy"]])

# Visual outlier detection with a box plot; points="all" draws every observation beside the box
figfe2 = px.box(
    fueleconomy,
    y=col_name,
    title=f"Outlier Detection: {col_name} Fuel Economy",
    points="all",
    color_discrete_sequence=[colors["fueleconomy"]]
)
figfe2.update_layout(template="plotly_white", title_x=0.5, height=500)
figfe2.show()

Detected 0 potential outliers in 'Car' fueleconomy.


"Fuel economy, mpg",Year,Car,Z_fueleconomy


# Car SUV Summary

In [73]:
''' Summarise Car SUV fuel economy: mean, standard deviation, and coefficient of variation '''

mean = fueleconomy["Car SUV"].mean()
std = fueleconomy["Car SUV"].std()
# Coefficient of variation: standard deviation as a percentage of the mean
cv = 100 * (std / mean)

print(
    f"Average fueleconomy for Car SUV's: {mean:,.2f}",
    f"Standard deviation: {std:,.2f}",
    f"Coefficient of variation: {cv:.2f}% \n",
    sep="\n",
)

Average fueleconomy for Car SUV's: 22.31
Standard deviation: 3.64
Coefficient of variation: 16.33% 



In [74]:
''' Histogram of the distribution of Car SUV fuel economy '''

# Build the histogram from the Car SUV column
figfe3 = px.histogram(
    data_frame=fueleconomy,
    x="Car SUV",
    title="Distribution of Car SUV's Fuel Economy",
    nbins=14,
    color_discrete_sequence=[colors["fueleconomy"]],
)

# Layout: white template, centred title, light-gray grid on both axes
figfe3.update_layout(
    height=500,
    hovermode="x unified",
    template="plotly_white",
    title_x=0.5,
    xaxis={"showgrid": True, "gridcolor": "lightgray"},
    yaxis={"showgrid": True, "gridcolor": "lightgray"},
    xaxis_title="MPG of Car SUV's",
    yaxis_title="Frequency",
)

# Fully opaque bars with a thin black outline
figfe3.update_traces(
    opacity=1,
    marker={"line": {"color": "black", "width": 1}},
)

figfe3.show()

In [75]:
''' Outlier Detection for Car SUV Fuel Economy using Z-Score Method and Box Plot '''

# Vehicle-type column analysed in this cell
col_name = "Car SUV"

# Z-score of every row: distance from the column mean in units of its standard deviation.
# Held in a standalone Series rather than written into `fueleconomy`, so the shared frame is
# not mutated by this cell and no scratch column leaks into later steps.
z_scores = (
    (fueleconomy[col_name] - fueleconomy[col_name].mean())
    / fueleconomy[col_name].std()
)

# Flag potential outliers (|Z| > 3); the scores are attached only to the frame being displayed
fueleconomy_outliers = fueleconomy.assign(Z_fueleconomy=z_scores)[z_scores.abs() > 3]

print(f"Detected {len(fueleconomy_outliers)} potential outliers in '{col_name}' fueleconomy.")
display(fueleconomy_outliers[["Year", col_name, "Z_fueleconomy"]])

# Visual outlier detection with a box plot; points="all" draws every observation beside the box
figfe4 = px.box(
    fueleconomy,
    y=col_name,
    title=f"Outlier Detection: {col_name} Fuel Economy",
    points="all",
    color_discrete_sequence=[colors["fueleconomy"]]
)
figfe4.update_layout(template="plotly_white", title_x=0.5, height=500)
figfe4.show()

Detected 0 potential outliers in 'Car SUV' fueleconomy.


"Fuel economy, mpg",Year,Car SUV,Z_fueleconomy


# Pickup Truck Summary

In [76]:
''' Summarise Pickup truck fuel economy: mean, standard deviation, and coefficient of variation '''

mean = fueleconomy["Pickup truck"].mean()
std = fueleconomy["Pickup truck"].std()
# Coefficient of variation: standard deviation as a percentage of the mean
cv = 100 * (std / mean)

print(
    f"Average fueleconomy for Pickup trucks: {mean:,.2f}",
    f"Standard deviation: {std:,.2f}",
    f"Coefficient of variation: {cv:.2f}% \n",
    sep="\n",
)

Average fueleconomy for Pickup trucks: 17.26
Standard deviation: 1.21
Coefficient of variation: 7.02% 



In [77]:
''' Histogram of the distribution of Pickup truck fuel economy '''

# Build the histogram from the Pickup truck column
figfe5 = px.histogram(
    data_frame=fueleconomy,
    x="Pickup truck",
    title="Distribution of Pickup Truck's Fuel Economy",
    nbins=5,
    color_discrete_sequence=[colors["fueleconomy"]],
)

# Layout: white template, centred title, light-gray grid on both axes
figfe5.update_layout(
    height=500,
    hovermode="x unified",
    template="plotly_white",
    title_x=0.5,
    xaxis={"showgrid": True, "gridcolor": "lightgray"},
    yaxis={"showgrid": True, "gridcolor": "lightgray"},
    xaxis_title="MPG of Pickup truck's",
    yaxis_title="Frequency",
)

# Fully opaque bars with a thin black outline
figfe5.update_traces(
    opacity=1,
    marker={"line": {"color": "black", "width": 1}},
)

figfe5.show()

In [78]:
''' Outlier Detection for Pickup Truck Fuel Economy using Z-Score Method and Box Plot '''

# Vehicle-type column analysed in this cell
col_name = "Pickup truck"

# Z-score of every row: distance from the column mean in units of its standard deviation.
# Held in a standalone Series rather than written into `fueleconomy`, so the shared frame is
# not mutated by this cell and no scratch column leaks into later steps.
z_scores = (
    (fueleconomy[col_name] - fueleconomy[col_name].mean())
    / fueleconomy[col_name].std()
)

# Flag potential outliers (|Z| > 3); the scores are attached only to the frame being displayed
fueleconomy_outliers = fueleconomy.assign(Z_fueleconomy=z_scores)[z_scores.abs() > 3]

print(f"Detected {len(fueleconomy_outliers)} potential outliers in '{col_name}' fueleconomy.")
display(fueleconomy_outliers[["Year", col_name, "Z_fueleconomy"]])

# Visual outlier detection with a box plot; points="all" draws every observation beside the box
figfe6 = px.box(
    fueleconomy,
    y=col_name,
    title=f"Outlier Detection: {col_name} Fuel Economy",
    points="all",
    color_discrete_sequence=[colors["fueleconomy"]]
)
figfe6.update_layout(template="plotly_white", title_x=0.5, height=500)
figfe6.show()

Detected 0 potential outliers in 'Pickup truck' fueleconomy.


"Fuel economy, mpg",Year,Pickup truck,Z_fueleconomy


# Truck SUV Summary

In [79]:
''' Summarise Truck SUV fuel economy: mean, standard deviation, and coefficient of variation '''

mean = fueleconomy["Truck SUV"].mean()
std = fueleconomy["Truck SUV"].std()
# Coefficient of variation: standard deviation as a percentage of the mean
cv = 100 * (std / mean)

print(
    f"Average fueleconomy for Truck SUVs': {mean:,.2f}",
    f"Standard deviation: {std:,.2f}",
    f"Coefficient of variation: {cv:.2f}%",
    sep="\n",
)

Average fueleconomy for Truck SUVs': 19.04
Standard deviation: 2.88
Coefficient of variation: 15.12%


In [80]:
''' Histogram of the distribution of Truck SUV fuel economy '''
# TODO: revisit the number of bins / how the values are binned

# Build the histogram from the Truck SUV column
figfe7 = px.histogram(
    data_frame=fueleconomy,
    x="Truck SUV",
    title="Distribution of Truck SUV's Fuel Economy",
    nbins=10,
    color_discrete_sequence=[colors["fueleconomy"]],
)

# Layout: white template, centred title, light-gray grid on both axes
figfe7.update_layout(
    height=500,
    hovermode="x unified",
    template="plotly_white",
    title_x=0.5,
    xaxis={"showgrid": True, "gridcolor": "lightgray"},
    yaxis={"showgrid": True, "gridcolor": "lightgray"},
    xaxis_title="MPG of Truck SUV's",
    yaxis_title="Frequency",
)

# Fully opaque bars with a thin black outline
figfe7.update_traces(
    opacity=1,
    marker={"line": {"color": "black", "width": 1}},
)

figfe7.show()

In [81]:
''' Outlier Detection for Truck SUV Fuel Economy using Z-Score Method and Box Plot '''

# Vehicle-type column analysed in this cell
col_name = "Truck SUV"

# Z-score of every row: distance from the column mean in units of its standard deviation.
# Held in a standalone Series rather than written into `fueleconomy`, so the shared frame is
# not mutated by this cell and no scratch column leaks into later steps.
z_scores = (
    (fueleconomy[col_name] - fueleconomy[col_name].mean())
    / fueleconomy[col_name].std()
)

# Flag potential outliers (|Z| > 3); the scores are attached only to the frame being displayed
fueleconomy_outliers = fueleconomy.assign(Z_fueleconomy=z_scores)[z_scores.abs() > 3]

print(f"Detected {len(fueleconomy_outliers)} potential outliers in '{col_name}' fueleconomy.")
display(fueleconomy_outliers[["Year", col_name, "Z_fueleconomy"]])

# Visual outlier detection with a box plot; points="all" draws every observation beside the box
figfe8 = px.box(
    fueleconomy,
    y=col_name,
    title=f"Outlier Detection: {col_name} Fuel Economy",
    points="all",
    color_discrete_sequence=[colors["fueleconomy"]]
)
figfe8.update_layout(template="plotly_white", title_x=0.5, height=500)
figfe8.show()

Detected 0 potential outliers in 'Truck SUV' fueleconomy.


"Fuel economy, mpg",Year,Truck SUV,Z_fueleconomy


# Van Summary

In [82]:
''' Summarise Van fuel economy: mean, standard deviation, and coefficient of variation '''

mean = fueleconomy["Van"].mean()
std = fueleconomy["Van"].std()
# Coefficient of variation: standard deviation as a percentage of the mean
cv = 100 * (std / mean)

print(
    f"Average fueleconomy for Vans: {mean:,.2f}",
    f"Standard deviation: {std:,.2f}",
    f"Coefficient of variation: {cv:.2f}% \n",
    sep="\n",
)

Average fueleconomy for Vans: 20.31
Standard deviation: 1.98
Coefficient of variation: 9.73% 



In [83]:
''' Histogram of the distribution of Van fuel economy '''

# Build the histogram from the Van column
figfe9 = px.histogram(
    data_frame=fueleconomy,
    x="Van",
    title="Distribution of Van's Fuel Economy",
    nbins=10,
    color_discrete_sequence=[colors["fueleconomy"]],
)

# Layout: white template, centred title, light-gray grid on both axes
figfe9.update_layout(
    height=500,
    hovermode="x unified",
    template="plotly_white",
    title_x=0.5,
    xaxis={"showgrid": True, "gridcolor": "lightgray"},
    yaxis={"showgrid": True, "gridcolor": "lightgray"},
    xaxis_title="MPG of Van's",
    yaxis_title="Frequency",
)

# Fully opaque bars with a thin black outline
figfe9.update_traces(
    opacity=1,
    marker={"line": {"color": "black", "width": 1}},
)

figfe9.show()

In [84]:
''' Outlier Detection for Van Fuel Economy using Z-Score Method and Box Plot '''

# Vehicle-type column analysed in this cell
col_name = "Van"

# Z-score of every row: distance from the column mean in units of its standard deviation.
# Held in a standalone Series rather than written into `fueleconomy`, so the shared frame is
# not mutated by this cell and no scratch column leaks into later steps.
z_scores = (
    (fueleconomy[col_name] - fueleconomy[col_name].mean())
    / fueleconomy[col_name].std()
)

# Flag potential outliers (|Z| > 3); the scores are attached only to the frame being displayed
fueleconomy_outliers = fueleconomy.assign(Z_fueleconomy=z_scores)[z_scores.abs() > 3]

print(f"Detected {len(fueleconomy_outliers)} potential outliers in '{col_name}' fueleconomy.")
display(fueleconomy_outliers[["Year", col_name, "Z_fueleconomy"]])

# Visual outlier detection with a box plot; points="all" draws every observation beside the box
figfe10 = px.box(
    fueleconomy,
    y=col_name,
    title=f"Outlier Detection: {col_name} Fuel Economy",
    points="all",
    color_discrete_sequence=[colors["fueleconomy"]]
)
figfe10.update_layout(template="plotly_white", title_x=0.5, height=500)
figfe10.show()

Detected 0 potential outliers in 'Van' fueleconomy.


"Fuel economy, mpg",Year,Van,Z_fueleconomy


# Line Graph

In [88]:
''' Line graph of fuel economy over time for each vehicle type and their yearly average '''

vehicle_cols = ['Car', 'Car SUV', 'Pickup truck', 'Van', 'Truck SUV']

# Row-wise mean across the vehicle-type columns, rounded to two decimals
fueleconomy['AvgFuelEconomy'] = fueleconomy[vehicle_cols].mean(axis=1).round(2)

# One colour per plotted series, in the same order as the y columns below
palette = [
    "#d62728",  # Car (red)
    "#1f77b4",  # Car SUV (blue)
    "#2ca02c",  # Pickup truck (green)
    "#9467bd",  # Van (purple)
    "#ff7f0e",  # Truck SUV (orange)
    "#111111",  # AvgFuelEconomy (dark)
]

# One line per vehicle type plus the average series
fig_line = px.line(
    data_frame=fueleconomy,
    x="Year",
    y=vehicle_cols + ["AvgFuelEconomy"],
    title="Fuel Economy Trends Over Time by Vehicle Type",
    labels={"value": "Fuel Economy (MPG)", "variable": "Vehicle Type"},
    color_discrete_sequence=palette,
)

# Layout: white template, centred title, light-gray grid on both axes
fig_line.update_layout(
    height=600,
    hovermode="x unified",
    template="plotly_white",
    title_x=0.5,
    xaxis={"showgrid": True, "gridcolor": "lightgray"},
    yaxis={"showgrid": True, "gridcolor": "lightgray"},
    xaxis_title="Year",
    yaxis_title="Fuel Economy (MPG)",
)
fig_line.show()

# Final Transformations

In [86]:
# Final shape of the cleaned data: Year as a plain integer, the five vehicle-type columns,
# and 'AvgFuelEconomy' (mean fuel economy across all vehicle types for that year).
vehicle_cols = ['Car', 'Car SUV', 'Pickup truck', 'Van', 'Truck SUV']

# Select the wanted columns by name instead of by position (previously iloc[:, :6]), so the
# result does not depend on how many helper columns earlier cells appended. .copy() makes the
# column assignments below operate on an independent frame rather than a slice.
fueleconomy = fueleconomy[['Year'] + vehicle_cols].copy()

# Year is datetime after the cleaning step; only extract the year while it still is, so that
# re-running this cell does not fail on an already-converted integer column.
if pd.api.types.is_datetime64_any_dtype(fueleconomy['Year']):
    fueleconomy['Year'] = fueleconomy['Year'].dt.year
fueleconomy['Year'] = fueleconomy['Year'].astype(int)

# Row-wise mean across the vehicle-type columns, rounded to two decimals
fueleconomy['AvgFuelEconomy'] = fueleconomy[vehicle_cols].mean(axis=1).round(2)

print(fueleconomy)

Fuel economy, mpg  Year    Car  Car SUV  Pickup truck    Van  Truck SUV  \
7                  1995  23.44    17.81         16.89  18.08      16.00   
8                  1996  23.33    18.43         17.15  18.34      16.23   
9                  1997  23.37    19.23         16.84  18.18      16.13   
10                 1998  23.37    18.24         17.00  18.70      16.16   
11                 1999  23.00    18.51         16.29  18.28      16.07   
12                 2000  22.91    17.89         16.65  18.61      16.01   
13                 2001  23.05    18.83         15.95  18.04      16.41   
14                 2002  23.08    19.30         15.75  18.70      16.31   
15                 2003  23.28    19.91         16.08  18.98      16.42   
16                 2004  23.14    19.97         15.74  19.16      16.47   
17                 2005  23.49    20.22         15.85  19.30      16.73   
18                 2006  23.30    20.45         16.14  19.53      17.16   
19                 2007  

# Export

In [87]:
# Write the cleaned frame to CSV, leaving out the row index
fueleconomy.to_csv(path_or_buf="cleanedfueleconomy.csv", index=False)

# Optional second copy as a pickle, for faster loading
fueleconomy.to_pickle(path="cleanedfueleconomy.pkl")