<a href="https://colab.research.google.com/github/nikolasleeb/INFO523_FinalProject/blob/main/VehicleRegistration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
import glob, os

# Attach Data

In [28]:
'''Connects to Google Drive to load in data'''

# '''
# Connect to and mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Find the dataset in google drive and assign path to variable called 'Match'
matches = glob.glob('/content/drive/MyDrive/**/VehicleRegistrations.xlsx', recursive=True)
print("Found paths:", matches)

# Using 'Match' load in the dataset using the path
path = matches[0]
registrations = pd.read_excel(path, engine='openpyxl')
registrations.head()
# '''

'''
# Load dataset from local file for demonstration purposes
registrations = pd.read_excel('VehicleRegistrations.xlsx', engine='openpyxl')
registrations.head()
'''

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found paths: ['/content/drive/MyDrive/DataMiningFall25/INFO 523 Final Project/Datasets/VehicleRegistration/VehicleRegistrations.xlsx']


"\n# Load dataset from local file for demonstration purposes\nregistrations = pd.read_excel('VehicleRegistrations.xlsx', engine='openpyxl')\nregistrations.head()\n"

In [29]:
colors = { "registrations": "#DCDCAF" }

In [30]:
''' check for null and missing values'''
registrations.isnull().sum()

Unnamed: 0,0
Inventory,0
1995,0
1996,0
1997,0
1998,0
1999,0
2000,0
2001,0
2002,0
2003,0


# Cleaning and Transforming

In [31]:
''' Transpose the table so that years become the first column '''

id_col = registrations.columns[0]            # e.g. "Fuel economy, mpg"
registrations = registrations.round(2)
registrations = registrations.set_index(id_col).T.reset_index().rename(columns={'index':'Year'})

registrations.head()

Inventory,Year,Number of Automobile Vehicle Registrations,Number of Motorcycle Vehicle Registrations,Motor vehicle licensed drivers
0,1995,136066045.0,3767029.0,176628482.0
1,1996,129728341.0,3871237.14,179539340.0
2,1997,129748704.0,3826373.0,182709204.0
3,1998,131838538.0,3879450.0,184860969.0
4,1999,132432044.0,4152433.0,187170420.0


In [32]:
''' Verify the columns, datatypes, and number of entries'''
registrations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 4 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Year                                        27 non-null     int64  
 1   Number of Automobile Vehicle Registrations  27 non-null     float64
 2   Number of Motorcycle Vehicle Registrations  27 non-null     float64
 3   Motor vehicle licensed drivers              27 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 996.0 bytes


In [33]:
''' Data Cleaning for registrations Data '''

# Convert Year column to Year format
registrations["Year"] = pd.to_datetime(registrations["Year"], format='%Y')

# Filter to keep data where the year is between 1995 and 2021 (inclusive)
registrations = registrations[
    (registrations["Year"].dt.year >= 1995) &
    (registrations["Year"].dt.year <= 2021)
]

# Preview the first 10 rows
registrations.head(10)

Inventory,Year,Number of Automobile Vehicle Registrations,Number of Motorcycle Vehicle Registrations,Motor vehicle licensed drivers
0,1995-01-01,136066045.0,3767029.0,176628482.0
1,1996-01-01,129728341.0,3871237.14,179539340.0
2,1997-01-01,129748704.0,3826373.0,182709204.0
3,1998-01-01,131838538.0,3879450.0,184860969.0
4,1999-01-01,132432044.0,4152433.0,187170420.0
5,2000-01-01,133621420.0,4346068.0,190625023.0
6,2001-01-01,137633467.0,4903056.0,191275719.0
7,2002-01-01,135920677.0,5004156.0,194295633.0
8,2003-01-01,135669897.0,5370035.0,196165667.0
9,2004-01-01,136430651.0,5780870.0,198888912.0


In [34]:
''' Rename columns for easier access '''
registrations = registrations.rename(columns={
    'Number of Automobile Vehicle Registrations': 'Automobile_Registrations',
    'Number of Motorcycle Vehicle Registrations': 'Motorcycle_Registrations',
    'Motor vehicle licensed drivers': 'Licensed_Drivers'
})

# Summary Stats

In [35]:
registrations.describe()

Inventory,Year,Automobile_Registrations,Motorcycle_Registrations,Licensed_Drivers
count,27,27.0,27.0,27.0
mean,2008-01-01 08:53:20,125158000.0,6806726.0,205630100.0
min,1995-01-01 00:00:00,102973900.0,3767029.0,176628500.0
25%,2001-07-02 12:00:00,112912700.0,4953606.0,192785700.0
50%,2008-01-01 00:00:00,130892200.0,7752926.0,208320600.0
75%,2014-07-02 12:00:00,135795300.0,8446220.0,216088500.0
max,2021-01-01 00:00:00,137633500.0,9881414.0,232781800.0
std,,12040000.0,1977060.0,16233330.0


In [36]:
'''' View skewness and kurtosis of consumption data '''

columns_to_analyze = registrations.columns[1:]  # Exclude the 'Year' column

for column in columns_to_analyze:
    skewness = skew(registrations[column].dropna())
    kurt = kurtosis(registrations[column].dropna())
    print(f"Column: {column}")
    print(f"  Skewness: {skewness:.4f}")
    print(f"  Kurtosis: {kurt:.4f}\n")

Column: Automobile_Registrations
  Skewness: -0.5179
  Kurtosis: -1.4172

Column: Motorcycle_Registrations
  Skewness: -0.3593
  Kurtosis: -1.4021

Column: Licensed_Drivers
  Skewness: -0.0799
  Kurtosis: -1.0196



In [37]:
''' View registrations date range and total years of data '''

print("Earliest date:", registrations["Year"].min())
print("Latest date:", registrations["Year"].max())
print("Total months of data:", len(registrations))

Earliest date: 1995-01-01 00:00:00
Latest date: 2021-01-01 00:00:00
Total months of data: 27


# Automobile Registrations

- number of new drivers

In [38]:
''' Calculate and print average automobile registrations, standard deviation, and coefficient of variation '''

mean = registrations["Automobile_Registrations"].mean()
std = registrations["Automobile_Registrations"].std()
cv = (std / mean) * 100
print(f"Average Annual Automobile Registrations: {mean:,.2f}")
print(f"Standard deviation: {std:,.2f}")
print(f"Coefficient of variation: {cv:.2f}% \n")

Average Annual Automobile Registrations: 125,157,985.01
Standard deviation: 12,040,001.98
Coefficient of variation: 9.62% 



In [39]:
''' Create a histogram to visualize the distribution of Automobile Registrations '''

# Create histogram
figvr1 = px.histogram(
    registrations,
    x="Automobile_Registrations",
    nbins=8,
    title="Distribution of Automobile Registrations",
    color_discrete_sequence=[colors["registrations"]]
)

# Customize layout
figvr1.update_layout(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Automobile Registrations",
    yaxis_title="Frequency",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=500
)

figvr1.update_traces(marker_line_color="black", marker_line_width=1, opacity=1)

figvr1.show()

In [40]:
''' Outlier Detection for Automobile Registrations using Z-Score Method and Box Plot '''

# Use an existing column name from the DataFrame (e.g., "Car")
col_name = "Automobile_Registrations"

# Compute Z-score for the selected column
registrations["Z_automobile_registrations"] = (
    (registrations[col_name] - registrations[col_name].mean())
    / registrations[col_name].std()
)

# Flag potential outliers (|Z| > 3)
registrations_outliers = registrations[np.abs(registrations["Z_automobile_registrations"]) > 3]
print(f"Detected {len(registrations_outliers)} potential outliers in '{col_name}' registrations.")
display(registrations_outliers[["Year", col_name, "Z_automobile_registrations"]])

# visual outlier detection with box plot
figvr2 = px.box(
    registrations,
    y=col_name,
    title=f"Outlier Detection: {col_name} Registrations",
    points="all",
    color_discrete_sequence=[colors["registrations"]]
)
figvr2.update_layout(template="plotly_white", title_x=0.5, height=500)
figvr2.show()

Detected 0 potential outliers in 'Automobile_Registrations' registrations.


Inventory,Year,Automobile_Registrations,Z_automobile_registrations


# Motorcycle Registrations

In [41]:
''' Calculate and print average yearly motorcycle registrations, standard deviation, and coefficient of variation '''

mean = registrations["Motorcycle_Registrations"].mean()
std = registrations["Motorcycle_Registrations"].std()
cv = (std / mean) * 100
print(f"Average annual motorcycle registrations: {mean:,.2f}")
print(f"Standard deviation: {std:,.2f}")
print(f"Coefficient of variation: {cv:.2f}% \n")

Average annual motorcycle registrations: 6,806,726.41
Standard deviation: 1,977,060.34
Coefficient of variation: 29.05% 



In [42]:
''' Create a histogram to visualize the distribution of Motorcycle Registrations '''

# Create histogram
figvr3 = px.histogram(
    registrations,
    x="Motorcycle_Registrations",
    nbins=8,
    title="Distribution of Motorcycle Registrations",
    color_discrete_sequence=[colors["registrations"]]
)

# Customize layout
figvr3.update_layout(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Motorcycle Registrations",
    yaxis_title="Frequency",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=500
)

figvr3.update_traces(marker_line_color="black", marker_line_width=1, opacity=1)

figvr3.show()

In [43]:
''' Outlier Detection for Motorcycle Registrations using Z-Score Method and Box Plot '''

# Use an existing column name from the DataFrame (e.g., "Car")
col_name = "Motorcycle_Registrations"

# Compute Z-score for the selected column
registrations["Z_motorcycle_registrations"] = (
    (registrations[col_name] - registrations[col_name].mean())
    / registrations[col_name].std()
)

# Flag potential outliers (|Z| > 3)
registrations_outliers = registrations[np.abs(registrations["Z_motorcycle_registrations"]) > 3]
print(f"Detected {len(registrations_outliers)} potential outliers in '{col_name}' registrations.")
display(registrations_outliers[["Year", col_name, "Z_motorcycle_registrations"]])

# visual outlier detection with box plot
figvr4 = px.box(
    registrations,
    y=col_name,
    title=f"Outlier Detection: {col_name}",
    points="all",
    color_discrete_sequence=[colors["registrations"]]
)
figvr4.update_layout(template="plotly_white", title_x=0.5, height=500)
figvr4.show()

Detected 0 potential outliers in 'Motorcycle_Registrations' registrations.


Inventory,Year,Motorcycle_Registrations,Z_motorcycle_registrations


# Licensed Drivers

In [44]:
''' Calculate and print average yearly licensed drivers, standard deviation, and coefficient of variation '''

mean = registrations["Licensed_Drivers"].mean()
std = registrations["Licensed_Drivers"].std()
cv = (std / mean) * 100
print(f"Average Annual licensed drivers: {mean:,.2f}")
print(f"Standard deviation: {std:,.2f}")
print(f"Coefficient of variation: {cv:.2f}% \n")

Average Annual licensed drivers: 205,630,124.95
Standard deviation: 16,233,334.02
Coefficient of variation: 7.89% 



In [45]:

''' Create a histogram to visualize the distribution of License Drivers '''

# Create histogram
figvr5 = px.histogram(
    registrations,
    x="Licensed_Drivers",
    nbins=12,
    title="Distribution of License Drivers",
    color_discrete_sequence=[colors["registrations"]]
)

# Customize layout
figvr5.update_layout(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Licensed Drivers",
    yaxis_title="Frequency",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=500
)

figvr5.update_traces(marker_line_color="black", marker_line_width=1, opacity=1)

figvr5.show()

In [46]:
''' Outlier Detection for Licensed Driver using Z-Score Method and Box Plot '''

# Use an existing column name from the DataFrame (e.g., "Car")
col_name = "Licensed_Drivers"

# Compute Z-score for the selected column
registrations["Z_licensed_drivers"] = (
    (registrations[col_name] - registrations[col_name].mean())
    / registrations[col_name].std()
)

# Flag potential outliers (|Z| > 3)
registrations_outliers = registrations[np.abs(registrations["Z_licensed_drivers"]) > 3]
print(f"Detected {len(registrations_outliers)} potential outliers in '{col_name}' registrations.")
display(registrations_outliers[["Year", col_name, "Z_licensed_drivers"]])

# visual outlier detection with box plot
figvr6 = px.box(
    registrations,
    y=col_name,
    title=f"Outlier Detection: {col_name}",
    points="all",
    color_discrete_sequence=[colors["registrations"]]
)
figvr6.update_layout(template="plotly_white", title_x=0.5, height=500)
figvr6.show()

Detected 0 potential outliers in 'Licensed_Drivers' registrations.


Inventory,Year,Licensed_Drivers,Z_licensed_drivers


# Line Graph

In [47]:
''' create a line graph showing trends over time for each vehicle type '''

fig_line = px.line(
    registrations,
    x="Year",
    y=["Automobile_Registrations", "Motorcycle_Registrations", "Licensed_Drivers"],
    title="Vehicle Registration Trends Over Time by Vehicle Type",
    labels={"value": "Number of Registrations", "variable": "Vehicle Type"},
    color_discrete_sequence=px.colors.qualitative.Set1
)
fig_line.update_layout(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Year",
    yaxis_title="Number of Registrations",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=600
)
fig_line.show()

# Final Transformations

In [48]:
# create new column of total registrations
registrations['TotalRegistrations'] = registrations['Motorcycle_Registrations'] + registrations['Automobile_Registrations']
registrations['TotalRegistrations'] = registrations['TotalRegistrations'].round(0)
registrations['Year'] = registrations['Year'].dt.year
registrations['Year'] = registrations['Year'].astype(int)

print(registrations)

Inventory  Year  Automobile_Registrations  Motorcycle_Registrations  \
0          1995              1.360660e+08                3767029.00   
1          1996              1.297283e+08                3871237.14   
2          1997              1.297487e+08                3826373.00   
3          1998              1.318385e+08                3879450.00   
4          1999              1.324320e+08                4152433.00   
5          2000              1.336214e+08                4346068.00   
6          2001              1.376335e+08                4903056.00   
7          2002              1.359207e+08                5004156.00   
8          2003              1.356699e+08                5370035.00   
9          2004              1.364307e+08                5780870.00   
10         2005              1.365681e+08                6227146.00   
11         2006              1.353999e+08                6678958.00   
12         2007              1.359329e+08                7138475.79   
13    

# Export

In [49]:
# Save as CSV
registrations.to_csv("cleanedregistrations.csv", index=False)

# Optional: Save as Pickle for faster loading
registrations.to_pickle("cleanedregistrations.pkl")