<a href="https://colab.research.google.com/github/nikolasleeb/INFO523_FinalProject/blob/main/VehicleRegistration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
import glob, os

# Render floats with a comma thousands separator and no decimal places.
# For two decimals, assign '{:,.2f}'.format instead.
pd.options.display.float_format = '{:,.0f}'.format

# Attach Data

In [2]:
# Load the vehicle-registration workbook into `registrations`.
#
# Option A (Google Colab) -- disabled. Uncomment the lines below to mount
# Google Drive and load the workbook from there instead of the local file.
#
# from google.colab import drive
# drive.mount('/content/drive')
#
# # Search Drive recursively for the workbook; `matches` holds every hit
# matches = glob.glob('/content/drive/MyDrive/**/VehicleRegistrations.xlsx', recursive=True)
# print("Found paths:", matches)
#
# # Load the workbook from the first matching path
# path = matches[0]
# registrations = pd.read_excel(path, engine='openpyxl')
# registrations.head()

# Option B (active): load the workbook from the current working directory
# for demonstration purposes.
registrations = pd.read_excel('VehicleRegistrations.xlsx', engine='openpyxl')

# Clear the names of the row index and of the columns axis
registrations.index.name = None
registrations.columns.name = None

registrations.head()

Unnamed: 0,Year,1995,1996,1997,1998,1999,2000,2001,2002,2003,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Number of Automobile Vehicle Registrations,136066045,129728341,129748704,131838538,132432044,133621420,137633467,135920677,135669897,...,111289906,113676345,113898845,112864228,112961266,111177029,111242132,108547710,105135300,102973881
1,Number of Motorcycle Vehicle Registrations,3767029,3871237,3826373,3879450,4152433,4346068,4903056,5004156,5370035,...,8454939,8404687,8417718,8600936,8679380,8664108,8659741,8596314,8347435,9881414
2,Motor vehicle licensed drivers,176628482,179539340,182709204,184860969,187170420,190625023,191275719,194295633,196165667,...,211814830,212159728,214092472,218084465,221994424,225346257,227558385,228679719,228195802,232781797


In [3]:
# Shared fill colour for the registration charts (looked up as colors["registrations"])
colors = dict(registrations="#DCDCAF")

In [4]:
# Count null / missing values in each column
registrations.isna().sum()

Year    0
1995    0
1996    0
1997    0
1998    0
1999    0
2000    0
2001    0
2002    0
2003    0
2004    0
2005    0
2006    0
2007    0
2008    0
2009    0
2010    0
2011    0
2012    0
2013    0
2014    0
2015    0
2016    0
2017    0
2018    0
2019    0
2020    0
2021    0
dtype: int64

# Cleaning and Transforming

In [5]:
# Transpose the table so that every year becomes a row and every measure a column.

# The first column holds the measure labels (it is headed "Year" in the raw sheet)
id_col = registrations.columns[0]

registrations = (
    registrations
    .round(2)                                  # round numeric values to 2 decimals
    .set_index(id_col)                         # measure labels become the row index ...
    .T                                         # ... and, once transposed, the column headers
    # Name the year index 'Year' and drop the columns-axis name: after the
    # transpose that axis would otherwise be called "Year" as well, which
    # shows up as a duplicated "Year" header in every displayed table.
    .rename_axis(index='Year', columns=None)
    .reset_index()                             # turn the year index into a regular 'Year' column
)

registrations.head()

Year,Year.1,Number of Automobile Vehicle Registrations,Number of Motorcycle Vehicle Registrations,Motor vehicle licensed drivers
0,1995,136066045,3767029,176628482
1,1996,129728341,3871237,179539340
2,1997,129748704,3826373,182709204
3,1998,131838538,3879450,184860969
4,1999,132432044,4152433,187170420


In [6]:
# Check the column names, dtypes, and number of non-null entries
registrations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 4 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Year                                        27 non-null     int64  
 1   Number of Automobile Vehicle Registrations  27 non-null     float64
 2   Number of Motorcycle Vehicle Registrations  27 non-null     float64
 3   Motor vehicle licensed drivers              27 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 996.0 bytes


In [7]:
# Clean the registrations table: parse 'Year' as a datetime and keep 1995-2021.

# Parse the year values with the '%Y' format
registrations["Year"] = pd.to_datetime(registrations["Year"], format='%Y')

# Keep rows whose calendar year lies within 1995..2021, both ends included
year_in_range = registrations["Year"].dt.year.between(1995, 2021)
registrations = registrations[year_in_range]

# Show the first 10 rows
registrations.head(10)

Year,Year.1,Number of Automobile Vehicle Registrations,Number of Motorcycle Vehicle Registrations,Motor vehicle licensed drivers
0,1995-01-01,136066045,3767029,176628482
1,1996-01-01,129728341,3871237,179539340
2,1997-01-01,129748704,3826373,182709204
3,1998-01-01,131838538,3879450,184860969
4,1999-01-01,132432044,4152433,187170420
5,2000-01-01,133621420,4346068,190625023
6,2001-01-01,137633467,4903056,191275719
7,2002-01-01,135920677,5004156,194295633
8,2003-01-01,135669897,5370035,196165667
9,2004-01-01,136430651,5780870,198888912


In [8]:
# Shorter, identifier-style names for the three measure columns
short_names = {
    'Number of Automobile Vehicle Registrations': 'Automobile_Registrations',
    'Number of Motorcycle Vehicle Registrations': 'Motorcycle_Registrations',
    'Motor vehicle licensed drivers': 'Licensed_Drivers',
}
registrations = registrations.rename(columns=short_names)

# Summary Stats

In [9]:
# Summary statistics (count, mean, min, quartiles, max, std) of the cleaned table
registrations.describe()

Year,Year.1,Automobile_Registrations,Motorcycle_Registrations,Licensed_Drivers
count,27,27,27,27
mean,2008-01-01 08:53:20,125157985,6806726,205630125
min,1995-01-01 00:00:00,102973881,3767029,176628482
25%,2001-07-02 12:00:00,112912747,4953606,192785676
50%,2008-01-01 00:00:00,130892240,7752926,208320601
75%,2014-07-02 12:00:00,135795287,8446220,216088469
max,2021-01-01 00:00:00,137633467,9881414,232781797
std,,12040002,1977060,16233334


In [10]:
# Skewness and kurtosis (scipy defaults) of every measure column in the registrations data

columns_to_analyze = registrations.columns[1:]  # everything after the leading 'Year' column

for column in columns_to_analyze:
    values = registrations[column].dropna()   # both statistics are computed on non-missing values
    skewness, kurt = skew(values), kurtosis(values)
    print(f"Column: {column}\n  Skewness: {skewness:.4f}\n  Kurtosis: {kurt:.4f}\n")

Column: Automobile_Registrations
  Skewness: -0.5179
  Kurtosis: -1.4172

Column: Motorcycle_Registrations
  Skewness: -0.3593
  Kurtosis: -1.4021

Column: Licensed_Drivers
  Skewness: -0.0799
  Kurtosis: -1.0196



In [11]:
# Report the date range covered and the total number of years of data.
# Each row is one year, so the row count is a number of years (the label
# previously said "months").

print("Earliest date:", registrations["Year"].min())
print("Latest date:", registrations["Year"].max())
print("Total years of data:", len(registrations))

Earliest date: 1995-01-01 00:00:00
Latest date: 2021-01-01 00:00:00
Total months of data: 27


# Automobile Registrations

- number of new drivers

In [12]:
# Mean, standard deviation, and coefficient of variation (std as a percentage
# of the mean) of annual automobile registrations.

auto_series = registrations["Automobile_Registrations"]
mean, std = auto_series.mean(), auto_series.std()
cv = std / mean * 100

print(f"Average Annual Automobile Registrations: {mean:,.2f}")
print(f"Standard deviation: {std:,.2f}")
print(f"Coefficient of variation: {cv:.2f}% \n")

Average Annual Automobile Registrations: 125,157,985.01
Standard deviation: 12,040,001.98
Coefficient of variation: 9.62% 



In [13]:
# Histogram of the distribution of automobile registrations.

figvr1 = (
    px.histogram(
        registrations,
        x="Automobile_Registrations",
        nbins=8,
        title="Distribution of Automobile Registrations",
        color_discrete_sequence=[colors["registrations"]],
    )
    # White template, centred title, light-gray grid on both axes
    .update_layout(
        template="plotly_white",
        title_x=0.5,
        xaxis_title="Automobile Registrations",
        yaxis_title="Frequency",
        xaxis=dict(showgrid=True, gridcolor="lightgray"),
        yaxis=dict(showgrid=True, gridcolor="lightgray"),
        hovermode="x unified",
        height=500,
    )
    # Fully opaque bars with a thin black outline
    .update_traces(marker_line_color="black", marker_line_width=1, opacity=1)
)

figvr1.show()

In [14]:
# Outlier detection for automobile registrations: Z-score method plus a box plot.

# Column under inspection
col_name = "Automobile_Registrations"

# Z-score: distance of each value from the column mean, in units of the
# column's standard deviation
registrations["Z_automobile_registrations"] = (
    (registrations[col_name] - registrations[col_name].mean())
    / registrations[col_name].std()
)

# Flag potential outliers (|Z| > 3)
registrations_outliers = registrations[np.abs(registrations["Z_automobile_registrations"]) > 3]
print(f"Detected {len(registrations_outliers)} potential outliers in '{col_name}' registrations.")
display(registrations_outliers[["Year", col_name, "Z_automobile_registrations"]])

# Visual outlier detection with a box plot; every observation is drawn as a point.
# The title uses the column name only (it already ends in "Registrations"),
# matching the motorcycle and licensed-driver outlier plots.
figvr2 = px.box(
    registrations,
    y=col_name,
    title=f"Outlier Detection: {col_name}",
    points="all",
    color_discrete_sequence=[colors["registrations"]]
)
figvr2.update_layout(template="plotly_white", title_x=0.5, height=500)
figvr2.show()

Detected 0 potential outliers in 'Automobile_Registrations' registrations.


Year,Year.1,Automobile_Registrations,Z_automobile_registrations


# Motorcycle Registrations

In [15]:
# Mean, standard deviation, and coefficient of variation (std as a percentage
# of the mean) of annual motorcycle registrations.

moto_series = registrations["Motorcycle_Registrations"]
mean, std = moto_series.mean(), moto_series.std()
cv = std / mean * 100

print(f"Average annual motorcycle registrations: {mean:,.2f}")
print(f"Standard deviation: {std:,.2f}")
print(f"Coefficient of variation: {cv:.2f}% \n")

Average annual motorcycle registrations: 6,806,726.41
Standard deviation: 1,977,060.34
Coefficient of variation: 29.05% 



In [16]:
# Histogram of the distribution of motorcycle registrations.

figvr3 = (
    px.histogram(
        registrations,
        x="Motorcycle_Registrations",
        nbins=8,
        title="Distribution of Motorcycle Registrations",
        color_discrete_sequence=[colors["registrations"]],
    )
    # White template, centred title, light-gray grid on both axes
    .update_layout(
        template="plotly_white",
        title_x=0.5,
        xaxis_title="Motorcycle Registrations",
        yaxis_title="Frequency",
        xaxis=dict(showgrid=True, gridcolor="lightgray"),
        yaxis=dict(showgrid=True, gridcolor="lightgray"),
        hovermode="x unified",
        height=500,
    )
    # Fully opaque bars with a thin black outline
    .update_traces(marker_line_color="black", marker_line_width=1, opacity=1)
)

figvr3.show()

In [17]:
# Outlier detection for motorcycle registrations: Z-score method plus a box plot.

col_name = "Motorcycle_Registrations"
z_col = "Z_motorcycle_registrations"

# Z-score: (value - column mean) / column standard deviation
moto_values = registrations[col_name]
registrations[z_col] = (moto_values - moto_values.mean()) / moto_values.std()

# Rows with |Z| > 3 are treated as potential outliers
registrations_outliers = registrations[registrations[z_col].abs() > 3]
print(f"Detected {len(registrations_outliers)} potential outliers in '{col_name}' registrations.")
display(registrations_outliers[["Year", col_name, z_col]])

# Box plot for a visual check; every observation is drawn as a point
figvr4 = px.box(
    registrations,
    y=col_name,
    title=f"Outlier Detection: {col_name}",
    points="all",
    color_discrete_sequence=[colors["registrations"]],
).update_layout(template="plotly_white", title_x=0.5, height=500)
figvr4.show()

Detected 0 potential outliers in 'Motorcycle_Registrations' registrations.


Year,Year.1,Motorcycle_Registrations,Z_motorcycle_registrations


# Licensed Drivers

In [18]:
# Mean, standard deviation, and coefficient of variation (std as a percentage
# of the mean) of the yearly number of licensed drivers.

drivers_series = registrations["Licensed_Drivers"]
mean, std = drivers_series.mean(), drivers_series.std()
cv = std / mean * 100

print(f"Average Annual licensed drivers: {mean:,.2f}")
print(f"Standard deviation: {std:,.2f}")
print(f"Coefficient of variation: {cv:.2f}% \n")

Average Annual licensed drivers: 205,630,124.95
Standard deviation: 16,233,334.02
Coefficient of variation: 7.89% 



In [19]:

# Histogram of the distribution of licensed drivers.
# The chart title reads "Licensed Drivers" (previously misspelled "License
# Drivers"), in line with the x-axis label and the column name.

# Create histogram
figvr5 = px.histogram(
    registrations,
    x="Licensed_Drivers",
    nbins=12,
    title="Distribution of Licensed Drivers",
    color_discrete_sequence=[colors["registrations"]]
)

# Customize layout: white template, centred title, light-gray grid on both axes
figvr5.update_layout(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Licensed Drivers",
    yaxis_title="Frequency",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=500
)

# Fully opaque bars with a thin black outline
figvr5.update_traces(marker_line_color="black", marker_line_width=1, opacity=1)

figvr5.show()

In [20]:
# Outlier detection for licensed drivers: Z-score method plus a box plot.

# Column under inspection
col_name = "Licensed_Drivers"

# Z-score: distance of each value from the column mean, in units of the
# column's standard deviation
registrations["Z_licensed_drivers"] = (
    (registrations[col_name] - registrations[col_name].mean())
    / registrations[col_name].std()
)

# Flag potential outliers (|Z| > 3)
registrations_outliers = registrations[np.abs(registrations["Z_licensed_drivers"]) > 3]
# The message names the column only: licensed drivers are not registrations,
# so the trailing word "registrations" was dropped.
print(f"Detected {len(registrations_outliers)} potential outliers in '{col_name}'.")
display(registrations_outliers[["Year", col_name, "Z_licensed_drivers"]])

# Visual outlier detection with a box plot; every observation is drawn as a point
figvr6 = px.box(
    registrations,
    y=col_name,
    title=f"Outlier Detection: {col_name}",
    points="all",
    color_discrete_sequence=[colors["registrations"]]
)
figvr6.update_layout(template="plotly_white", title_x=0.5, height=500)
figvr6.show()

Detected 0 potential outliers in 'Licensed_Drivers' registrations.


Year,Year.1,Licensed_Drivers,Z_licensed_drivers


# Line Graph

In [21]:
# Line graph of the three yearly series: automobile registrations, motorcycle
# registrations, and licensed drivers.
# Licensed drivers are neither a vehicle type nor a registration count, so the
# title, legend, and y-axis use neutral wording that is accurate for all three.

fig_line = px.line(
    registrations,
    x="Year",
    y=["Automobile_Registrations", "Motorcycle_Registrations", "Licensed_Drivers"],
    title="Vehicle Registrations and Licensed Drivers Over Time",
    labels={"value": "Count", "variable": "Series"},
    color_discrete_sequence=px.colors.qualitative.Set1
)

# White template, centred title, light-gray grid on both axes
fig_line.update_layout(
    template="plotly_white",
    title_x=0.5,
    xaxis_title="Year",
    yaxis_title="Count",
    xaxis=dict(showgrid=True, gridcolor="lightgray"),
    yaxis=dict(showgrid=True, gridcolor="lightgray"),
    hovermode="x unified",
    height=600
)
fig_line.show()

# Final Transformations

In [22]:
# Final table: keep the cleaned base columns, add total registrations, and
# store 'Year' as a plain integer.

# Select the base columns by name rather than by position, so the result does
# not depend on how many helper columns (e.g. the Z-score columns) were added
# earlier. The explicit copy means the assignments below write to an
# independent frame rather than to a slice of the previous one.
base_columns = ["Year", "Automobile_Registrations", "Motorcycle_Registrations", "Licensed_Drivers"]
registrations = registrations.loc[:, base_columns].copy()

# Total registrations = motorcycles + automobiles, rounded to whole units
registrations['TotalRegistrations'] = (
    registrations['Motorcycle_Registrations'] + registrations['Automobile_Registrations']
).round(0)

# Reduce the datetime 'Year' to its calendar year. The dtype guard lets the
# cell be re-run after 'Year' has already been converted to an integer.
if pd.api.types.is_datetime64_any_dtype(registrations['Year']):
    registrations['Year'] = registrations['Year'].dt.year
registrations['Year'] = registrations['Year'].astype(int)

print(registrations)

Year  Year  Automobile_Registrations  Motorcycle_Registrations  \
0     1995               136,066,045                 3,767,029   
1     1996               129,728,341                 3,871,237   
2     1997               129,748,704                 3,826,373   
3     1998               131,838,538                 3,879,450   
4     1999               132,432,044                 4,152,433   
5     2000               133,621,420                 4,346,068   
6     2001               137,633,467                 4,903,056   
7     2002               135,920,677                 5,004,156   
8     2003               135,669,897                 5,370,035   
9     2004               136,430,651                 5,780,870   
10    2005               136,568,083                 6,227,146   
11    2006               135,399,945                 6,678,958   
12    2007               135,932,930                 7,138,476   
13    2008               137,079,843                 7,752,926   
14    2009

# Export

In [23]:
# Output files, written to the current working directory
csv_path = "cleanedregistrations.csv"
pickle_path = "cleanedregistrations.pkl"

# CSV export without the row index
registrations.to_csv(csv_path, index=False)

# Optional pickle copy for faster loading
registrations.to_pickle(pickle_path)