In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Location of the raw Auto MPG file, relative to the notebook's working directory.
DATASET = Path("../data/auto-mpg.data")

# Column names are passed explicitly through `names`, so pandas treats every
# line of the file as data rather than looking for a header row.
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower',
                'weight', 'acceleration', 'model_year', 'origin', 'car_name']

# Fields are split on runs of whitespace. `sep=r"\s+"` is the supported
# replacement for the deprecated `delim_whitespace=True` flag.
# Entries spelled '?' are read as NaN.
cars = pd.read_csv(DATASET, sep=r"\s+", names=column_names, na_values='?')

# Display the first few rows
cars.head()

In [None]:
# Summarize every column, numeric and non-numeric alike, in a single table
summary_statistics = cars.describe(include="all")
summary_statistics


- Just over half of the cars have 4 cylinders (204); 8 cylinders (103) and 6 cylinders (84) are also common.
- The model years span 1970–1982, with 1973 having the most cars (40).
- American cars make up the majority of the dataset (249); Japanese (79) and European (70) cars account for smaller portions.

In [None]:
# Show how the categorical-like columns (cylinders, model_year, origin) are distributed.

# Map the numeric origin codes to readable labels (1=USA, 2=Europe, 3=Japan).
# The conversion runs only while the column is still numeric, so re-running
# this cell does not fail trying to cast the already-mapped labels to int.
ORIGIN_LABELS = {1: 'USA', 2: 'Europe', 3: 'Japan'}
if pd.api.types.is_numeric_dtype(cars['origin']):
    cars['origin'] = cars['origin'].astype(int).map(ORIGIN_LABELS)

# Count occurrences; cylinders and model years are ordered by value,
# origins by descending frequency (the value_counts default).
cylinders_distribution = cars["cylinders"].value_counts().sort_index()
model_year_distribution = cars["model_year"].value_counts().sort_index()
origin_distribution = cars["origin"].value_counts()

# Display the distributions
print("Distribution of cylinders:")
print(cylinders_distribution, end="\n\n")

print("Distribution of model years:")
print(model_year_distribution, end="\n\n")

print("Distribution of car origins:")
print(origin_distribution)

In [None]:
# Coerce horsepower to a numeric dtype, turning any leftover '?' markers into NaN first
horsepower_without_markers = cars["horsepower"].replace("?", np.nan)
cars["horsepower"] = pd.to_numeric(horsepower_without_markers)

# Summary statistics for the horsepower column
print(cars["horsepower"].describe())

# Report how many horsepower entries are missing
missing_horsepower = cars["horsepower"].isna().sum()
print("\nNumber of missing horsepower values:", missing_horsepower)

In [None]:
# Count the NaN entries in each column
missing_by_column = cars.isna().sum()
missing_by_column

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values, touching only the columns that actually contain NaN.
# The imputer returns a float array, so writing its output back over every
# numeric column can turn integer-valued columns into floats even though they
# had nothing to impute (which then shows up in labels such as "4.0 cylinders").

# For numeric columns, using the median is often better than most_frequent
numeric_cols = cars.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_with_missing = [col for col in numeric_cols if cars[col].isna().any()]
if numeric_cols_with_missing:
    numeric_imputer = SimpleImputer(strategy="median")
    cars[numeric_cols_with_missing] = numeric_imputer.fit_transform(cars[numeric_cols_with_missing])

# For categorical columns, use most_frequent
categorical_cols = cars.select_dtypes(include=['object']).columns
categorical_cols_with_missing = [col for col in categorical_cols if cars[col].isna().any()]
if categorical_cols_with_missing:
    categorical_imputer = SimpleImputer(strategy="most_frequent")
    cars[categorical_cols_with_missing] = categorical_imputer.fit_transform(cars[categorical_cols_with_missing])

# Let's display again the number of missing values:
cars.isna().sum()

In [None]:
import matplotlib.pyplot as plt

# One bar chart per categorical distribution, stacked vertically.
# Each entry: (value counts, x-axis label, panel title).
bar_panels = [
    (cylinders_distribution, "Number of Cylinders", "Distribution of Cylinders"),
    (origin_distribution, "Country of Origin", "Distribution of Car Origins"),
    (model_year_distribution, "Model Year", "Distribution of Model Years"),
]

fig, axs = plt.subplots(3, 1, figsize=(8, 12))

for panel_ax, (counts, x_label, panel_title) in zip(axs, bar_panels):
    panel_ax.bar(counts.index, counts.values)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Count")
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid of histograms, filled row by row.
# Each entry: (column name, x-axis label, panel title).
histogram_panels = [
    ("mpg", "Miles Per Gallon", "Distribution of MPG"),
    ("displacement", "Displacement (cu. inches)", "Distribution of Displacement"),
    ("horsepower", "Horsepower", "Distribution of Horsepower"),
    ("weight", "Weight (lbs)", "Distribution of Weight"),
]
# Shared appearance for every histogram in this cell
hist_style = dict(bins=20, color='skyblue', edgecolor='black')

fig, axs = plt.subplots(2, 2, figsize=(10, 8))

for panel_ax, (column, x_label, panel_title) in zip(axs.flat, histogram_panels):
    panel_ax.hist(cars[column], **hist_style)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Count")
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Acceleration is drawn on its own standalone figure
plt.figure(figsize=(8, 5))
plt.hist(cars["acceleration"], **hist_style)
plt.xlabel("Acceleration (sec to reach 60mph)")
plt.ylabel("Count")
plt.title("Distribution of Acceleration")
plt.tight_layout()
plt.show()

MPG and Weight: 
There's a strong negative covariance (approximately -5505) between MPG and weight. This indicates that heavier cars tend to have lower fuel efficiency, which aligns with physical principles - more mass requires more energy to move.
MPG and Displacement/Cylinders/Horsepower: 
There are negative covariances between MPG and engine characteristics (displacement around -655, cylinders around -10, horsepower around -231). This confirms that cars with larger, more powerful engines tend to consume more fuel.
Horsepower and Displacement: 
There's a strong positive covariance (around 3570) between horsepower and displacement, showing that engines with larger displacement tend to produce more power.
Weight and Displacement/Horsepower: 
High positive covariance values (around 82368 for displacement and 27915 for horsepower) indicate that heavier cars tend to have larger, more powerful engines.
MPG and Model Year: 
There's a positive covariance (around 16.7) between MPG and model year, suggesting that fuel efficiency improved over time, likely due to advancing technology and stricter regulations.
MPG and Acceleration: 
The positive covariance (around 9) between MPG and acceleration time might seem counterintuitive, but it indicates that cars with higher MPG tend to have higher acceleration times (slower acceleration), reflecting the trade-off between performance and efficiency.

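These covariance patterns reveal the fundamental engineering trade-offs in automotive design during this period, particularly the balance between power, weight, and fuel economy. Note that covariance depends on the units of each variable, so these magnitudes are not directly comparable across pairs; correlation coefficients put them on a common scale.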

In [None]:
# Keep only the float64/int64 columns and compute their pairwise covariances
numeric_dtypes = ['float64', 'int64']
cars_numeric = cars.select_dtypes(include=numeric_dtypes)
covariance_matrix = cars_numeric.cov()

# Show the resulting matrix
covariance_matrix

MPG and Weight (-0.83): This strong negative correlation confirms that heavier cars consistently have lower fuel efficiency. Weight appears to be the strongest negative predictor of MPG in the dataset.
MPG and Displacement (-0.80): Another strong negative correlation showing that cars with larger engines (higher displacement) typically achieve fewer miles per gallon.
MPG and Cylinders/Horsepower (both around -0.77): Strong negative correlations indicating that more cylinders and higher horsepower are associated with lower fuel economy, as expected.
MPG and Model Year (+0.58): A moderate positive correlation that confirms fuel efficiency improved over time during the 1970s and early 1980s. This likely reflects advancements in automotive technology and stricter emissions/fuel economy regulations.
MPG and Acceleration (+0.42): A moderate positive correlation showing that cars with higher MPG tend to have longer acceleration times (slower acceleration), representing the trade-off between performance and efficiency.
Displacement and Cylinders (+0.95): An extremely strong correlation indicating that these two variables capture very similar information - more cylinders typically mean larger engine displacement.
Weight and Displacement (+0.93): Very strong correlation showing heavier cars almost always have larger engines.
Horsepower and Displacement/Weight/Cylinders (all above +0.84): Strong correlations confirming that powerful cars tend to have larger engines with more cylinders and greater weight.
Acceleration and Horsepower (-0.69): A strong negative correlation showing that cars with more horsepower accelerate faster (lower acceleration time).

These correlations reveal important insights for modeling:

The strongest predictors of MPG are weight, displacement, cylinders, and horsepower (all negatively correlated).
There's significant multicollinearity among engine features (displacement, cylinders, horsepower) that could affect model stability.
The positive trend in MPG over model years suggests technological improvements over time.
The data confirms basic automotive engineering principles about the relationships between weight, power, and fuel efficiency.

When building a prediction model, consider feature selection or dimensionality reduction techniques to address the multicollinearity among predictors.

In [None]:
# Pairwise Pearson correlations (the pandas default) between the numeric columns
correlation_matrix = cars_numeric.corr(method="pearson")

# Show the resulting matrix
correlation_matrix

In [None]:
unique_cylinders = cars["cylinders"].unique()

# Origin is categorical, so count the cars for each (origin, cylinders) pair
# and draw grouped bars. A histogram with numeric bins over string categories
# can leave its bars misaligned with the category tick labels.
origin_by_cylinders = pd.crosstab(cars["origin"], cars["cylinders"])
origin_by_cylinders.columns = [f"{cylinder} cylinders" for cylinder in origin_by_cylinders.columns]

fig, ax = plt.subplots(figsize=(10, 6))
# rot=0 keeps the origin names horizontal on the x-axis
origin_by_cylinders.plot.bar(ax=ax, rot=0)

ax.set_xlabel("Origin")
ax.set_ylabel("Count")
ax.set_title("Distribution of Car Origins by Cylinder Count")
ax.legend()
plt.show()

In [None]:
# Overlay one model-year histogram per origin, using as many bins as there
# are distinct model years in the full dataset
year_bin_count = len(cars["model_year"].unique())

fig, ax = plt.subplots(figsize=(10, 6))

for origin_label in cars["origin"].unique():
    model_years = cars.loc[cars["origin"] == origin_label, "model_year"]
    ax.hist(model_years, bins=year_bin_count, alpha=0.5, label=origin_label)

ax.set_xlabel("Model Year")
ax.set_ylabel("Count")
ax.set_title("Distribution of Car Origins by Model Year")

ax.legend()
plt.show()

In [None]:
# Overlay the MPG histogram of each cylinder count, in ascending cylinder order
cylinder_counts = sorted(cars["cylinders"].unique())

fig, ax = plt.subplots(figsize=(10, 6))

for n_cyl in cylinder_counts:
    mpg_values = cars.loc[cars["cylinders"] == n_cyl, "mpg"]
    ax.hist(mpg_values, bins=20, alpha=0.5, label=f"{n_cyl} cylinders")

ax.set_xlabel("MPG")
ax.set_ylabel("Count")
ax.set_title("Distribution of MPG by Number of Cylinders")
ax.legend()
plt.show()

In [None]:
# Overlay the horsepower histogram of each origin on a shared axis
fig, ax = plt.subplots(figsize=(10, 6))

for origin_label in cars["origin"].unique():
    from_origin = cars["origin"] == origin_label
    ax.hist(cars.loc[from_origin, "horsepower"], bins=20, alpha=0.5, label=origin_label)

ax.set_xlabel("Horsepower")
ax.set_ylabel("Count")
ax.set_title("Distribution of Horsepower by Origin")
ax.legend()
plt.show()

In [None]:
# Overlay the weight histogram of each origin on a shared axis
fig, ax = plt.subplots(figsize=(10, 6))

for origin_label in cars["origin"].unique():
    weights = cars.loc[cars["origin"] == origin_label, "weight"]
    ax.hist(weights, bins=20, alpha=0.5, label=origin_label)

ax.set_xlabel("Weight (lbs)")
ax.set_ylabel("Count")
ax.set_title("Distribution of Car Weight by Origin")
ax.legend()
plt.show()

In [None]:
# One line per origin: mean MPG of that origin's cars for each model year
fig, ax = plt.subplots(figsize=(12, 6))

for origin_label in cars["origin"].unique():
    origin_rows = cars.loc[cars["origin"] == origin_label]
    mean_mpg_by_year = origin_rows.groupby("model_year")["mpg"].mean()
    ax.plot(
        mean_mpg_by_year.index,
        mean_mpg_by_year.values,
        marker='o',
        linestyle='-',
        label=origin_label,
    )

ax.set_xlabel("Model Year")
ax.set_ylabel("Average MPG")
ax.set_title("Average MPG by Model Year and Origin")
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()

In [None]:
# Overlay the acceleration histogram of each cylinder count, in ascending cylinder order
cylinder_counts = sorted(cars["cylinders"].unique())

fig, ax = plt.subplots(figsize=(10, 6))

for n_cyl in cylinder_counts:
    acceleration_values = cars.loc[cars["cylinders"] == n_cyl, "acceleration"]
    ax.hist(acceleration_values, bins=15, alpha=0.5, label=f"{n_cyl} cylinders")

ax.set_xlabel("Acceleration (sec to reach 60mph)")
ax.set_ylabel("Count")
ax.set_title("Distribution of Acceleration by Number of Cylinders")
ax.legend()
plt.show()