In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# --- Configuration (No Hardcoded Values) ---
# Use a URL to directly download the dataset
DATASET_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
COLUMN_NAMES = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
TARGET_VARIABLE = 'mpg'
TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- Step 2: Load the Dataset ---
# Load the data from the URL, specifying column names and handling missing values
# The dataset uses '?' for missing horsepower values
try:
    dataset = pd.read_csv(DATASET_URL, names=COLUMN_NAMES,
                          na_values='?', comment='\t',
                          sep=" ", skipinitialspace=True)
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()


# --- Step 3: Data Cleaning and Preprocessing ---
# Drop rows with any missing values. [1]
dataset = dataset.dropna()

# The 'origin' column is categorical (1: USA, 2: Europe, 3: Japan).
# Convert it into dummy variables (one-hot encoding) for the model to use. [5, 10]
dataset = pd.get_dummies(dataset, columns=['origin'], prefix='', prefix_sep='')
dataset.rename(columns={1: 'USA', 2: 'Europe', 3: 'Japan'}, inplace=True)

# The 'car_name' column is not useful for prediction, so we drop it.
dataset = dataset.drop('car_name', axis=1)


# --- Step 4: Prepare Data for Modeling ---
# Separate the features (X) from the target variable (y)
X = dataset.drop(TARGET_VARIABLE, axis=1)
y = dataset[TARGET_VARIABLE]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SET_SIZE, random_state=RANDOM_STATE)


# --- Step 5: Build and Train the Linear Regression Model ---
# Create an instance of the LinearRegression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)
print("\nLinear Regression model trained successfully.")


# --- Step 6: Make Predictions and Evaluate the Model ---
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


# --- Step 7: Analyze the Results for Two Purposes ---

# Purpose 1: Performance Evaluation
print("\n--- Model Performance Analysis ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (RÂ²): {r2:.2f}")
print(f"\nInterpretation: The R-squared value of {r2:.2f} means that our model can explain approximately {r2*100:.0f}% of the variance in MPG, which indicates a good fit.")

# Purpose 2: Feature Importance Analysis
print("\n--- Feature Importance Analysis ---")
# Create a DataFrame to view the feature coefficients
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefficients)
print("\nInterpretation of Coefficients:")
print("- A negative coefficient (e.g., weight, horsepower) means that as the feature's value increases, the MPG tends to decrease.")
print("- A positive coefficient (e.g., model_year) means that as the feature's value increases, the MPG tends to increase.")

# --- Bonus: Visualize the Results ---
# Scatter plot of actual vs. predicted values
plt.figure(figsize=(10, 6))
sns.regplot(x=y_test, y=y_pred, scatter_kws={"alpha":0.6})
plt.xlabel("Actual MPG (y_test)")
plt.ylabel("Predicted MPG (y_pred)")
plt.title("Actual vs. Predicted MPG")
plt.grid(True)
plt.show()

Dataset loaded successfully.


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.