In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Load the Auto MPG dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
auto_mpg_data = pd.read_csv(url, delim_whitespace=True, names=column_names)

# Handling Missing Values
auto_mpg_data.replace('?', np.nan, inplace=True)
auto_mpg_data.dropna(inplace=True)

# Split the data into features (X) and the target (y)
X = auto_mpg_data.drop(columns=['mpg', 'car name'])
y = auto_mpg_data['mpg']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a list of models to experiment with
models = [
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Support Vector Regression", SVR(kernel='linear')),
    # Add other models here
]

best_score = -float("inf")
best_params = None
best_components = None
best_model = None

for model_name, model in models:
    # Hyperparameter tuning (use GridSearchCV or other techniques)
    if model_name == "Linear Regression":
        param_grid = {
            'fit_intercept': [True, False]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = model_name

    elif model_name == "Decision Tree":
        param_grid = {
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5, 10]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = model_name

    elif model_name == "Random Forest":
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = model_name

    elif model_name == "Support Vector Regression":
        param_grid = {
            'C': [0.1, 1, 10],
            'epsilon': [0.01, 0.1, 1]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = model_name

    # Fit the model with the best parameters on the training data
    model = model.set_params(**best_params)
    model.fit(X_train, y_train)

    # Evaluate the model on the testing data
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Best Parameters: {best_params}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}\n")

    # Apply PCA with varying components and evaluate
    components_range = range(1, len(X.columns) + 1)
    best_component_score = -float("inf")
    best_component = None
    for n_components in components_range:
        pca = PCA(n_components=n_components)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        r2_pca = r2_score(y_test, y_pred)

        if r2_pca > best_component_score:
            best_component_score = r2_pca
            best_component = n_components

    print(f"Best R-squared with PCA Components: {best_component_score:.2f} (Components: {best_component})\n")

    if best_component_score > best_score:
        best_score = best_component_score
        best_components = best_component

print("Best Model:", best_model)
print("Best Score (R-squared):", best_score)
print("Best Hyperparameters:", best_params)
print("Best Number of PCA Components:", best_components)

Model: Linear Regression
Best Parameters: {'fit_intercept': True}
Mean Squared Error: 10.71
R-squared: 0.79

Best R-squared with PCA Components: 0.79 (Components: 7)

Model: Decision Tree
Best Parameters: {'max_depth': 5, 'min_samples_split': 5}
Mean Squared Error: 10.96
R-squared: 0.79

Best R-squared with PCA Components: 0.82 (Components: 4)

Model: Random Forest
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Mean Squared Error: 6.12
R-squared: 0.88

Best R-squared with PCA Components: 0.88 (Components: 4)

Model: Support Vector Regression
Best Parameters: {'C': 10, 'epsilon': 1}
Mean Squared Error: 11.19
R-squared: 0.78

Best R-squared with PCA Components: 0.78 (Components: 7)

Best Model: Support Vector Regression
Best Score (R-squared): 0.878217809513733
Best Hyperparameters: {'C': 10, 'epsilon': 1}
Best Number of PCA Components: 4
