# Polynomial Regression

In [None]:
# Imports
import pandas as pd
from data_cleaning import clean_car_data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the raw car listings from the CSV file in the working directory
csv_path = "USA_cars_datasets.csv"
cars_raw = pd.read_csv(csv_path)

In [None]:
# Run the raw frame through the project's cleaning helper
cars = cars_raw.pipe(clean_car_data)

In [None]:
# One-hot encode the categorical variables

# Collect the names of every column stored with the pandas 'category' dtype
category_columns = cars.select_dtypes(include='category').columns

# Replace each categorical column with indicator (dummy) columns;
# drop_first=True omits the indicator for the first level of each variable
cars = pd.get_dummies(data=cars, columns=category_columns, drop_first=True)

In [None]:
# Split the frame into the target (price) and the predictors (all other columns)
target_column = 'price'
y = cars[target_column]
x = cars.drop(columns=[target_column])

In [None]:
# Standardize the numeric variables
scaler = MinMaxScaler()

# Identify numeric columns
number_columns = x.select_dtypes('number').columns

# Standardize numeric features
x[number_columns] = scaler.fit_transform(cars[number_columns])

In [None]:
# Partition the Data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

In [None]:
# Implement Polynomial Regression
# Feature expander configured to generate polynomial terms up to degree 2.
# NOTE(review): `poly` is re-created for every degree inside the loop further
# down, so this instance is only used by the standalone transform in the next cell.
poly = PolynomialFeatures(degree=2)

In [None]:
# Learn the polynomial expansion from the training features only, then apply
# the already-fitted expansion to both partitions
poly.fit(x_train)
x_train_poly = poly.transform(x_train)
x_test_poly = poly.transform(x_test)

In [None]:
# Accumulators for the per-degree scores; entries are appended in the same
# order as `degrees`
R2_train_list, RMSE_train_list = [], []
R2_test_list, RMSE_test_list = [], []

# Polynomial degrees to evaluate: 2 through 10 inclusive
degrees = list(range(2, 11))

In [None]:
# Fit one linear model per polynomial degree and record train/test R² and RMSE.
#
# The result lists are created in an earlier cell, so they are emptied here
# first: otherwise re-running this cell would append a second set of scores,
# leaving the lists longer than `degrees` and breaking the plots that follow.
for scores in (R2_train_list, RMSE_train_list, R2_test_list, RMSE_test_list):
    scores.clear()

# NOTE(review): the number of generated polynomial columns grows very quickly
# with both the degree and the number of input features — confirm that degrees
# up to 10 are feasible in memory for this feature set.
for d in degrees:
    # Create polynomial features (expansion is fitted on the training data only)
    poly = PolynomialFeatures(degree=d)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)

    # Train model
    model = LinearRegression()
    model.fit(x_train_poly, y_train)

    # Predictions
    pred_train = model.predict(x_train_poly)
    pred_test = model.predict(x_test_poly)

    # Train errors (RMSE is the square root of the mean squared error)
    R2_train = r2_score(y_train, pred_train)
    RMSE_train = mean_squared_error(y_train, pred_train) ** 0.5

    # Test errors
    R2_test = r2_score(y_test, pred_test)
    RMSE_test = mean_squared_error(y_test, pred_test) ** 0.5

    # Append to lists (one entry per degree, in `degrees` order)
    R2_train_list.append(R2_train)
    RMSE_train_list.append(RMSE_train)
    R2_test_list.append(R2_test)
    RMSE_test_list.append(RMSE_test)

In [None]:
# Plot train and test R² against the polynomial degree on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))
for series, series_label, series_marker in (
    (R2_train_list, "Train R²", 'o'),
    (R2_test_list, "Test R²", 's'),
):
    ax.plot(degrees, series, label=series_label, marker=series_marker)
ax.set_xlabel("Polynomial Degree")
ax.set_ylabel("R² Score")
ax.set_title("Model Performance vs. Polynomial Degree")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Plot train and test RMSE against the polynomial degree on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))
for series, series_label, series_marker in (
    (RMSE_train_list, "Train RMSE", 'o'),
    (RMSE_test_list, "Test RMSE", 's'),
):
    ax.plot(degrees, series, label=series_label, marker=series_marker)
ax.set_xlabel("Polynomial Degree")
ax.set_ylabel("RMSE")
ax.set_title("Root Mean Squared Error vs. Polynomial Degree")
ax.legend()
ax.grid(True)
plt.show()