In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [36]:
df = pd.read_csv("US-pumpkins.csv")
df = df[['Variety', 'City Name', 'Package', 'Date', 'Low Price']].dropna()
df = df.rename(columns={
    'City Name': 'City',
    'Low Price': 'Price'
})
df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%y", errors='coerce')
df = df.dropna(subset=['Date'])
df['DayOfYear'] = df['Date'].dt.dayofyear

In [38]:
def evaluate_model(X, y, model_name, polynomial=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = make_pipeline(PolynomialFeatures(2), LinearRegression()) if polynomial else LinearRegression()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    perc_error = rmse / np.mean(y_test) * 100
    score = model.score(X_train, y_train)
    return model_name, rmse, perc_error, score


In [40]:
results = []

# DayOfYear models
X = df[['DayOfYear']]
y = df['Price']
results.append(evaluate_model(X, y, "DayOfYear Linear", polynomial=False))
results.append(evaluate_model(X, y, "DayOfYear Polynomial", polynomial=True))

# Variety only
X = pd.get_dummies(df['Variety'])
results.append(evaluate_model(X, y, "Variety Linear", polynomial=False))

# All features
X = pd.get_dummies(df[['Variety', 'City', 'Package']])
X = X.join(df['DayOfYear'])
results.append(evaluate_model(X, y, "All features Linear", polynomial=False))
results.append(evaluate_model(X, y, "All features Polynomial", polynomial=True))


In [42]:
results_df = pd.DataFrame(results, columns=["Model", "MSE", "Percent Error", "Determination"])
results_df["MSE"] = results_df["MSE"].apply(lambda x: f"{x:.2f}")
results_df["Percent Error"] = results_df["Percent Error"].apply(lambda x: f"{x:.1f}%")
results_df["Determination"] = results_df["Determination"].apply(lambda x: f"{x:.2f}")
results_df


Unnamed: 0,Model,MSE,Percent Error,Determination
0,DayOfYear Linear,80.97,65.4%,0.02
1,DayOfYear Polynomial,80.81,65.3%,0.02
2,Variety Linear,63.64,51.4%,0.34
3,All features Linear,36.76,29.7%,0.78
4,All features Polynomial,19999980.82,16157921.7%,0.93
