Libraries

In [223]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster        import KMeans
from sklearn.neural_network import MLPRegressor
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor

Dataset Import

In [224]:
file_path = "vehiclesclean.csv"
if os.path.isfile(file_path):
    df = pd.read_csv(file_path)
else:
    df = kagglehub.load_dataset(
      KaggleDatasetAdapter.PANDAS,
      "austinreese/craigslist-carstrucks-data",
      file_path
    )

print("First 5 records:", df.head())

First 5 records:    price    year manufacturer                 model  condition    cylinders  \
0  15000  2013.0         ford             f-150 xlt  excellent  6 cylinders   
1  19900  2004.0         ford       f250 super duty       good  8 cylinders   
2  14000  2012.0        honda               odyssey  excellent  6 cylinders   
3  22500  2001.0         ford                  f450       good  8 cylinders   
4  15000  2017.0        dodge  charger rt 4dr sedan  excellent  8 cylinders   

     fuel  odometer title_status transmission drive       size      type  \
0     gas  128000.0        clean    automatic   rwd  full-size     truck   
1  diesel   88000.0        clean    automatic   4wd  full-size    pickup   
2     gas   95000.0        clean    automatic   fwd  full-size  mini-van   
3  diesel  144700.0        clean       manual   rwd  full-size     truck   
4     gas   90000.0      rebuilt    automatic   rwd   mid-size     sedan   

  paint_color state  
0       black    al  
1      

Data Preprocessing

In [None]:
# uncomment if vehiclesclean.csv not found
# df = df.drop(columns=['id', 'region', 'url', 'region_url', 'VIN', 'image_url', 'description', 'county', 'lat', 'long', 'posting_date'])
# print(len(df)) 426880 rows before dropping
df = df.dropna()
# print(len(df)) 79195 after dropping

df = df[df['cylinders'] != "other"] # Need to be careful with this, may change in future, removing a lot of rows
df['cylinders'] = df['cylinders'].str[0].astype(int)
print(df.dtypes)
df.head()

KeyError: "['id', 'region', 'url', 'region_url', 'VIN', 'image_url', 'description', 'county', 'lat', 'long', 'posting_date'] not found in axis"

Feature Engineering

In [None]:
size_map = {
    'sub-compact': 1,
    'compact':     2,
    'mid-size':    3,
    'full-size':   4
}
df['sizes_rank'] = df['size'].map(size_map)

cond_map = {
    'salvage':     1,
    'fair':        2,
    'good':        3,
    'excellent':   4,
    'like new':    5,
    'new':         6
}
df['condition_rank'] = df['condition'].map(cond_map)

# df = df.drop(columns=['size', 'condition'])

def make_desc(row):
    return f"This is a used {int(row.year)} {row.manufacturer} {row.model} with {row.cylinders} cylinders"

df['description'] = df.apply(make_desc, axis=1)
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,sizes_rank,condition_rank,description
31,15000,2013.0,ford,f-150 xlt,excellent,6,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al,4,4,This is a used 2013 ford f-150 xlt with 6 cyli...
55,19900,2004.0,ford,f250 super duty,good,8,diesel,88000.0,clean,automatic,4wd,full-size,pickup,blue,al,4,3,This is a used 2004 ford f250 super duty with ...
59,14000,2012.0,honda,odyssey,excellent,6,gas,95000.0,clean,automatic,fwd,full-size,mini-van,silver,al,4,4,This is a used 2012 honda odyssey with 6 cylin...
65,22500,2001.0,ford,f450,good,8,diesel,144700.0,clean,manual,rwd,full-size,truck,white,al,4,3,This is a used 2001 ford f450 with 8 cylinders
73,15000,2017.0,dodge,charger rt 4dr sedan,excellent,8,gas,90000.0,rebuilt,automatic,rwd,mid-size,sedan,grey,al,3,4,This is a used 2017 dodge charger rt 4dr sedan...


In [None]:
data = df[['price', 'year', 'odometer', 'cylinders']].copy()
X = data[['year', 'odometer', 'cylinders']]
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Matthew's Linear Regression

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print(f"R² score: {r2_score(y_test, y_pred):.4f}")
print(f"MSE:     {mean_squared_error(y_test, y_pred):.2f}")

print("Intercept:", model.intercept_)
print("Coefs:    ", dict(zip(X.columns, model.coef_)))

R² score: -36.3422
MSE:     17989342512.20
Intercept: 96231.26553296599
Coefs:     {'year': np.float64(-5741.428560185611), 'odometer': np.float64(6434.882238875661), 'cylinders': np.float64(104377.88308629108)}


Logan's Polynomial Regression

In [None]:
# valuable LABELS: price, year, manufacturer, model, condition, cylinders, fuel, odometer,
#                  title_status, transmission, drive, size, type, paint_color, posting_date

# for now, use only the numerical ones:
#     price, year, cylinders, odometer


# Create polynomial features (degree=2)
poly = PolynomialFeatures(degree=2, include_bias=False)

# Transform the features
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Display the new feature names
feature_names = poly.get_feature_names_out(X.columns)
print("\nPolynomial feature names:")
print(feature_names)

# Create and train the polynomial regression model
poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)

# Display the coefficients
print("\nModel coefficients:")
for i, coef in enumerate(poly_reg.coef_):
    print(f"{feature_names[i]}: {coef:.4f}")
print(f"Intercept: {poly_reg.intercept_:.4f}")

# Make predictions on the test set
y_pred = poly_reg.predict(X_test_poly)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R²): {r2:.4f}")

# Create a DataFrame with actual and predicted prices
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

# Sort by actual price for better visualization
results = results.sort_values(by='Actual')

# Reset index
results = results.reset_index(drop=True)

def predict_car_price(year, cylinders, mileage):
    # Create a DataFrame with the new car's features
    new_car = pd.DataFrame({
        'year': [year],
        'cylinders': [cylinders],
        'odometer': [mileage]
    })

    # Transform the features to polynomial features
    new_car_poly = poly.transform(new_car)

    # Predict the price
    predicted_price = poly_reg.predict(new_car_poly)[0]

    return predicted_price



Polynomial feature names:
['year' 'odometer' 'cylinders' 'year^2' 'year odometer' 'year cylinders'
 'odometer^2' 'odometer cylinders' 'cylinders^2']

Model coefficients:
year: -0.0000
odometer: 2.3233
cylinders: 0.0135
year^2: -0.1021
year odometer: -0.0011
year cylinders: 27.2025
odometer^2: -0.0000
odometer cylinders: 0.0544
cylinders^2: 0.1554
Intercept: 141521.8355

Model Evaluation:
Mean Squared Error (MSE): 19467101633.52
Root Mean Squared Error (RMSE): 139524.56
R-squared (R²): -39.4098


Dheeraj's Decision Tree


In [None]:
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 4426296756561849.0
R² Score: -9188092.64714874


Everett's Neural Network

In [None]:
# Step 1: Apply K-Means clustering to the dataset
kmeans = KMeans(n_clusters=5, random_state=42)  # Choose 5 clusters (can be adjusted)
df['cluster'] = kmeans.fit_predict(X)  # Add cluster labels as a new feature

nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)

y_pred_nn = nn_model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print("\nNeural Network Evaluation:")
print(f"Mean Squared Error (MSE): {mse_nn:.2f}")
print(f"R-squared (R²): {r2_nn:.4f}")

# Predict car price using the neural network
def predict_price_nn(year, cylinders, mileage):
    cluster = kmeans.predict([[year, cylinders, mileage]])[0]  # Predict cluster for the new car
    new_car_nn = pd.DataFrame({
        'year': [year],
        'cylinders': [cylinders],
        'odometer': [mileage],
        'cluster': [cluster]
    })
    predicted_price_nn = nn_model.predict(new_car_nn)[0]
    return predicted_price_nn

# # Example usage
# year = 2018
# cylinders = 6
# mileage = 85000
# predicted_price_nn = predict_price_nn(year, cylinders, mileage)
# print(f"\nPredicted price for a car with {mileage} miles, {cylinders} cylinders, from year {year}: ${predicted_price_nn:.2f}")


Neural Network Evaluation:
Mean Squared Error (MSE): 4758217931.32
R-squared (R²): -8.8771


Plotting

In [None]:
# plt.figure(figsize=(10, 6))
# plt.plot(results.index, results['Actual'], label='Actual Prices', color='blue', marker='o')
# plt.plot(results.index, results['Predicted'], label='Predicted Prices', color='red', marker='x')
# plt.title('Actual vs. Predicted Car Prices')
# plt.xlabel('Index')
# plt.ylabel('Price')
# plt.legend()
# plt.grid(True)

# # Plot a scatter plot of actual vs. predicted prices
# plt.figure(figsize=(10, 6))
# plt.scatter(results['Actual'], results['Predicted'], alpha=0.5)
# plt.plot([results['Actual'].min(), results['Actual'].max()],
#          [results['Actual'].min(), results['Actual'].max()],
#          'k--', lw=2)
# plt.title('Actual vs. Predicted Car Prices')
# plt.xlabel('Actual Prices')
# plt.ylabel('Predicted Prices')
# plt.grid(True)
