### Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster        import KMeans
from sklearn.neural_network import MLPRegressor
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
import torch
from torch.utils.data import Dataset, DataLoader

### Dataset Import

In [None]:
# Local cache of the already-cleaned listings. When it exists it is read
# directly and no columns are dropped.
file_path = "vehiclesclean.csv"
# Name of the CSV published inside the Kaggle dataset. The original code asked
# Kaggle for the local cache name, which is not a file of that dataset, so the
# fallback branch could not load anything.
kaggle_file = "vehicles.csv"

if os.path.isfile(file_path):
    df = pd.read_csv(file_path)
else:
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "austinreese/craigslist-carstrucks-data",
        kaggle_file,
    )
    # Identifier, URL, free-text, location and timestamp columns are removed
    # here, before any modelling.
    df = df.drop(columns=['id', 'region', 'url', 'region_url', 'VIN', 'image_url', 'description', 'county', 'lat', 'long', 'posting_date'])
    # NOTE(review): the cleaned frame is not written back to `file_path`, so
    # the cache branch is only taken if that file was created by other means.

### Data Preprocessing

In [None]:
# Keep only complete listings: a row is discarded if any of its values is
# missing. Row counts noted in the original run: 426880 before, 79195 after.
df = df.dropna(axis=0, how="any")
df.head()

### Feature Engineering

In [None]:
# Dtypes treated as numeric; every other column is considered categorical.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = df.columns.values.tolist()

# Find categorical features
categorical_columns = [col for col in features if df[col].dtype not in numerics]

# Filtering / binning parameters
REFERENCE_YEAR = 2025        # year used to convert a model year into an age
MAX_AGE_YEARS = 15           # keep cars strictly younger than this
MIN_PRICE = 1000             # exclusive lower price bound
MAX_PRICE = 150000           # exclusive upper price bound
ODOMETER_BIN_MILES = 5000    # odometer is bucketed into bins of this width


def _cylinder_phrase(value):
    """Format a cylinders value as '<n>-cylinder'.

    Accepts numbers (6, 6.0), numeric strings ('6') and strings that start
    with a count ('6 cylinders'). Any other value is used verbatim, e.g.
    'other' becomes 'other-cylinder'.
    """
    try:
        return f"{int(float(value))}-cylinder"
    except (TypeError, ValueError, OverflowError):
        pass
    text = str(value)
    parts = text.split()
    if parts and parts[0].isdecimal():
        return f"{int(parts[0])}-cylinder"
    return f"{text}-cylinder"


def make_desc(row):
    """Build the natural-language prompt describing one listing.

    `row` must provide the keys condition, year, manufacturer, model, size,
    type, cylinders, fuel, transmission, drive, odometer, title_status, state
    and paint_color. It is meant to be called on the raw (un-encoded) values
    so the sentence contains real words, the real model year and the real
    mileage.
    """
    return (
        f"This is a {row['condition']} {int(row['year'])} {row['manufacturer']} {row['model']}, "
        f"a {row['size']} sized {row['type']} with a {_cylinder_phrase(row['cylinders'])} {row['fuel']} engine, "
        f"{row['transmission']} transmission, and {row['drive']} drive. "
        f"It has {int(row['odometer']):,} miles, holds a {row['title_status']} title in {row['state']}, "
        f"is painted {row['paint_color']}, predict its price."
    )


# Row filter, computed once on the raw values:
#   * only cars less than MAX_AGE_YEARS old
#   * only cars priced strictly between MIN_PRICE and MAX_PRICE
vehicle_age = (REFERENCE_YEAR - df['year']).astype(int)
keep = (
    (vehicle_age < MAX_AGE_YEARS)
    & (df['price'] > MIN_PRICE)
    & (df['price'] < MAX_PRICE)
)

# The descriptions are generated BEFORE label encoding, age conversion and
# odometer binning. Building them afterwards (as the code previously did)
# fills the sentence with integer codes instead of words, an age instead of
# the model year and a bin index instead of the mileage.
descriptions = df[keep].apply(make_desc, axis=1)

# Label encode categorical features. The encoders are fitted on all rows
# (before filtering), so the integer codes are the same as before.
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Replace the model year by the car's age and apply the row filter. The
# explicit copy avoids pandas' SettingWithCopyWarning on the assignments below.
df['year'] = vehicle_age
df = df[keep].copy()

# Binning odometer into groups of 5k
df['odometer'] = df['odometer'].astype(int) // ODOMETER_BIN_MILES

# Same rows, same order as `df[keep]` above, so positional assignment is safe.
df['description'] = descriptions.to_numpy()

df.head()

### BERT Encoding

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The tokenized inputs are moved to `device` before the forward pass, so the
# model weights must live on the same device; otherwise the forward pass fails
# with a device mismatch on CUDA machines.
model = BertModel.from_pretrained('bert-base-uncased').to(device)
# Inference only: make evaluation mode explicit so dropout is disabled.
model.eval()

class TextDataset(Dataset):
    """Minimal map-style dataset that serves items of a sequence unchanged."""

    def __init__(self, texts):
        # Stored as given; indexing and length are delegated to this object.
        self.texts = texts

    def __len__(self):
        """Return the number of stored items."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the item stored at position `idx`."""
        return self.texts[idx]

# Embed one batch of texts
def process_batch(batch):
    """Return one embedding vector per text in `batch` as a NumPy array.

    The texts are tokenized together (padded to the longest item, truncated
    to at most 128 tokens), moved to `device` and passed through `model`
    without gradient tracking. The hidden state at the first token position
    of each sequence is returned on the CPU.
    """
    encoded = tokenizer(
        batch,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of every sequence: presumably the [CLS] token for this tokenizer.
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.cpu().numpy()

# Prepare dataset and dataloader. shuffle=False keeps the batches in row
# order, so row k of the embedding matrix belongs to row k of `df`.
dataset = TextDataset(df['description'].tolist())
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

# Process all batches
embeddings = [process_batch(batch) for batch in dataloader]
embeddings_matrix = np.vstack(embeddings)

# Dimensionality reduction
pca = PCA(n_components=50, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings_matrix)
# NOTE(review): the PCA is fitted on every row, i.e. before the train/test
# split performed later in the file.

# Add to DataFrame. All new columns are attached with a single concat:
# inserting several hundred columns one at a time fragments the frame and
# makes pandas emit a PerformanceWarning.
bert_columns = pd.DataFrame(
    embeddings_matrix,
    index=df.index,
    columns=[f'bert_{i}' for i in range(embeddings_matrix.shape[1])],
)
bert_pca_columns = pd.DataFrame(
    reduced_embeddings,
    index=df.index,
    columns=[f'bert_pca_{i}' for i in range(reduced_embeddings.shape[1])],
)
new_columns = pd.concat([bert_columns, bert_pca_columns], axis=1)
# Dropping first keeps the cell re-runnable without duplicating column names.
df = pd.concat(
    [df.drop(columns=new_columns.columns, errors='ignore'), new_columns],
    axis=1,
)


In [None]:
# Target: listing price. Features: every remaining column except the
# free-text description.
y = df['price']
X = df.drop(columns=['description', 'price'])

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Matthew's Linear Regression

In [None]:
# Standardise features using statistics learned from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ordinary least squares on the standardised features.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Hold-out metrics.
lr_r2, lr_mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

### Logan's Polynomial Regression

In [None]:
# Degree-2 expansion (squares and pairwise products, no constant column),
# fitted on the training split and applied to both splits.
# NOTE(review): the number of generated columns grows quadratically with the
# number of columns in X.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train)
X_train_poly, X_test_poly = poly.transform(X_train), poly.transform(X_test)

# Linear model on the expanded features.
model = LinearRegression().fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)

# Hold-out metrics.
pr_r2, pr_mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)


### Dheeraj's Decision Tree


In [None]:
# Regression tree with default settings on the raw features; the seed is
# fixed so the fitted tree is reproducible.
model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics.
dt_r2, dt_mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

### Everett's Neural Network

In [None]:
# Cluster every listing into 5 groups and store the label on `df`.
# NOTE(review): `X` was extracted from `df` earlier in the file, so this
# column is not part of the features the network is trained on.
kmeans = KMeans(n_clusters=5, random_state=0)
df['cluster'] = kmeans.fit_predict(X)

# Multi-layer perceptrons are sensitive to feature scale, and the columns of
# X live on very different ranges (label codes, odometer bins, embedding
# values). Standardise with statistics from the training split only, as is
# already done for the linear regression in this file.
nn_scaler = StandardScaler().fit(X_train)

model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=0)
model.fit(nn_scaler.transform(X_train), y_train)

y_pred = model.predict(nn_scaler.transform(X_test))

# Hold-out metrics.
nn_r2 = r2_score(y_test, y_pred)
nn_mse = mean_squared_error(y_test, y_pred)

### Results

In [None]:
# Side-by-side report of the hold-out metrics of the four models.
print(
    "R-squared (R²): \n"
    f"\tLinear Regression: {lr_r2:.4f}\n"
    f"\tPolynomial Regression: {pr_r2:.4f}\n"
    f"\tDecision Tree: {dt_r2:.4f}\n"
    f"\tNeural Network: {nn_r2:.4f}"
)
print()
print(
    "Mean Squared Error (MSE): \n"
    f"\tLinear Regression: {lr_mse:.2f}\n"
    f"\tPolynomial Regression: {pr_mse:.2f}\n"
    f"\tDecision Tree: {dt_mse:.2f}\n"
    f"\tNeural Network: {nn_mse:.2f}"
)