In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

# Load the listings CSV into a DataFrame. The path is relative to the
# notebook's working directory; adjust it as needed.
df = pd.read_csv("../data/true_car_listings.csv")

# Quick look: first rows, then column dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()


2025-11-11 21:28:53.405496: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


   Price  Year  Mileage              City State                Vin   Make  \
0   8995  2014    35725           El Paso    TX  19VDE2E53EE000083  Acura   
1  10888  2013    19606  Long Island City    NY  19VDE1F52DE012636  Acura   
2   8995  2013    48851           El Paso    TX  19VDE2E52DE000025  Acura   
3  10999  2014    39922           Windsor    CO  19VDE1F71EE003817  Acura   
4  14799  2016    22142            Lindon    UT  19UDE2F32GA001284  Acura   

          Model  
0    ILX6-Speed  
1    ILX5-Speed  
2    ILX6-Speed  
3    ILX5-Speed  
4  ILXAutomatic  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852122 entries, 0 to 852121
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Price    852122 non-null  int64 
 1   Year     852122 non-null  int64 
 2   Mileage  852122 non-null  int64 
 3   City     852122 non-null  object
 4   State    852122 non-null  object
 5   Vin      852122 non-null  object
 6   Make     8

In [2]:
# Drop VIN (unique, non-predictive).
# errors='ignore' keeps this cell idempotent: `df` is rebound in place, so on
# a second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Vin'], errors='ignore')

# Drop rows with missing price, year or mileage
df = df.dropna(subset=['Price', 'Year', 'Mileage'])

# Remove obvious outliers: keep listings priced above 1000 with fewer than
# 500000 on the odometer. One combined mask selects the same rows as applying
# the two filters one after the other.
df = df[(df['Price'] > 1000) & (df['Mileage'] < 500000)]


In [3]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical text column with integer codes.
# The fitted encoder for every column is kept in `encoders` so the codes can
# later be mapped back to their labels (inverse_transform) or applied to new
# rows; previously each encoder was overwritten on the next loop iteration.
# NOTE(review): LabelEncoder codes are arbitrary integers, yet the later cells
# scale them and feed them to dense layers as if they were ordered magnitudes.
# Consider one-hot or embedding inputs for these columns.
encoders = {}
for col in ['Make', 'Model', 'City', 'State']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# info() prints its own report and returns None; wrapping it in print() is
# what produced the trailing "None" line in the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 852064 entries, 0 to 852121
Data columns (total 7 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   Price    852064 non-null  int64
 1   Year     852064 non-null  int64
 2   Mileage  852064 non-null  int64
 3   City     852064 non-null  int64
 4   State    852064 non-null  int64
 5   Make     852064 non-null  int64
 6   Model    852064 non-null  int64
dtypes: int64(7)
memory usage: 52.0 MB
None


In [4]:
# The listing price is the regression target; every other column is a feature.
target_column = 'Price'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test set never influences the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation (and therefore the reported metrics) is
# repeatable across Restart & Run All. 42 matches the seed used for the
# train/test split. Seeding alone may not give bit-identical results on every
# hardware backend.
keras.utils.set_random_seed(42)

# Small fully connected regressor: two hidden ReLU layers and one linear
# output unit. The input width is taken from the scaled training matrix.
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # regression output
])

# Optimise mean squared error; also track mean absolute error as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               896       
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 9217 (36.00 KB)
Trainable params: 9217 (36.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Train for 20 epochs in mini-batches of 256, holding back 20% of the training
# rows for validation. The object returned by fit() is kept in `history`.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=256,
    epochs=20,
    validation_split=0.2,
    verbose=1,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
# Score the model on the held-out test split. The model was compiled with one
# loss and one metric, so evaluate() yields two values: the loss, then the MAE.
test_scores = model.evaluate(x=X_test_scaled, y=y_test, verbose=1)
test_loss, test_mae = test_scores
print(f"Test MAE: ${test_mae:,.0f}")


Test MAE: $7,022


In [9]:
# Predict prices for the test split, then show the first ten predictions next
# to the true prices as a quick sanity check.
y_pred = model.predict(X_test_scaled)

actual_preview = y_test.values[:10]
predicted_preview = y_pred.flatten()[:10]  # flatten the model output to 1-D
comparison = pd.DataFrame({
    'Actual': actual_preview,
    'Predicted': predicted_preview,
})
print(comparison)


   Actual     Predicted
0   13995  10346.714844
1   11467  17651.367188
2   13491  19472.097656
3   15496  22866.191406
4   47400  27910.103516
5   30185  24474.537109
6   26995  25799.562500
7   13295  11767.904297
8   27987  24824.119141
9   30760  28480.250000
