In [None]:
# Created by: Michael Cullen
# 19/11/2024
# https://zenodo.org/records/10457828

In [44]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

In [45]:
# Read the vehicle listings from the CSV file in the working directory
csv_path = 'vehicles.csv'
df = pd.read_csv(csv_path)

# Preview the first rows of the raw frame
df.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [46]:
# Drop the columns that are not used as model features
# (errors='ignore' keeps the cell re-runnable once they are already gone)
df = df.drop(
    columns=[
        'region_url', 'description', 'url', 'id', 'posting_date', 'lat', 'long',
        'VIN', 'image_url', 'county', 'state', 'region', 'size', 'model',
        'paint_color', 'title_status', 'type',
    ],
    errors='ignore',
)

# Skip the first 30 listings.
# NOTE(review): presumably these are the leading rows that carry no vehicle
# attributes (see the preview above) - confirm.
# Slicing by index label (.loc) instead of position (.iloc) selects the same rows
# on a freshly loaded frame, but re-running this cell no longer drops 30 more rows.
df = df.loc[30:]

# Fill missing values in the categorical columns only. The numeric columns
# (year, odometer) keep NaN so they stay numeric; the binning functions further
# down treat NaN as "missing".
categorical_cols = ['manufacturer', 'condition', 'cylinders', 'fuel', 'transmission', 'drive']
df = df.fillna({col: 'unknown' for col in categorical_cols})

# Relabel fuel categories
df['fuel'] = df['fuel'].replace({
    'other': 'diesel',  # Replace 'other' with 'diesel'
    'gas': 'petrol'     # Replace 'gas' with 'petrol'
})

# Replace 'other' in the 'transmission' column
df['transmission'] = df['transmission'].replace({
    'other': 'manual'   # Replace 'other' with 'manual'
})

# Preview the cleaned frame
df.head(20)


Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive
30,30990,2017.0,toyota,good,8 cylinders,petrol,41124.0,manual,unknown
31,15000,2013.0,ford,excellent,6 cylinders,petrol,128000.0,automatic,rwd
32,27990,2012.0,gmc,good,8 cylinders,petrol,68696.0,manual,4wd
33,34590,2016.0,chevrolet,good,6 cylinders,petrol,29499.0,manual,4wd
34,35000,2019.0,toyota,excellent,6 cylinders,petrol,43000.0,automatic,4wd
35,29990,2016.0,chevrolet,good,6 cylinders,petrol,17302.0,manual,4wd
36,38590,2011.0,chevrolet,good,8 cylinders,petrol,30237.0,manual,rwd
37,4500,1992.0,jeep,excellent,6 cylinders,petrol,192000.0,automatic,4wd
38,32990,2017.0,jeep,good,6 cylinders,petrol,30041.0,manual,4wd
39,24590,2017.0,chevrolet,good,6 cylinders,petrol,40784.0,manual,unknown


In [47]:
# Convert the odometer and year columns to numeric;
# anything that cannot be parsed becomes NaN
for numeric_col in ('odometer', 'year'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')



# Define the function to categorize odometer values
def categorize_odometer(odometer):
    """Bin an odometer reading into an ordinal group.

    Groups: 1 = missing, 2 = 0-20k, 3 = 20-50k, 4 = 50-100k,
    5 = 100-150k, 6 = 150-200k, 7 = 200k+.
    Upper bounds are inclusive (20000 falls in group 2).

    Parameters
    ----------
    odometer : number, None, NaN or pd.NA
        The reading to classify.

    Returns
    -------
    int
        Group code between 1 and 7. None, NaN, pd.NA and negative readings
        all map to 1 (missing).
    """
    # Test for missing values before any comparison: `pd.NA < 0` evaluates to
    # pd.NA, whose truth value is ambiguous and raises TypeError.
    if odometer is None or pd.isna(odometer):
        return 1  # Missing
    if odometer < 0:
        return 1  # Negative readings are treated as missing
    if odometer <= 20000:
        return 2  # 0-20k
    if odometer <= 50000:
        return 3  # 20-50k
    if odometer <= 100000:
        return 4  # 50-100k
    if odometer <= 150000:
        return 5  # 100-150k
    if odometer <= 200000:
        return 6  # 150-200k
    return 7  # 200k+
    
# Define the function to categorize year values
def categorize_year(year):
    """Bin a model year into an ordinal age group.

    Groups: 1 = very old (before 1990), 2 = old (1990-2004),
    3 = modern (2005-2014), 4 = new (2015 onwards).
    A missing year (None / NaN) is also reported as 1, so it is
    indistinguishable from "very old" in the result.
    """
    if year is None or pd.isna(year):
        return 1  # Missing shares the code of "very old"

    # Each cutoff is the first year that no longer belongs to the group
    for group, first_year_of_next_group in enumerate((1990, 2005, 2015), start=1):
        if year < first_year_of_next_group:
            return group
    return 4  # New

# Define function to categorize price
def categorize_price(price):
    """Bin a listing price into an ordinal price group.

    Groups: 1 = low (below 5000), 2 = medium (5000-19999),
    3 = high (20000-39999), 4 = premium (40000 and above).
    A missing price (None / NaN) is also reported as 1, i.e. it is
    counted as "low".
    """
    price_is_missing = price is None or pd.isna(price)
    if price_is_missing:
        return 1  # Missing shares the code of "low"

    # Guard clauses, cheapest band first
    if price < 5000:
        return 1  # Low
    if price < 20000:
        return 2  # Medium
    if price < 40000:
        return 3  # High
    return 4  # Premium

# Apply the binning functions to create the 'group' columns
df['odometer_group'] = df['odometer'].apply(categorize_odometer)
df['year_group'] = df['year'].apply(categorize_year)
df['price_group'] = df['price'].apply(categorize_price)

# Integer-encode the categorical columns. Each fitted encoder is kept, keyed by
# column name, so a code can be translated back to its category later with
# label_encoders[col].inverse_transform(...) or inspected via .classes_.
# (LabelEncoder is imported in the import cell at the top of the notebook.)
label_encoders = {}
for col in ['manufacturer', 'cylinders', 'drive', 'condition', 'fuel', 'transmission']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Drop the original columns now that their binned versions exist
df = df.drop(columns=['odometer', 'year', 'price'], errors='ignore')

# Show the first 10 rows
df.head(10)


Unnamed: 0,manufacturer,condition,cylinders,fuel,transmission,drive,odometer_group,year_group,price_group
30,39,2,6,3,1,3,3,4,3
31,13,0,5,3,0,2,5,3,2
32,14,2,6,3,1,0,4,3,3
33,7,2,5,3,1,0,3,4,3
34,39,0,5,3,0,0,3,4,3
35,7,2,5,3,1,0,2,4,3
36,7,2,6,3,1,2,3,3,3
37,20,0,5,3,0,0,6,2,1
38,20,2,5,3,1,0,3,4,3
39,7,2,5,3,1,3,3,4,3


In [48]:
# Number of listings left after cleaning
print(len(df))

426850


In [49]:
# Target (y) is the binned price; every other column is a feature (X)
target_col = 'price_group'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [50]:
# Small forest: 10 trees, each limited to depth 5 (use fewer trees).
# random_state fixes the randomness used while growing the trees, so the fitted
# model and the accuracy reported below are the same on every run; 42 matches
# the seed used for the train/test split.
rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

# Predicted price group for every held-out listing
y_pred = rf.predict(X_test)

In [None]:
# Share of test listings whose price group was predicted exactly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5881925735035727


: 