# Model Development

In this notebook, we will develop predictive models using the [Vehicles.csv](https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data) dataset. We will explore various machine learning techniques to create models that can accurately predict used car prices. The process will include building base models and optimizing them through techniques such as regularization and hyperparameter tuning.

By the end of this notebook, we will have established a set of foundational models ready for evaluation, aiming to identify the most effective model for predicting used car prices.

In [8]:
import os, zipfile
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
# Notebook-wide configuration.
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Show all DataFrame columns, and render floats with thousands separators and two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:,.2f}'.format

# Unpack clean_vehicles.csv from its archive into ../data, then read it back.
archive_path = '../data/clean_vehicles.csv.zip'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extract(member='clean_vehicles.csv', path='../data')

# Work on a 10,000-row random subset; the fixed random_state keeps the
# drawn rows the same on every run against the same file.
data = pd.read_csv('../data/clean_vehicles.csv').sample(n=10000, random_state=42)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 15418 to 32859
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         10000 non-null  int64  
 1   year          10000 non-null  int64  
 2   manufacturer  10000 non-null  object 
 3   model         10000 non-null  object 
 4   condition     10000 non-null  object 
 5   odometer      10000 non-null  float64
 6   cylinders     10000 non-null  object 
 7   fuel          10000 non-null  object 
 8   title_status  10000 non-null  object 
 9   transmission  10000 non-null  object 
 10  drive         10000 non-null  object 
 11  type          10000 non-null  object 
 12  paint_color   10000 non-null  object 
 13  state         10000 non-null  object 
 14  description   10000 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 1.2+ MB


## Random Forest

In [None]:
# Target is `price`; every other column is a candidate predictor.
df = data.copy()
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups routed to the different encoders. Columns that appear in
# none of these lists (e.g. `description`) are not handed to any transformer
# and fall under ColumnTransformer's default remainder='drop'.
numerical_features = ['odometer']
ordinal_features = ['year', 'condition', 'title_status']
categorical_features = [
    'manufacturer', 'model', 'cylinders', 'fuel', 'transmission',
    'drive', 'type', 'paint_color', 'state',
]

numerical_transformer = StandardScaler()
# Categories unseen during fit are encoded as -1 at transform time.
# NOTE(review): no explicit `categories` order is supplied, so the integer
# codes follow the encoder's default ordering -- confirm that matches the
# intended ranking for `condition` and `title_status`.
ordinal_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Dense one-hot output; categories unseen during fit are ignored.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('ord', ordinal_transformer, ordinal_features),
    ('cat', categorical_transformer, categorical_features),
])

# Encode -> PCA (components explaining 99% of the variance) -> random forest.
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.99)),
    ('model', RandomForestRegressor(n_jobs=-1, random_state=42)),
])
rf_model.fit(X_train, y_train)

# Score on the held-out rows; the square root of the MSE puts the error
# back in the units of `price`.
rf_predictions = rf_model.predict(X_test)
rf_error = mean_squared_error(y_test, rf_predictions)
error_price = np.sqrt(rf_error)
print(error_price)

## Neural Network

In [None]:
# Target is `price`; every other column is a candidate predictor.
df = data.copy()
X = df.drop(columns='price')
y = df['price']

# Hold out 20% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups routed to the different encoders; columns in none of these
# lists (e.g. `description`) are not handed to any transformer.
numerical_features = ['odometer']
ordinal_features = ['year', 'condition', 'title_status']
categorical_features = ['manufacturer', 'model', 'cylinders', 'fuel',
                        'transmission', 'drive', 'type', 'paint_color', 'state']

numerical_transformer = StandardScaler()
# Categories unseen during fit are encoded as -1 at transform time.
ordinal_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Dense one-hot output; categories unseen during fit are ignored.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the encoders and PCA on the training rows only, then apply the fitted
# transforms to the test rows so no test information leaks into them.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
pca = PCA(n_components=0.99)  # keep the components explaining 99% of the variance
X_train_pca = pca.fit_transform(X_train_preprocessed)
X_test_pca = pca.transform(X_test_preprocessed)

# Seed Python, NumPy and TensorFlow before any layer is created. Without
# this, weight initialisation, dropout masks and batch shuffling differ on
# every run, so the RMSE printed below would change on each
# Restart & Run All. (Op-level GPU determinism is not enforced here.)
set_random_seed(42)

# Fully connected regressor: four ReLU hidden layers of decreasing width,
# dropout after the first three, and a single linear output unit.
neural_net = Sequential([
    Dense(256, input_dim=X_train_pca.shape[1], activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.15),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(1)
])

neural_net.compile(optimizer='adam', loss='mse')
# validation_split holds back 20% of the training rows from the weight updates.
neural_net.fit(X_train_pca, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2)

# Score on the held-out rows; the square root of the MSE puts the error
# back in the units of `price`.
neural_pred = neural_net.predict(X_test_pca)
neural_mse = mean_squared_error(y_test, neural_pred)
neural_price_error = np.sqrt(neural_mse)
print('----------')
print(neural_price_error)
# Price distribution of the sampled data, shown for scale next to the RMSE.
print(df['price'].describe())

In [None]:
# Completion marker. Under Run All, execution stops at the first failing
# cell, so reaching this line already means the earlier cells ran. The
# previous bare `except:` around this print could not observe those
# failures and would also have swallowed KeyboardInterrupt/SystemExit.
print('Script executed successfully')