In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv


In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [50]:
data = pd.read_csv("/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv")

In [21]:
data.head()

Unnamed: 0.1,Unnamed: 0,price,brand,model,year,title_status,mileage,color,vin,lot,state,country,condition
0,0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,jtezu11f88k007763,159348797,new jersey,usa,10 days left
1,1,2899,ford,se,2011,clean vehicle,190552.0,silver,2fmdk3gc4bbb02217,166951262,tennessee,usa,6 days left
2,2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,3c4pdcgg5jt346413,167655728,georgia,usa,2 days left
3,3,25000,ford,door,2014,clean vehicle,64146.0,blue,1ftfw1et4efc23745,167753855,virginia,usa,22 hours left
4,4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,3gcpcrec2jg473991,167763266,florida,usa,22 hours left


In [22]:
data.shape

(2499, 13)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2499 non-null   int64  
 1   price         2499 non-null   int64  
 2   brand         2499 non-null   object 
 3   model         2499 non-null   object 
 4   year          2499 non-null   int64  
 5   title_status  2499 non-null   object 
 6   mileage       2499 non-null   float64
 7   color         2499 non-null   object 
 8   vin           2499 non-null   object 
 9   lot           2499 non-null   int64  
 10  state         2499 non-null   object 
 11  country       2499 non-null   object 
 12  condition     2499 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 253.9+ KB


# **Preprocessing**

In [40]:
def binary_encode(df, columns_with_positive_values):
    df = df.copy()
    for column, positive_values in columns_with_positive_values:
        df[column] = df[column].apply(lambda x:1 if x == positive_values else 0)
    return df

def onehot_encode(df, columns_with_prefixes):
    df = df.copy()
    for column, prefix in columns_with_prefixes:
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [41]:
def preprocess_inputs(df):
    df = df.copy()

    # Drop unnecessary columns
    df = df.drop(['Unnamed: 0', 'vin', 'lot'],axis=1)

    # Binary encode the title_status and country columns
    df = binary_encode(
        df,
        columns_with_positive_values=[
            ('title_status', 'salvage insurance'),
            ('country', ' canada')
        ]
    )

    # One-hot encode the brand, model, color, state, and condition columns
    df = onehot_encode(
        df,
        columns_with_prefixes=[
            ('brand', 'br'),
            ('model', 'md'),
            ('color', 'cl'),
            ('state', 'st'),
            ('condition', 'cd')
        ]
    )

    # Fixes LightGBM error
    df = df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    
    # Split df into X and y
    y = df['price'].copy()
    X = df.drop('price', axis=1).copy()
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    # Scale X with a standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    
    return X_train, X_test, y_train, y_test

In [44]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [49]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((1749, 299), (750, 299), (1749,), (750,))

In [51]:
X_train

Unnamed: 0,year,title_status,mileage,country,br_acura,br_audi,br_bmw,br_buick,br_cadillac,br_chevrolet,...,cd_5hoursleft,cd_53minutes,cd_6daysleft,cd_6hoursleft,cd_7daysleft,cd_7hoursleft,cd_8daysleft,cd_9daysleft,cd_9minutes,cd_ListingExpired
0,-1.288806,-0.260318,1.512784,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
1,0.372505,-0.260318,-0.522124,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
2,0.649390,-0.260318,-0.394639,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
3,-1.011921,-0.260318,0.592934,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,2.744396,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
4,0.649390,-0.260318,-0.625620,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744,0.649390,-0.260318,-0.285273,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
1745,0.649390,-0.260318,-0.218108,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
1746,-0.458151,-0.260318,0.097645,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829
1747,0.649390,-0.260318,-0.392006,-0.053544,-0.041451,-0.023918,-0.075832,-0.083117,-0.067787,-0.364379,...,-0.086536,-0.023918,-0.142899,-0.058671,-0.136518,-0.041451,-0.191707,-0.142899,-0.041451,-0.089829


In [52]:
y_train

1351    26400
903     28700
2049    47500
798     11400
1360    29000
        ...  
1147    15900
2154    25400
1766    13000
1122    15800
1346     8300
Name: price, Length: 1749, dtype: int64

# Traning

In [71]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [72]:
Knn = KNeighborsRegressor()
Knn.fit(X_train, y_train)

In [73]:
mlp = MLPRegressor()
mlp.fit(X_train, y_train)



In [74]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

In [75]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [76]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)

In [77]:
xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)

In [78]:
gbm = LGBMRegressor()
gbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 495
[LightGBM] [Info] Number of data points in the train set: 1749, number of used features: 74
[LightGBM] [Info] Start training from score 18689.759291


In [79]:
cat = CatBoostRegressor(verbose=0)
cat.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x78b4a55358d0>

In [102]:
models = {
    "Linear Regression": lr,
    "KNeighborsRegressor" : Knn,
    "MLPRegressor" : mlp,
    "DecisionTreeRegressor" : dt,
    "RandomForestRegressor": rf,
    "GradientBoostingRegressor" : gb,
    "XGBoostRegressor": xgboost,
    "LGBMRegressor": gbm,
    "CatBoostRegressor": cat
}

In [103]:
def evaluate_models(models, X_test, y_test):
    results = []

    for name, model in models.items():
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        results.append({
            "Model": name,
            "MAE": mae,
            "RMSE": rmse,
            "R2": r2
        })

    results_df = pd.DataFrame(results)
    return results_df.sort_values(by="RMSE")  # sort best (lowest RMSE) first


In [104]:
results_df = evaluate_models(models, X_test, y_test)
print(results_df)

                       Model           MAE          RMSE            R2
8          CatBoostRegressor  4.434426e+03  7.028589e+03  6.797067e-01
6           XGBoostRegressor  4.580507e+03  7.296647e+03  6.548099e-01
4      RandomForestRegressor  4.494252e+03  7.514389e+03  6.339006e-01
7              LGBMRegressor  5.199557e+03  7.928982e+03  5.923884e-01
5  GradientBoostingRegressor  5.510184e+03  8.122552e+03  5.722434e-01
1        KNeighborsRegressor  5.810129e+03  9.136563e+03  4.587756e-01
3      DecisionTreeRegressor  5.527700e+03  9.299519e+03  4.392973e-01
2               MLPRegressor  1.496249e+04  1.826414e+04 -1.162769e+00
0          Linear Regression  1.146821e+17  1.178388e+18 -9.003020e+27
