In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import re

# Load dataset
df = pd.read_csv('./used_cars.csv')

# Drop or fill missing values
df.dropna(subset=['price', 'milage', 'model_year'], inplace=True)
df.fillna('None reported', inplace=True)

# Data cleaning
# Remove non-numeric characters from 'milage' and 'price'
df['milage'] = df['milage'].str.replace('[^\d]', '', regex=True).astype(float)
df['price'] = df['price'].str.replace('[$,]', '', regex=True).astype(float)

# Extract horsepower from the engine column
def extract_hp(engine_str):
    match = re.search(r'(\d+\.\d+|\d+)HP', str(engine_str))
    return float(match.group(1)) if match else 0

df['horsepower'] = df['engine'].apply(extract_hp)

# Create new features
current_year = 2024
df['vehicle_age'] = current_year - df['model_year']
df['accident_flag'] = df['accident'].apply(lambda x: 1 if 'at least 1' in str(x).lower() else 0)

# Identify categorical and numerical features
categorical_features = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'clean_title']
numerical_features = ['milage', 'horsepower', 'vehicle_age', 'accident_flag']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Define features and target
X = df[categorical_features + numerical_features]
y = df['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create SVR pipeline
svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svr', SVR())
])

# Hyperparameter tuning
param_grid = {
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.01, 0.1, 0.5],
    'svr__kernel': ['linear', 'rbf', 'poly']
}

grid_search = GridSearchCV(svr_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_svr = grid_search.best_estimator_

# Cross-validation scores
cv_scores = cross_val_score(best_svr, X_train, y_train, cv=5)

# Model evaluation
y_pred = best_svr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Cross-Validation R2 Score: {cv_scores.mean():.4f}')
print(f'MAE: {mae:.2f}, MSE: {mse:.2f}, R2 Score: {r2:.4f}')

  df['milage'] = df['milage'].str.replace('[^\d]', '', regex=True).astype(float)


Best Parameters: {'svr__C': 10, 'svr__epsilon': 0.01, 'svr__kernel': 'linear'}
Cross-Validation R2 Score: 0.1210
MAE: 26317.95, MSE: 20389553963.02, R2 Score: 0.0024
