In [1]:
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd


In [8]:
# Load dataset
df =pd.read_csv("D:/mohammed/UNI/Dynamic Pricing for Used Cars in Jordan/Electric_cars_feature_engineered.csv")

In [9]:
df.columns

Index(['Condition', 'Car Make', 'is_luxury', 'Model', 'Model_encoded', 'Trim',
       'Trim_encoded', 'Year', 'car_age', 'car_age_sqrt',
       'Kilometers Numerical', 'age_km_interaction', 'Body Type',
       'Number of Seats', 'Fuel', 'Transmission', 'Engine Size (cc)',
       'Regional Specs', 'Car License', 'Insurance', 'Car Customs',
       'Body Condition', 'body_condition_encoded', 'Paint',
       'paint_condition_encoded', 'interior_steering_wheel_controls',
       'interior_airbags', 'technology_cruise_control',
       'exterior_rear_sensors', 'exterior_keyless_entry',
       'technology_tyre_pressure_monitoring', 'exterior_front_sensors',
       'interior_electric_seat_control', 'technology_traction_control',
       'technology_voice_control', 'technology_blind_spot_alert',
       'technology_forward_collision_alert', 'technology_lane_departure_alert',
       'technology_navigation_system_/_maps', 'Interior_Options_Count',
       'Interior_Options_Count_cuberoot', 'Exterior_O

In [10]:
df.drop(columns = ['Fuel','Transmission','Engine Size (cc)','tech_features_count', 'Model_encoded','Trim_encoded'],inplace = True)

In [11]:

# Identify categorical features
cat_features = df.select_dtypes(include=['object']).columns.tolist()

# Convert NaNs in categorical features to string "NaN"
df[cat_features] = df[cat_features].astype(str).fillna("NaN")

# Identify target and features
target_column = "log_price"  # Change if necessary
X = df.drop(columns=[target_column])
y = df[target_column]

# Initialize CatBoostRegressor
model = CatBoostRegressor(
    iterations=500,  # Increased from 26
    learning_rate=0.05,  # Reduced from 0.1
    depth=8,  # Increased from 6
    l2_leaf_reg=3,  # Added regularization
    border_count=128,  # For better splits
    random_strength=1,  # Helps with generalization
    od_wait=50,  # Increased early stopping patience
    od_type="Iter",
    loss_function='RMSE',  # Explicitly set
    eval_metric='RMSE',
    verbose=100,  # To see training progress
    cat_features=cat_features,
    
)

# Perform 10-fold cross-validation
rmse_scores = -cross_val_score(model, X, y, cv=10, scoring="neg_root_mean_squared_error")
mae_scores = -cross_val_score(model, X, y, cv=10, scoring="neg_mean_absolute_error")
r2_scores = cross_val_score(model, X, y, cv=10, scoring="r2")

# Custom accuracy metric based on MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Train model to get predictions
model.fit(X, y)
y_pred = model.predict(X)
mape = mean_absolute_percentage_error(y, y_pred)
accuracy = 100 - mape  # Accuracy as (1 - MAPE) * 100

# Print evaluation metrics
print(f"Mean RMSE: {rmse_scores.mean()}")
print(f"Mean MAE: {mae_scores.mean()}")
print(f"Mean R-Squared: {r2_scores.mean()}")
print(f"Accuracy (1 - MAPE): {accuracy:}%")

0:	learn: 0.5367538	total: 69.3ms	remaining: 34.6s
100:	learn: 0.2112693	total: 7.45s	remaining: 29.4s
200:	learn: 0.1818450	total: 15.1s	remaining: 22.5s
300:	learn: 0.1642879	total: 24s	remaining: 15.9s
400:	learn: 0.1446198	total: 31.9s	remaining: 7.89s
499:	learn: 0.1265095	total: 40.1s	remaining: 0us
0:	learn: 0.5364979	total: 71.2ms	remaining: 35.5s
100:	learn: 0.2138092	total: 7.38s	remaining: 29.1s
200:	learn: 0.1820372	total: 15.3s	remaining: 22.7s
300:	learn: 0.1633566	total: 23.3s	remaining: 15.4s
400:	learn: 0.1489806	total: 30.8s	remaining: 7.6s
499:	learn: 0.1343137	total: 38.2s	remaining: 0us
0:	learn: 0.5359528	total: 69.5ms	remaining: 34.7s
100:	learn: 0.2124121	total: 7.21s	remaining: 28.5s
200:	learn: 0.1811265	total: 14.3s	remaining: 21.2s
300:	learn: 0.1610481	total: 21.9s	remaining: 14.5s
400:	learn: 0.1433589	total: 29.1s	remaining: 7.18s
499:	learn: 0.1229049	total: 37.1s	remaining: 0us
0:	learn: 0.5334266	total: 55.9ms	remaining: 27.9s
100:	learn: 0.2159297	tot

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Get the actual prices (original scale)
if 'price' in df.columns:
    y_true_price = df['price'].values
else:
    y_true_price = np.exp(y)  # Convert from log_price to price

# Get predicted log prices and convert to original scale
y_pred_log = model.predict(X)
y_pred_price = np.exp(y_pred_log)  # Convert predictions back to original scale

# Calculate metrics on original price scale
rmse_price = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
mae_price = mean_absolute_error(y_true_price, y_pred_price)
r2_price = r2_score(y_true_price, y_pred_price)

# Custom MAPE function for original prices
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_price = mean_absolute_percentage_error(y_true_price, y_pred_price)
accuracy_price = 100 - mape_price

# Print evaluation metrics on original price scale
print("\nMetrics on Original Price Scale:")
print(f"RMSE (Price): {rmse_price}")
print(f"MAE (Price): {mae_price}")
print(f"R-Squared (Price): {r2_price}")
print(f"Accuracy (1 - MAPE) (Price): {accuracy_price:.2f}%")

# Keep your existing log-scale metrics for comparison
print("\nMetrics on Log Price Scale:")
print(f"Mean RMSE (log): {rmse_scores.mean()}")
print(f"Mean MAE (log): {mae_scores.mean()}")
print(f"Mean R-Squared (log): {r2_scores.mean()}")
print(f"Accuracy (1 - MAPE) (log): {accuracy:.2f}%")


Metrics on Original Price Scale:
RMSE (Price): 4113.499473569396
MAE (Price): 2087.8470832234457
R-Squared (Price): 0.8828273717415118
Accuracy (1 - MAPE) (Price): 89.57%

Metrics on Log Price Scale:
Mean RMSE (log): 0.24101611981394783
Mean MAE (log): 0.14970962866265297
Mean R-Squared (log): 0.8048475522569131
Accuracy (1 - MAPE) (log): 98.97%


In [14]:
model.save_model("D:/mohammed/UNI/Dynamic Pricing for Used Cars in Jordan/models/electric.cbm",
                             format="cbm",
                             export_parameters=None,
                             pool=None)

In [15]:
# After training your model
feature_names = model.feature_names_
feature_names

['Condition',
 'Car Make',
 'is_luxury',
 'Model',
 'Trim',
 'Year',
 'car_age',
 'car_age_sqrt',
 'Kilometers Numerical',
 'age_km_interaction',
 'Body Type',
 'Number of Seats',
 'Regional Specs',
 'Car License',
 'Insurance',
 'Car Customs',
 'Body Condition',
 'body_condition_encoded',
 'Paint',
 'paint_condition_encoded',
 'interior_steering_wheel_controls',
 'interior_airbags',
 'technology_cruise_control',
 'exterior_rear_sensors',
 'exterior_keyless_entry',
 'technology_tyre_pressure_monitoring',
 'exterior_front_sensors',
 'interior_electric_seat_control',
 'technology_traction_control',
 'technology_voice_control',
 'technology_blind_spot_alert',
 'technology_forward_collision_alert',
 'technology_lane_departure_alert',
 'technology_navigation_system_/_maps',
 'Interior_Options_Count',
 'Interior_Options_Count_cuberoot',
 'Exterior_Options_Count',
 'Exterior_Options_Count_cuberoot',
 'Technology_Options_Count',
 'Technology_Options_Count_cuberoot',
 'has_advanced_tech',
 'Tot

In [16]:
cat_features

['Condition',
 'Car Make',
 'Model',
 'Trim',
 'Body Type',
 'Regional Specs',
 'Car License',
 'Insurance',
 'Car Customs',
 'Body Condition',
 'Paint',
 'size_class']