In [1]:
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Load dataset
df =pd.read_csv("D:/mohammed/UNI/Dynamic Pricing for Used Cars in Jordan/Non_Electric_cars_feature_engineered.csv")


In [2]:
df.columns

Index(['Condition', 'Car Make', 'is_luxury', 'Model', 'Trim', 'Year',
       'car_age', 'car_age_sqrt', 'Kilometers Numerical', 'age_km_interaction',
       'Body Type', 'Number of Seats', 'Fuel', 'Transmission',
       'Engine Size (cc)', 'Engine Size (cc)_cuberoot', 'Regional Specs',
       'is_premium_region', 'Car License', 'Insurance', 'Car Customs',
       'Body Condition', 'body_condition_encoded', 'Paint',
       'paint_condition_encoded', 'interior_steering_wheel_controls',
       'interior_airbags', 'technology_cruise_control',
       'exterior_rear_sensors', 'exterior_keyless_entry',
       'technology_tyre_pressure_monitoring', 'exterior_front_sensors',
       'interior_electric_seat_control', 'technology_traction_control',
       'technology_voice_control', 'technology_blind_spot_alert',
       'technology_forward_collision_alert', 'technology_lane_departure_alert',
       'technology_navigation_system_/_maps', 'Interior_Options_Count',
       'Interior_Options_Count_cuber

In [3]:
df.drop(columns = ['is_premium_region','tech_features_count','price_segment'], inplace = True)

In [4]:

# Identify categorical features
cat_features = df.select_dtypes(include=['object']).columns.tolist()

# Convert NaNs in categorical features to string "NaN"
df[cat_features] = df[cat_features].astype(str).fillna("NaN")

# Identify target and features
target_column = "log_price"  # Change if necessary
X = df.drop(columns=[target_column])
y = df[target_column]

# Initialize CatBoostRegressor
model = CatBoostRegressor(
    iterations=500,  # Increased from 26
    learning_rate=0.05,  # Reduced from 0.1
    depth=8,  # Increased from 6
    l2_leaf_reg=3,  # Added regularization
    border_count=128,  # For better splits
    random_strength=1,  # Helps with generalization
    od_wait=50,  # Increased early stopping patience
    od_type="Iter",
    loss_function='RMSE',  # Explicitly set
    eval_metric='RMSE',
    verbose=100,  # To see training progress
    cat_features=cat_features,
    
)

# Perform 10-fold cross-validation
rmse_scores = -cross_val_score(model, X, y, cv=10, scoring="neg_root_mean_squared_error")
mae_scores = -cross_val_score(model, X, y, cv=10, scoring="neg_mean_absolute_error")
r2_scores = cross_val_score(model, X, y, cv=10, scoring="r2")

# Custom accuracy metric based on MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Train model to get predictions
model.fit(X, y)
y_pred = model.predict(X)
mape = mean_absolute_percentage_error(y, y_pred)
accuracy = 100 - mape  # Accuracy as (1 - MAPE) * 100

# Print evaluation metrics
print(f"Mean RMSE: {rmse_scores.mean()}")
print(f"Mean MAE: {mae_scores.mean()}")
print(f"Mean R-Squared: {r2_scores.mean()}")
print(f"Accuracy (1 - MAPE): {accuracy:}%")

0:	learn: 0.8895741	total: 376ms	remaining: 3m 7s
100:	learn: 0.2679169	total: 20.2s	remaining: 1m 19s
200:	learn: 0.2424630	total: 40.8s	remaining: 1m
300:	learn: 0.2262619	total: 1m 3s	remaining: 41.8s
400:	learn: 0.2136459	total: 1m 23s	remaining: 20.7s
499:	learn: 0.2030069	total: 1m 47s	remaining: 0us
0:	learn: 0.8897366	total: 191ms	remaining: 1m 35s
100:	learn: 0.2619799	total: 21.2s	remaining: 1m 23s
200:	learn: 0.2351816	total: 41.7s	remaining: 1m 2s
300:	learn: 0.2180840	total: 1m 2s	remaining: 41s
400:	learn: 0.2056255	total: 1m 24s	remaining: 20.8s
499:	learn: 0.1954586	total: 1m 45s	remaining: 0us
0:	learn: 0.8914516	total: 182ms	remaining: 1m 31s
100:	learn: 0.2616138	total: 21.5s	remaining: 1m 25s
200:	learn: 0.2345918	total: 41.9s	remaining: 1m 2s
300:	learn: 0.2163440	total: 1m 2s	remaining: 41s
400:	learn: 0.2045570	total: 1m 22s	remaining: 20.4s
499:	learn: 0.1928555	total: 1m 42s	remaining: 0us
0:	learn: 0.8906320	total: 199ms	remaining: 1m 39s
100:	learn: 0.2643777

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Get the actual prices (original scale)
if 'price' in df.columns:
    y_true_price = df['price'].values
else:
    y_true_price = np.exp(y)  # Convert from log_price to price

# Get predicted log prices and convert to original scale
y_pred_log = model.predict(X)
y_pred_price = np.exp(y_pred_log)  # Convert predictions back to original scale

# Calculate metrics on original price scale
rmse_price = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
mae_price = mean_absolute_error(y_true_price, y_pred_price)
r2_price = r2_score(y_true_price, y_pred_price)

# Custom MAPE function for original prices
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_price = mean_absolute_percentage_error(y_true_price, y_pred_price)
accuracy_price = 100 - mape_price

# Print evaluation metrics on original price scale
print("\nMetrics on Original Price Scale:")
print(f"RMSE (Price): {rmse_price}")
print(f"MAE (Price): {mae_price}")
print(f"R-Squared (Price): {r2_price}")
print(f"Accuracy (1 - MAPE) (Price): {accuracy_price:.2f}%")

# Keep your existing log-scale metrics for comparison
print("\nMetrics on Log Price Scale:")
print(f"Mean RMSE (log): {rmse_scores.mean()}")
print(f"Mean MAE (log): {mae_scores.mean()}")
print(f"Mean R-Squared (log): {r2_scores.mean()}")
print(f"Accuracy (1 - MAPE) (log): {accuracy:.2f}%")


Metrics on Original Price Scale:
RMSE (Price): 4921.649948608153
MAE (Price): 1899.3300852818836
R-Squared (Price): 0.8831994152681218
Accuracy (1 - MAPE) (Price): 84.36%

Metrics on Log Price Scale:
Mean RMSE (log): 0.2621108971849719
Mean MAE (log): 0.17880206490719558
Mean R-Squared (log): 0.9171178712251752
Accuracy (1 - MAPE) (log): 98.30%


In [28]:
model.save_model("D:/mohammed/UNI/Dynamic Pricing for Used Cars in Jordan/models/non_electric.cbm",
                             format="cbm",
                             export_parameters=None,
                             pool=None)


In [26]:
# After training your model
feature_names = model.feature_names_
feature_names

['Condition',
 'Car Make',
 'is_luxury',
 'Model',
 'Trim',
 'Year',
 'car_age',
 'car_age_sqrt',
 'Kilometers Numerical',
 'age_km_interaction',
 'Body Type',
 'Number of Seats',
 'Fuel',
 'Transmission',
 'Engine Size (cc)',
 'Engine Size (cc)_cuberoot',
 'Regional Specs',
 'Car License',
 'Insurance',
 'Car Customs',
 'Body Condition',
 'body_condition_encoded',
 'Paint',
 'paint_condition_encoded',
 'interior_steering_wheel_controls',
 'interior_airbags',
 'technology_cruise_control',
 'exterior_rear_sensors',
 'exterior_keyless_entry',
 'technology_tyre_pressure_monitoring',
 'exterior_front_sensors',
 'interior_electric_seat_control',
 'technology_traction_control',
 'technology_voice_control',
 'technology_blind_spot_alert',
 'technology_forward_collision_alert',
 'technology_lane_departure_alert',
 'technology_navigation_system_/_maps',
 'Interior_Options_Count',
 'Interior_Options_Count_cuberoot',
 'Exterior_Options_Count',
 'Exterior_Options_Count_cuberoot',
 'Technology_Opti

In [27]:
cat_features

['Condition',
 'Car Make',
 'Model',
 'Trim',
 'Body Type',
 'Fuel',
 'Transmission',
 'Regional Specs',
 'Car License',
 'Insurance',
 'Car Customs',
 'Body Condition',
 'Paint',
 'size_class']

In [9]:
car_data = {
    "Condition": "Used",  # string or categorical
    "Car Make": "Honda",  # string
    "is_luxury": 0,  # boolean (Note: Typo? Should it be "is_luxury"?)
    "Model": "Civic",  # string
    "Trim": "EXi",  # string
    "Year": 2006,  # integer
    "car_age": 19,  # numerical
    "car_age_sqrt": 4.358898944,  # numerical
    "Kilometers Numerical": 14583.71247,  # numerical
    "age_km_interaction": 277090.5369,  # numerical
    "Body Type": "Sedan",  # categorical
    "Number of Seats": 5,  # integer
    "Fuel": "Hybrid",  # categorical
    "Transmission": "Automatic",  # categorical
    "Engine Size (cc)": 1600,  # numerical
    "Engine Size (cc)_cuberoot": 11.44459805,  # numerical
    "Regional Specs": "Japanese Specs",  # categorical
    "Car License": "Licensed",  # categorical
    "Insurance": "Compulsory Insurance",  # categorical
    "Car Customs": "With Customs",  # categorical
    "Body Condition": "Excellent with no defects",  # categorical
    "body_condition_encoded": 4,  # numerical
    "Paint": "Total repaint",  # categorical
    "paint_condition_encoded": 1,  # numerical
    "interior_steering_wheel_controls": 1,  # boolean
    "interior_airbags": 1,  # boolean
    "technology_cruise_control": 1,  # boolean
    "exterior_rear_sensors": 0,  # boolean
    "exterior_keyless_entry": 1,  # boolean
    "technology_tyre_pressure_monitoring": 1,  # boolean
    "exterior_front_sensors": 0,  # boolean
    "interior_electric_seat_control": 0,  # boolean
    "technology_traction_control": 1,  # boolean
    "technology_voice_control": 0,  # boolean
    "technology_blind_spot_alert": 0,  # boolean
    "technology_forward_collision_alert": 0,  # boolean
    "technology_lane_departure_alert": 0,  # boolean
    "technology_navigation_system_/_maps": 0,  # boolean
    "Interior_Options_Count": 7,  # integer (Note: Typo? Should it be "Interior"?)
    "Interior_Options_Count_cuberoot": 1.8171205928,  # numerical
    "Exterior_Options_Count": 4,  # integer
    "Exterior_Options_Count_cuberoot": 1.587401052,  # numerical
    "Technology_Options_Count": 5,  # integer
    "Technology_Options_Count_cuberoot": 1.7099759467,  # numerical
    "has_advanced_tech": 0,  # boolean
    "Total_Options_Count": 16,  # integer
    "Total_Options_Count_cuberoot": 2.5198420998,  # numerical
    "weighted_options": 3.4,  # numerical
    "maintenance_score": 5,  # numerical
    "excellent_maintenance": 0,  # boolean
    "size_class": "medium"  # categorical
}

In [10]:
car_df = pd.DataFrame([car_data])
# Make prediction
prediction = model.predict(car_df)
print(f"Predicted price: {np.exp(prediction[0]):.2f}")  # Assuming log_price was target

Predicted price: 5865.53


In [21]:
car2_data = {
    "Condition": "Used",  # string or categorical
    "Car Make": "Mitsubishi",  # string
    "is_luxury": 0,  # boolean (Note: Typo? Should it be "is_luxury"?)
    "Model": "Lancer",  # string
    "Trim": "GT",  # string
    "Year": 2017,  # integer
    "car_age": 8,  # numerical
    "car_age_sqrt": 2,  # numerical
    "Kilometers Numerical": 75000,  # numerical
    "age_km_interaction": 376696.6307,  # numerical
    "Body Type": "Sedan",  # categorical
    "Number of Seats": 5,  # integer
    "Fuel": "Gasoline",  # categorical
    "Transmission": "Automatic",  # categorical
    "Engine Size (cc)": 1600,  # numerical
    "Engine Size (cc)_cuberoot": 10.5992104989,  # numerical
    "Regional Specs": "Japanese Specs",  # categorical
    "Car License": "Licensed",  # categorical
    "Insurance": "Compulsory Insurance",  # categorical
    "Car Customs": "With Customs",  # categorical
    "Body Condition": "Excellent with no defects",  # categorical
    "body_condition_encoded": 4,  # numerical
    "Paint": "Original Paint",  # categorical
    "paint_condition_encoded": 3,  # numerical
    "interior_steering_wheel_controls": 0,  # boolean
    "interior_airbags": 1,  # boolean
    "technology_cruise_control": 0,  # boolean
    "exterior_rear_sensors": 0,  # boolean
    "exterior_keyless_entry": 0,  # boolean
    "technology_tyre_pressure_monitoring": 1,  # boolean
    "exterior_front_sensors": 0,  # boolean
    "interior_electric_seat_control": 0,  # boolean
    "technology_traction_control": 1,  # boolean
    "technology_voice_control": 0,  # boolean
    "technology_blind_spot_alert": 0,  # boolean
    "technology_forward_collision_alert": 0,  # boolean
    "technology_lane_departure_alert": 0,  # boolean
    "technology_navigation_system_/_maps": 0,  # boolean
    "Interior_Options_Count": 5,  # integer (Note: Typo? Should it be "Interior"?)
    "Interior_Options_Count_cuberoot": 1.8171205928,  # numerical
    "Exterior_Options_Count": 5,  # integer
    "Exterior_Options_Count_cuberoot": 1.587401052,  # numerical
    "Technology_Options_Count": 10,  # integer
    "Technology_Options_Count_cuberoot": 2.15443469,  # numerical
    "has_advanced_tech": 0,  # boolean
    "Total_Options_Count": 20,  # integer
    "Total_Options_Count_cuberoot": 2.7198420998,  # numerical
    "weighted_options": 3.9,  # numerical
    "maintenance_score": 5,  # numerical
    "excellent_maintenance": 0,  # boolean
    "size_class": "medium"  # categorical
}

In [22]:
car2_df = pd.DataFrame([car2_data])
# Make prediction
prediction = model.predict(car2_df)
print(f"Predicted price: {np.exp(prediction[0]):.2f}")  # Assuming log_price was target

Predicted price: 10490.97


In [17]:
car3_data = {
    "Condition": "Used",  # string or categorical
    "Car Make": "Toyota",  # string
    "is_luxury": 0,  # boolean (Note: Typo? Should it be "is_luxury"?)
    "Model": "RAV 4",  # string
    "Trim": "Basic",  # string
    "Year": 2020,  # integer
    "car_age": 5,  # numerical
    "car_age_sqrt": 1.7099759467,  # numerical
    "Kilometers Numerical": 66000,  # numerical
    "age_km_interaction": 193658.0784,  # numerical
    "Body Type": "SUV",  # categorical
    "Number of Seats": 5,  # integer
    "Fuel": "Hybrid",  # categorical
    "Transmission": "Automatic",  # categorical
    "Engine Size (cc)": 2500,  # numerical
    "Engine Size (cc)_cuberoot": 13.572088083,  # numerical
    "Regional Specs": "Japanese Specs",  # categorical
    "Car License": "Licensed",  # categorical
    "Insurance": "Compulsory Insurance",  # categorical
    "Car Customs": "With Customs",  # categorical
    "Body Condition": "Excellent with no defects",  # categorical
    "body_condition_encoded": 4,  # numerical
    "Paint": "Original Paint",  # categorical
    "paint_condition_encoded": 3,  # numerical
    "interior_steering_wheel_controls": 1,  # boolean
    "interior_airbags": 1,  # boolean
    "technology_cruise_control": 1,  # boolean
    "exterior_rear_sensors": 1,  # boolean
    "exterior_keyless_entry": 1,  # boolean
    "technology_tyre_pressure_monitoring": 1,  # boolean
    "exterior_front_sensors": 1,  # boolean
    "interior_electric_seat_control": 1,  # boolean
    "technology_traction_control": 1,  # boolean
    "technology_voice_control": 1,  # boolean
    "technology_blind_spot_alert": 1,  # boolean
    "technology_forward_collision_alert": 1,  # boolean
    "technology_lane_departure_alert": 1,  # boolean
    "technology_navigation_system_/_maps": 1,  # boolean
    "Interior_Options_Count": 14,  # integer (Note: Typo? Should it be "Interior"?)
    "Interior_Options_Count_cuberoot": 2.4101422642,  # numerical
    "Exterior_Options_Count": 14,  # integer
    "Exterior_Options_Count_cuberoot": 2.4101422642,  # numerical
    "Technology_Options_Count": 22,  # integer
    "Technology_Options_Count_cuberoot": 2.8020393307,  # numerical
    "has_advanced_tech": 1,  # boolean
    "Total_Options_Count": 50,  # integer
    "Total_Options_Count_cuberoot": 3.6840314986,  # numerical
    "weighted_options": 11.4,  # numerical
    "maintenance_score": 7,  # numerical
    "excellent_maintenance": 1,  # boolean
    "size_class": "large"  # categorical
}

In [18]:
car3_df = pd.DataFrame([car3_data])
# Make prediction
prediction = model.predict(car3_df)
print(f"Predicted price: {np.exp(prediction[0]):.2f}")  # Assuming log_price was target

Predicted price: 29042.31
