# Diamond Price Regression using Classical Machine Learning Algorithms

## Comparison on RandomForestRegressor, DecisionTreeRegressor and Logistic Regression

In [67]:
import os
import random

import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [68]:
seed = 42

np.random.seed(seed)
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [69]:
data = pd.read_csv("data/diamonds.csv")
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [70]:
le_cut = LabelEncoder()
le_color = LabelEncoder()
le_clarity = LabelEncoder()

data["cut"] = le_cut.fit_transform(data["cut"])
data["color"] = le_color.fit_transform(data["color"])
data["clarity"] = le_clarity.fit_transform(data["clarity"])

data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


In [71]:
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data.drop("price", axis=1)), columns=data.columns.drop("price"))
data_scaled["price"] = data["price"].values

data_scaled.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,-1.198168,-0.538099,-0.937163,-0.484264,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129,326
1,-1.240361,0.434949,-0.937163,-1.064117,-1.360738,1.585529,-1.641325,-1.658774,-1.741175,326
2,-1.198168,-1.511147,-0.937163,0.095589,-3.385019,3.375663,-1.498691,-1.457395,-1.741175,327
3,-1.071587,0.434949,1.414272,0.675442,0.454133,0.242928,-1.364971,-1.317305,-1.28772,334
4,-1.029394,-1.511147,2.002131,-0.484264,1.082358,0.242928,-1.240167,-1.212238,-1.117674,335


In [72]:
X = data_scaled.drop("price", axis=1)
y = data_scaled["price"]

In [73]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)
# X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_val.shape, X_test.shape)

(30206, 9) (7552, 9) (16182, 9)


In [74]:
# models = {
#     'Logistic Regression': LogisticRegression(),
#     'Random Forest': RandomForestRegressor(n_estimators=100, criterion='squared_error', random_state=42),
#     'Decision Tree': DecisionTreeRegressor(random_state=42)
# }

# param_grids = {
#     'Logistic Regression': {},
#     'Random Forest': {
#         'n_estimators': [100,], 
#         'max_depth': [None, 10, 20, 30], 
#         'min_samples_split': [2, 5, 10], 
#         'min_samples_leaf': [1, 2, 4]
#     },
#     'Decision Tree': {
#         'max_depth': [None, 10, 20, 30],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4]
#     }
# }

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}

In [None]:
results = {}

for name, model in models.items():
    print(f"{name}:")
    
    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)


    y_test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    y_pred = model.predict(X_test)

    results[name] = {
        'Validation MAE': val_mae,
        'Validation MSE': val_mse,
        'Validation r2': val_r2,
        'Test MAE': test_mae,
        'Test MSE': test_mse,
        'Test r2': test_r2,
    }
    print(f"\t{name} - Validation MAE: {val_mae:.4f}")
    print(f"\t{name} - Validation MSE: {val_mse:.4f}")
    print(f"\t{name} - Validation r2: {val_r2:.4f}")
    print(f"\t{name} - Test MAE: {test_mae:.4f}")
    print(f"\t{name} - Test MSE: {test_mse:.4f}")
    print(f"\t{name} - Test r2: {test_r2:.4f}")

Linear Regression:
	Linear Regression - Validation MAE: 854.8785
	Linear Regression - Validation MSE: 1752655.4536
	Linear Regression - Validation r2: 0.8904
	Linear Regression - Test MAE: 855.2585
	Linear Regression - Test MSE: 1772334.4245
	Linear Regression - Test r2: 0.8864
Random Forest:


In [None]:
best_model = max(results, key=lambda x: results[x]['Test MSE'])
print(f"Best performing model: {best_model}")

Best performing model: Linear Regression
