In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

In [2]:
# Load the pre-cleaned dataset from JSON into a DataFrame and preview the
# first five rows.
df = pd.read_json("cleaned_data_model.json")
df.head()

Unnamed: 0,Fiyat,Brüt M2,Bulunduğu Kat,Bina Yaşı,İlçe,Mahalle,Oda
0,5390000,85.0,-1.0,5,Maltepe,Altıntepe,2
1,5690000,65.0,3.0,4,Kartal,Orhantepe,1
2,1850000,112.0,2.0,0,Esenyurt,Çınar,3
3,895000,104.0,-1.0,5,Esenyurt,Yeşilkent,2
4,3250000,170.0,2.0,0,Beylikdüzü,Adnan Kahveci,3


In [3]:
# Sanity checks before modelling: the number of missing values in each column,
# followed by summary statistics of the numeric columns.
print(df.isnull().sum())
print(df.describe())

Fiyat            0
Brüt M2          0
Bulunduğu Kat    0
Bina Yaşı        0
İlçe             0
Mahalle          0
Oda              0
dtype: int64
              Fiyat       Brüt M2  Bulunduğu Kat     Bina Yaşı           Oda
count  2.377000e+04  23770.000000   23770.000000  23770.000000  23770.000000
mean   9.251593e+06    128.668915       3.314703     12.684771      2.589230
std    1.184127e+07     56.595839       3.854961     14.023572      0.967068
min    3.200000e+05     15.000000      -3.000000      0.000000      1.000000
25%    3.250000e+06     90.000000       1.000000      1.000000      2.000000
50%    5.450000e+06    115.000000       2.500000      6.000000      2.000000
75%    1.060000e+07    150.000000       4.000000     25.000000      3.000000
max    2.000000e+08    600.000000      45.000000    120.000000      8.000000


### Log Transformation for the price column

In [4]:
# Replace the price column (`Fiyat`) with log(1 + price) to compress its
# right-skewed range. np.log1p is inverted by np.expm1, so use np.expm1 to
# map values back to the original currency scale.
# NOTE(review): this overwrites `Fiyat` in place, so re-running this cell
# without reloading `df` applies the log a second time.
df['Fiyat'] = np.log1p(df['Fiyat'])
df.head()

Unnamed: 0,Fiyat,Brüt M2,Bulunduğu Kat,Bina Yaşı,İlçe,Mahalle,Oda
0,15.500056,85.0,-1.0,5,Maltepe,Altıntepe,2
1,15.554221,65.0,3.0,4,Kartal,Orhantepe,1
2,14.430697,112.0,2.0,0,Esenyurt,Çınar,3
3,13.70458,104.0,-1.0,5,Esenyurt,Yeşilkent,2
4,14.994166,170.0,2.0,0,Beylikdüzü,Adnan Kahveci,3


In [5]:
# Remove the `Mahalle` column; `İlçe` stays as the only categorical column.
df = df.drop(columns=["Mahalle"])
df.head()

Unnamed: 0,Fiyat,Brüt M2,Bulunduğu Kat,Bina Yaşı,İlçe,Oda
0,15.500056,85.0,-1.0,5,Maltepe,2
1,15.554221,65.0,3.0,4,Kartal,1
2,14.430697,112.0,2.0,0,Esenyurt,3
3,13.70458,104.0,-1.0,5,Esenyurt,2
4,14.994166,170.0,2.0,0,Beylikdüzü,3


In [6]:
# Column groups consumed by the preprocessing step.
num_features = ["Brüt M2", "Bulunduğu Kat", "Bina Yaşı", "Oda"]
cat_features = ["İlçe"]

# Target is the (log-transformed) price; every other column is a feature.
y = df["Fiyat"]
X = df.drop(columns=["Fiyat"])

# One-hot encode the categorical column (categories unseen during fit are
# encoded as all zeros) and standardise the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", StandardScaler(), num_features),
    ]
)

In [7]:
# Preprocessing followed by a random-forest regressor, wrapped in a single
# estimator so both steps are fitted together on each training split.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(n_estimators=100, random_state=42),
        ),
    ]
)

# Stratified K-Fold setup: StratifiedKFold needs discrete labels, so the
# continuous target is bucketed into 10 quantile bins that serve as the
# stratification key for the 5 shuffled folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
price_bins = pd.qcut(y, q=10, labels=False)

In [8]:
# Cross-validate the pipeline on folds stratified by price bin.
#
# The target `y` holds log1p(price), so values are mapped back to the original
# currency scale with np.expm1 (the exact inverse of np.log1p) before the
# original-scale metrics are computed. Three metrics are collected per fold:
#   * MAE on the log scale,
#   * MAE on the original price scale,
#   * NAE: original-scale MAE divided by the price range of the validation fold.
mae_scores_log = []
mae_scores_original = []
nae_scores = []

for train_idx, val_idx in skf.split(X, price_bins):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred_log = pipeline.predict(X_val)

    # Calculate MAE in log scale
    mae_log = mean_absolute_error(y_val, y_pred_log)
    mae_scores_log.append(mae_log)

    # Calculate MAE in original scale (invert log1p with expm1, not exp,
    # which would leave every value off by +1)
    y_val_original = np.expm1(y_val)
    y_pred_original = np.expm1(y_pred_log)
    mae_original = mean_absolute_error(y_val_original, y_pred_original)
    mae_scores_original.append(mae_original)

    # Calculate Normalized Absolute Error (NAE)
    y_val_range = y_val_original.max() - y_val_original.min()
    if y_val_range > 0:  # Handle cases with zero range
        nae = mae_original / y_val_range
    else:
        nae = 0
        print("Warning: Zero range in y_val_original. NAE set to 0.")
    nae_scores.append(nae)

# Results (averaged over the folds)
print(f"Mean MAE (log scale): {np.mean(mae_scores_log):.4f}")
print(f"Mean MAE (original scale): {np.mean(mae_scores_original):.2f}")
print(f"Mean NAE: {np.mean(nae_scores):.4f}")

Mean MAE (log scale): 0.2594
Mean MAE (original scale): 2582750.22
Mean NAE: 0.0149


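Even though the error seems to be lower than that of the regression models, the predictions come out considerably lower than the real prices. One likely contributor: the model is fit on log prices, and exponentiating a log-scale prediction tends to underestimate the mean price.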

In [9]:
# Hand-written example listings to price with the final model. The frame uses
# the same feature column names the pipeline was configured with.
new_data = pd.DataFrame({
    "Brüt M2": [40, 100, 150],
    "Bulunduğu Kat": [1, -2, 5],
    "Bina Yaşı": [10, 10, 60],
    "Oda": [1, 1, 3],
    "İlçe": ["Üsküdar", "Üsküdar", "Fatih"]  # Example districts
})

# Refit on the full dataset (the pipeline was last fitted on a single CV
# training split), then predict on the log scale.
pipeline.fit(X, y)
predictions = pipeline.predict(new_data)

# The target was transformed with np.log1p, so np.expm1 is the exact inverse;
# np.exp would overstate every price by 1.
predictions_original_scale = np.expm1(predictions)

for district, prediction in zip(new_data['İlçe'], predictions_original_scale):
    print(f"District: {district} Price: {prediction:.2f}")

District: Üsküdar Price: 2872098.95
District: Üsküdar Price: 3675701.85
District: Fatih Price: 5631705.31
