In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the cleaned listings dataset and preview its first rows.
data_path = "cleaned_data_model.json"
df = pd.read_json(data_path)
df.head(5)

Unnamed: 0,Fiyat,Brüt M2,Bulunduğu Kat,Bina Yaşı,İlçe,Mahalle,Oda
0,5390000,85.0,-1.0,5,Maltepe,Altıntepe,2
1,5690000,65.0,3.0,4,Kartal,Orhantepe,1
2,1850000,112.0,2.0,0,Esenyurt,Çınar,3
3,895000,104.0,-1.0,5,Esenyurt,Yeşilkent,2
4,3250000,170.0,2.0,0,Beylikdüzü,Adnan Kahveci,3


Make sure there are no null values. Look at the statistics of each column.

In [3]:
# Per-column count of missing values, followed by summary statistics
# of the numeric columns.
null_counts = df.isna().sum()
print(null_counts)
numeric_summary = df.describe()
print(numeric_summary)

Fiyat            0
Brüt M2          0
Bulunduğu Kat    0
Bina Yaşı        0
İlçe             0
Mahalle          0
Oda              0
dtype: int64
              Fiyat       Brüt M2  Bulunduğu Kat     Bina Yaşı           Oda
count  2.377000e+04  23770.000000   23770.000000  23770.000000  23770.000000
mean   9.251593e+06    128.668915       3.314703     12.684771      2.589230
std    1.184127e+07     56.595839       3.854961     14.023572      0.967068
min    3.200000e+05     15.000000      -3.000000      0.000000      1.000000
25%    3.250000e+06     90.000000       1.000000      1.000000      2.000000
50%    5.450000e+06    115.000000       2.500000      6.000000      2.000000
75%    1.060000e+07    150.000000       4.000000     25.000000      3.000000
max    2.000000e+08    600.000000      45.000000    120.000000      8.000000


### Log Transformation for the price column

In [4]:
# Replace the raw price with log(1 + price). The exact inverse of this
# transform is np.expm1, not np.exp.
# NOTE: this cell overwrites the column in place, so running it twice
# applies the transform twice.
log_price = np.log1p(df["Fiyat"])
df["Fiyat"] = log_price
df.head()

Unnamed: 0,Fiyat,Brüt M2,Bulunduğu Kat,Bina Yaşı,İlçe,Mahalle,Oda
0,15.500056,85.0,-1.0,5,Maltepe,Altıntepe,2
1,15.554221,65.0,3.0,4,Kartal,Orhantepe,1
2,14.430697,112.0,2.0,0,Esenyurt,Çınar,3
3,13.70458,104.0,-1.0,5,Esenyurt,Yeşilkent,2
4,14.994166,170.0,2.0,0,Beylikdüzü,Adnan Kahveci,3


Drop the neighborhood column (`Mahalle`) to simplify the feature set.

In [5]:
# Discard the neighborhood column; the district column is kept.
df = df.drop(columns=["Mahalle"])
df.head()

Unnamed: 0,Fiyat,Brüt M2,Bulunduğu Kat,Bina Yaşı,İlçe,Oda
0,15.500056,85.0,-1.0,5,Maltepe,2
1,15.554221,65.0,3.0,4,Kartal,1
2,14.430697,112.0,2.0,0,Esenyurt,3
3,13.70458,104.0,-1.0,5,Esenyurt,2
4,14.994166,170.0,2.0,0,Beylikdüzü,3


Define `X` as the feature matrix (every column except the price) and `y` as the target (the log-transformed price, `Fiyat`).

In [6]:
# y is the target (the price column); X holds every remaining column
# as features.
y = df["Fiyat"]
X = df.drop("Fiyat", axis=1)

Separate numerical and categorical features. Carry out preprocessing: numerical features scaled with the standard scaler while the categorical features are encoded with one-hot-encoding.

In [7]:
# Columns that are standardized vs. one-hot encoded.
num_features = ["Brüt M2", "Bulunduğu Kat", "Bina Yaşı", "Oda"]
cat_features = ["İlçe"]

preprocessor = ColumnTransformer([
    # Zero mean / unit variance for the numeric columns.
    ("num", StandardScaler(), num_features),
    # handle_unknown="ignore": a district that was not present when the
    # encoder was fitted (e.g. absent from a CV training fold, or a new
    # district at prediction time) is encoded as all zeros instead of
    # raising an error during transform.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
])

Initialize the stratified K-fold object with 5 splits. Shuffle the data before splitting and fix the random seed so the splits are reproducible.
Create a pipeline that chains the preprocessor and the regressor.

In [8]:
# Five shuffled folds; the fixed seed makes the fold assignment repeatable.
skf = StratifiedKFold(5, shuffle=True, random_state=42)

# Preprocessing and the linear model are chained so that the scaler and
# encoder are fitted only on whatever data is passed to pipeline.fit.
model_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(model_steps)

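Create 5 quantile-based price bins (`q=5`) to use as stratification labels. Results obtained with 10 bins are listed after the output below for comparison.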

In [9]:
# Quantile-bin the target into 5 groups; labels=False returns integer bin
# codes, which serve as the class labels for StratifiedKFold.
price_bins = pd.qcut(y, 5, labels=False)

Train the model and calculate Mean Absolute Error and Normalized Absolute Error.

In [10]:
# Per-fold error metrics collected across the cross-validation loop
mae_scores = []
nae_scores = []

# Folds are stratified on the binned price (price_bins), not on y itself
for train_idx, val_idx in skf.split(X, price_bins):
    # Slice features and target with the positional indices of this fold
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the pipeline on the training fold and predict the validation fold
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    # MAE in the log scale (the scale the model is trained on)
    mae_log = mean_absolute_error(y_val, y_pred)

    # Convert back to the original price scale. The target was built with
    # np.log1p, so np.expm1 is the exact inverse (np.exp would be off by 1).
    y_val_original = np.expm1(y_val)
    y_pred_original = np.expm1(y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred_original)

    # Normalized Absolute Error (NAE): MAE divided by the price range of the
    # validation fold; undefined (NaN) when that range is zero
    y_val_range = y_val_original.max() - y_val_original.min()
    nae = mae_original / y_val_range if y_val_range > 0 else np.nan
    nae_scores.append(nae)

    print(f"Fold MAE (log scale): {mae_log:.4f}, MAE (original scale): {mae_original:.2f}, NAE: {nae:.4f}")
    mae_scores.append(mae_log)

# Final mean values of MAE and NAE. nanmean skips folds whose NAE was
# undefined, so a single zero-range fold does not turn the summary into NaN.
print(f"Mean MAE (log scale): {np.mean(mae_scores):.4f}")
print(f"Mean NAE: {np.nanmean(nae_scores):.4f}")


Fold MAE (log scale): 0.3123, MAE (original scale): 3292811.79, NAE: 0.0200
Fold MAE (log scale): 0.3087, MAE (original scale): 3182976.99, NAE: 0.0161
Fold MAE (log scale): 0.3094, MAE (original scale): 3244546.95, NAE: 0.0181
Fold MAE (log scale): 0.3160, MAE (original scale): 3281883.14, NAE: 0.0164
Fold MAE (log scale): 0.3101, MAE (original scale): 3333814.51, NAE: 0.0181
Mean MAE (log scale): 0.3113
Mean NAE: 0.0177


For comparison, the results of the same run with 10 price bins (`q=10`):
 - Fold MAE (log scale): 0.3094, MAE (original scale): 3221141.31, NAE: 0.0179
 - Fold MAE (log scale): 0.3115, MAE (original scale): 3422145.23, NAE: 0.0171
 - Fold MAE (log scale): 0.3156, MAE (original scale): 3246152.62, NAE: 0.0164
 - Fold MAE (log scale): 0.3117, MAE (original scale): 3179684.70, NAE: 0.0236
 - Fold MAE (log scale): 0.3081, MAE (original scale): 3261997.70, NAE: 0.0192

Mean MAE (log scale): 0.3113  
Mean NAE: 0.0189

In [23]:
# Example listings to price; column names must match the training features
new_data = pd.DataFrame({
    "Brüt M2": [40, 100, 150],
    "Bulunduğu Kat": [1, -2, 5],
    "Bina Yaşı": [10, 10, 60],
    "Oda": [1, 1, 3],
    "İlçe": ["Üsküdar", "Üsküdar", "Fatih"]  # Example districts
})

# Refit on the full dataset before predicting the new listings
pipeline.fit(X, y)
predictions = pipeline.predict(new_data)
# The target was transformed with np.log1p, so invert it with np.expm1
# (np.exp would overstate every price by exactly 1)
predictions_original_scale = np.expm1(predictions)

for district, prediction in zip(new_data['İlçe'], predictions_original_scale):
    print(f"District: {district} Price: {prediction:.2f}")

District: Üsküdar Price: 3987959.31
District: Üsküdar Price: 5579336.22
District: Fatih Price: 3637496.21
