In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Import Models
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor



In [142]:
# Load Data
file_path = 'home_price_cleaned_OHE.csv'  # CHANGE THIS to your filename
df = pd.read_csv(file_path)

# Define features and target
# Assuming the target column is named 'Price'. Change if necessary.
target_col = 'Fiyat' 

X = df.drop('Fiyat',axis =1)
y = df['Fiyat']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling (Important for Lasso and MLP)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate the threshold for classification metrics (e.g., Median Price)
binary_threshold = y_train.median()
print(f"Threshold for binary classification metrics: {binary_threshold}")
df.shape
df.info()

Threshold for binary classification metrics: 2300000.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20131 entries, 0 to 20130
Data columns (total 65 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Net_Metrekare                         20131 non-null  int64  
 1   Brüt_Metrekare                        20131 non-null  float64
 2   Oda_Sayısı                            20131 non-null  float64
 3   Fiyat                                 20131 non-null  float64
 4   Binanın_Kat_Sayısı                    20131 non-null  int64  
 5   Banyo_Sayısı                          20131 non-null  float64
 6   Bulunduğu_Kat_1.Kat                   20131 non-null  int64  
 7   Bulunduğu_Kat_10.Kat                  20131 non-null  int64  
 8   Bulunduğu_Kat_11.Kat                  20131 non-null  int64  
 9   Bulunduğu_Kat_12.Kat                  20131 non-null  int64  
 10  Bulunduğu_Kat_13.Kat       

In [143]:
# Global list to store results
results_list = []

# --- Evaluation Function (Using the 3 Metrics) ---
def evaluate_and_log(model_name, tuning_method, model, X_test, y_test):
    # Predict
    y_pred = model.predict(X_test)
    
    # Calculate Metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"--- {model_name} ({tuning_method}) ---")
    print(f"  R² Score: {r2:.4f}")
    print(f"  MAE:      {mae:,.0f}")
    print(f"  RMSE:     {rmse:,.0f}")
    print("-" * 30)
    
    results_list.append({
        'Model': model_name,
        'Tuning': tuning_method,
        'R2': r2,
        'MAE': mae,
        'RMSE': rmse,
        'Best Params': model.best_params_
    })

In [121]:
# --- Lasso: Bayesian Optimization ---
lasso_bayes_params = {
    'alpha': Real(0.001, 10.0, prior='log-uniform'), # Strength of regularization
    'selection': Categorical(['cyclic', 'random'])
}

opt_lasso = BayesSearchCV(
    Lasso(random_state=42),
    lasso_bayes_params,
    n_iter=20, cv=3, random_state=42, n_jobs=-1
)

print("Training Lasso (Bayesian)...")
opt_lasso.fit(X_train, y_train)
evaluate_and_log("Lasso", "Bayesian", opt_lasso, X_test, y_test)

# --- Lasso: Grid Search ---
lasso_grid_params = {
    'alpha': [0.01, 0.1, 1, 5, 10],
    'selection': ['cyclic', 'random']
}

grid_lasso = GridSearchCV(
    Lasso(random_state=42),
    lasso_grid_params,
    cv=3, n_jobs=-1
)

print("Training Lasso (Grid Search)...")
grid_lasso.fit(X_train, y_train)
evaluate_and_log("Lasso", "Grid Search", grid_lasso, X_test, y_test)

Training Lasso (Bayesian)...
--- Lasso (Bayesian) ---
  R² Score: 0.3784
  MAE:      1,323,480
  RMSE:     3,088,880
------------------------------
Training Lasso (Grid Search)...
--- Lasso (Grid Search) ---
  R² Score: 0.3784
  MAE:      1,323,480
  RMSE:     3,088,880
------------------------------


In [146]:
# --- Decision Tree: Bayesian Optimization ---
dt_bayes_params = {
    'max_depth': Integer(3, 30),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'criterion': Categorical(['squared_error', 'absolute_error'])
}

opt_dt = BayesSearchCV(
    DecisionTreeRegressor(random_state=42),
    dt_bayes_params,
    n_iter=20, cv=3, random_state=42, n_jobs=-1
)

print("Training Decision Tree (Bayesian)...")
opt_dt.fit(X_train, y_train)
evaluate_and_log("Decision Tree", "Bayesian", opt_dt, X_test, y_test)

# --- Decision Tree: Grid Search ---
dt_grid_params = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 5]
}

grid_dt = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    dt_grid_params,
    cv=3, n_jobs=-1
)

print("Training Decision Tree (Grid Search)...")
grid_dt.fit(X_train, y_train)
evaluate_and_log("Decision Tree", "Grid Search", grid_dt, X_test, y_test)

Training Decision Tree (Bayesian)...
--- Decision Tree (Bayesian) ---
  R² Score: -0.0003
  MAE:      3,588,550
  RMSE:     68,176,986
------------------------------
Training Decision Tree (Grid Search)...
--- Decision Tree (Grid Search) ---
  R² Score: -6.5697
  MAE:      8,637,488
  RMSE:     187,550,476
------------------------------


In [144]:
# model_pipeline.py
import os
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit, train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# ---------------------------
# Configuration
# ---------------------------
DATA_PATH = "home_price_cleaned_first_encode.csv"  # change if needed
OUTPUT_DIR = "model_output"
SEED = 42
np.random.seed(SEED)
Path(OUTPUT_DIR).mkdir(exist_ok=True)

# ---------------------------
# Utility functions
# ---------------------------
def print_scores(y_true, y_pred, label="Model"):
    # compute on original scale (y_true, y_pred are original-scale)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"--- {label} ---")
    print(f"  R² Score: {r2:.4f}")
    print(f"  MAE:      {mae:,.0f}")
    print(f"  RMSE:     {rmse:,.0f}")
    print("------------------------------")
    return {"r2": r2, "mae": mae, "rmse": rmse}

def evaluate_and_report(model, X_test, y_test_log, scaler=None, label="Model"):
    # model predicts log1p prices (we train on log1p). Convert back with expm1.
    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_true = np.expm1(y_test_log)
    return print_scores(y_true, y_pred, label=label)

# ---------------------------
# Load data
# ---------------------------
df = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Quick check: Fiyat must exist
if "Fiyat" not in df.columns:
    raise ValueError("Fiyat column not found. Rename your price column to 'Fiyat'.")

# ---------------------------
# Ensure Fiyat numeric (clean if necessary)
# ---------------------------
# If Fiyat is string with thousand separators or currency, try to coerce:
if df["Fiyat"].dtype == object:
    df["Fiyat"] = (
        df["Fiyat"]
        .astype(str)
        .str.replace(r"[^\d\-\.]", "", regex=True)  # remove non-numeric chars (keeps minus and dot)
        .replace("", np.nan)
    )
df["Fiyat"] = pd.to_numeric(df["Fiyat"], errors="coerce")

print("Fiyat stats (before cleaning):")
print(df["Fiyat"].describe())

# Drop rows without price
df = df[df["Fiyat"].notna()].copy()
print("After dropping NaN Fiyat:", df.shape)

# ---------------------------
# Remove extreme outliers (robust clipping)
# ---------------------------
# We'll remove top 0.5% and bottom 0.5% by price (adjustable)
lower_q = 0.005
upper_q = 0.995
low_val = df["Fiyat"].quantile(lower_q)
high_val = df["Fiyat"].quantile(upper_q)
print(f"Outlier cut: [{low_val:,.0f} , {high_val:,.0f}]  (quantiles {lower_q}, {upper_q})")
df = df[(df["Fiyat"] >= low_val) & (df["Fiyat"] <= high_val)].copy()
print("After outlier trimming:", df.shape)

# ---------------------------
# Feature engineering
# ---------------------------
# Basic features you already have: use them
# Add price per m2, price per room, floor ratio, and some simple interactions
# But avoid leaking target: compute derived features from existing ones, not from Fiyat.

# Ensure Net_Metrekare > 0 to compute price/m2 later (we will use Net_Metrekare only for derived features if >0)
df = df[df["Net_Metrekare"].notna() & (df["Net_Metrekare"] > 0)].copy()

# Price per m2 (we will compute using Fiyat but only for analysis; not used as a feature)
# Instead, create proxies: rooms_per_m2 = Oda_Sayısı / Net_Metrekare
df["rooms_per_m2"] = df["Oda_Sayısı"] / df["Net_Metrekare"]

# floor ratio: floor / total floors (if available)
if ("Bulunduğu_Kat" in df.columns) and ("Binanın_Kat_Sayısı" in df.columns):
    df["floor_ratio"] = df["Bulunduğu_Kat"] / (df["Binanın_Kat_Sayısı"].replace({0: np.nan}))
    df["floor_ratio"] = df["floor_ratio"].fillna(df["floor_ratio"].median())

# Building age bucket (simple)
df["age_bucket"] = pd.cut(df["Binanın_Yaşı"].fillna(df["Binanın_Yaşı"].median()),
                          bins=[-1,0,5,10,15,25,50,1000],
                          labels=False).astype(int)

# Fill remaining NaNs in features with median (safe for tree models)
for col in df.columns:
    if df[col].isnull().any() and col != "Fiyat":
        med = df[col].median()
        df[col] = df[col].fillna(med)

# ---------------------------
# Prepare target: log transform
# ---------------------------
df["Fiyat_log"] = np.log1p(df["Fiyat"])

# ---------------------------
# Feature set selection
# ---------------------------
# Choose the features to use (exclude target and any columns not meant as input)
feature_list = [
    "Net_Metrekare", "Brüt_Metrekare", "Oda_Sayısı", "Bulunduğu_Kat",
    "Eşya_Durumu", "Binanın_Yaşı", "Isıtma_Tipi", "Şehir",
    "Binanın_Kat_Sayısı", "Kullanım_Durumu", "Yatırıma_Uygunluk", "Takas",
    "Tapu_Durumu", "Banyo_Sayısı", "rooms_per_m2", "floor_ratio", "age_bucket"
]
# keep only those present
feature_list = [c for c in feature_list if c in df.columns]
print("Using features:", feature_list)

X = df[feature_list].copy()
y_log = df["Fiyat_log"].copy()
groups = df["Şehir"].copy() if "Şehir" in df.columns else None

# ---------------------------
# Train/test split (grouped by city to avoid leakage)
# ---------------------------
if groups is not None:
    gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=SEED)
    train_idx, test_idx = next(gss.split(X, y_log, groups=groups))
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train_log, y_test_log = y_log.iloc[train_idx], y_log.iloc[test_idx]
else:
    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=SEED)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ---------------------------
# Optional scaling for linear models (not necessary for tree models)
# ---------------------------
# We'll keep scaler in case you want to try linear models later.
scaler = StandardScaler()
# Fit scaler only on numeric columns (all are numeric here)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# Model training helpers
# ---------------------------

results = {}

# Decision Tree - small grid search (fast)
dt_params = {
    "max_depth": [6, 10, 15],
    "min_samples_leaf": [5, 10, 20]
}
dt = DecisionTreeRegressor(random_state=SEED)
dt_gs = GridSearchCV(dt, dt_params, cv=3, scoring="r2", n_jobs=-1, verbose=0)
dt_gs.fit(X_train, y_train_log)
best_dt = dt_gs.best_estimator_
print("Decision Tree best params:", dt_gs.best_params_)
results["DecisionTree"] = evaluate_and_report(best_dt, X_test, y_test_log, label="Decision Tree (Grid Search)")
joblib.dump(best_dt, os.path.join(OUTPUT_DIR, "best_decision_tree.joblib"))

# Random Forest - moderate grid (may take time)
rf = RandomForestRegressor(random_state=SEED, n_jobs=-1)
rf_params = {
    "n_estimators": [200, 400],
    "max_depth": [12, 18],
    "min_samples_leaf": [5, 10]
}
rf_gs = GridSearchCV(rf, rf_params, cv=3, scoring="r2", n_jobs=-1, verbose=0)
rf_gs.fit(X_train, y_train_log)
best_rf = rf_gs.best_estimator_
print("Random Forest best params:", rf_gs.best_params_)
results["RandomForest"] = evaluate_and_report(best_rf, X_test, y_test_log, label="Random Forest (Grid Search)")
joblib.dump(best_rf, os.path.join(OUTPUT_DIR, "best_random_forest.joblib"))

# XGBoost - light grid (fast-ish)
xgb = XGBRegressor(random_state=SEED, n_jobs=-1, objective="reg:squarederror", verbosity=0)
xgb_params = {
    "n_estimators": [200, 400],
    "max_depth": [6, 10],
    "learning_rate": [0.05, 0.1]
}
xgb_gs = GridSearchCV(xgb, xgb_params, cv=3, scoring="r2", n_jobs=-1, verbose=0)
xgb_gs.fit(X_train, y_train_log)
best_xgb = xgb_gs.best_estimator_
print("XGBoost best params:", xgb_gs.best_params_)
results["XGBoost"] = evaluate_and_report(best_xgb, X_test, y_test_log, label="XGBoost (Grid Search)")
joblib.dump(best_xgb, os.path.join(OUTPUT_DIR, "best_xgb.joblib"))

# ---------------------------
# Compare & feature importances (from best model)
# ---------------------------
print("\nSummary of results (R2 / MAE / RMSE):")
for name, metrics in results.items():
    print(name, metrics)

# Choose best model by R2
best_model_name = max(results.items(), key=lambda kv: kv[1]["r2"])[0]
print("Best model by R2:", best_model_name)
best_model = {"DecisionTree": best_dt, "RandomForest": best_rf, "XGBoost": best_xgb}[best_model_name]

# Feature importance (if model supports)
try:
    importances = best_model.feature_importances_
    fi = pd.Series(importances, index=X.columns).sort_values(ascending=False)
    print("\nTop 15 feature importances:")
    print(fi.head(15).to_string())
    fi.to_csv(os.path.join(OUTPUT_DIR, "feature_importances.csv"))
except Exception as e:
    print("Feature importance not available:", e)

# ---------------------------
# Save scaler for later
# ---------------------------
joblib.dump(scaler, os.path.join(OUTPUT_DIR, "scaler.joblib"))

print("\nAll done. Models and artifacts saved to:", OUTPUT_DIR)


Loaded: home_price_cleaned_first_encode.csv
Shape: (20131, 15)
Columns: ['Net_Metrekare', 'Brüt_Metrekare', 'Oda_Sayısı', 'Bulunduğu_Kat', 'Eşya_Durumu', 'Binanın_Yaşı', 'Isıtma_Tipi', 'Fiyat', 'Şehir', 'Binanın_Kat_Sayısı', 'Kullanım_Durumu', 'Yatırıma_Uygunluk', 'Takas', 'Tapu_Durumu', 'Banyo_Sayısı']
Fiyat stats (before cleaning):
count    2.013100e+04
mean     4.656591e+06
std      7.170283e+07
min      2.000000e+04
25%      1.639500e+06
50%      2.300000e+06
75%      3.300000e+06
max      7.500000e+09
Name: Fiyat, dtype: float64
After dropping NaN Fiyat: (20131, 15)
Outlier cut: [434,650 , 28,000,000]  (quantiles 0.005, 0.995)
After outlier trimming: (19933, 15)


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
# --- SVR: Bayesian Optimization ---
svr_bayes_params = {
    'C': Real(1e-1, 1e+2, prior='log-uniform'),
    'kernel': Categorical(['linear', 'rbf']),
    'epsilon': Real(1e-2, 1.0, prior='log-uniform')
}

opt_svr = BayesSearchCV(
    SVR(),
    svr_bayes_params,
    n_iter=15, cv=3, n_jobs=-1 # Lower iter as SVR is slow
)

print("Training SVR (Bayesian)...")
opt_svr.fit(X_train, y_train)
evaluate_and_log("SVR", "Bayesian", opt_svr, X_test, y_test)

# --- SVR: Grid Search ---
svr_grid_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'epsilon': [0.1, 0.2]
}

grid_svr = GridSearchCV(
    SVR(),
    svr_grid_params,
    cv=3, n_jobs=-1
)

print("Training SVR (Grid Search)...")
grid_svr.fit(X_train, y_train)
evaluate_and_log("SVR", "Grid Search", grid_svr, X_test, y_test)

Training SVR (Bayesian)...


KeyboardInterrupt: 

In [145]:
# --- Random Forest: Bayesian Optimization ---
rf_bayes_params = {
    'n_estimators': Integer(50, 200),
    'max_depth': Integer(5, 30),
    'min_samples_split': Integer(2, 10),
    'max_features': Categorical(['sqrt', 1.0])
}

opt_rf = BayesSearchCV(
    RandomForestRegressor(random_state=42),
    rf_bayes_params,
    n_iter=15, cv=3, random_state=42, n_jobs=-1
)

print("Training Random Forest (Bayesian)...")
opt_rf.fit(X_train, y_train)
evaluate_and_log("Random Forest", "Bayesian", opt_rf, X_test, y_test)

# --- Random Forest: Grid Search ---
rf_grid_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'max_features': ['sqrt', 1.0]
}

grid_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    rf_grid_params,
    cv=3, n_jobs=-1
)

print("Training Random Forest (Grid Search)...")
grid_rf.fit(X_train, y_train)
evaluate_and_log("Random Forest", "Grid Search", grid_rf, X_test, y_test)

Training Random Forest (Bayesian)...
--- Random Forest (Bayesian) ---
  R² Score: -0.0012
  MAE:      4,395,622
  RMSE:     68,209,851
------------------------------
Training Random Forest (Grid Search)...
--- Random Forest (Grid Search) ---
  R² Score: -0.0433
  MAE:      4,726,320
  RMSE:     69,628,511
------------------------------


In [112]:
# --- Gradient Boosting: Bayesian Optimization ---
gb_bayes_params = {
    'n_estimators': Integer(100, 300),
    'learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    'max_depth': Integer(3, 8),
    'subsample': Real(0.6, 1.0)
}

opt_gb = BayesSearchCV(
    GradientBoostingRegressor(random_state=42),
    gb_bayes_params,
    n_iter=15, cv=3, random_state=42, n_jobs=-1
)

print("Training Gradient Boosting (Bayesian)...")
opt_gb.fit(X_train, y_train)
evaluate_and_log("Gradient Boosting", "Bayesian", opt_gb, X_test, y_test)

# --- Gradient Boosting: Grid Search ---
gb_grid_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}

grid_gb = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    gb_grid_params,
    cv=3, n_jobs=-1
)

print("Training Gradient Boosting (Grid Search)...")
grid_gb.fit(X_train, y_train)
evaluate_and_log("Gradient Boosting", "Grid Search", grid_gb, X_test, y_test)

Training Gradient Boosting (Bayesian)...
--- Gradient Boosting (Bayesian) ---
  R² Score: 0.6300
  MAE:      861,239
  RMSE:     1,783,644
------------------------------
Training Gradient Boosting (Grid Search)...
--- Gradient Boosting (Grid Search) ---
  R² Score: 0.6427
  MAE:      846,823
  RMSE:     1,752,809
------------------------------


In [113]:
# --- XGBoost: Bayesian Optimization ---
xgb_bayes_params = {
    'n_estimators': Integer(100, 500),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'max_depth': Integer(3, 10),
    'subsample': Real(0.6, 1.0),
    'colsample_bytree': Real(0.6, 1.0)
}

opt_xgb = BayesSearchCV(
    XGBRegressor(random_state=42, n_jobs=-1),
    xgb_bayes_params,
    n_iter=20, cv=3, random_state=42, n_jobs=-1
)

print("Training XGBoost (Bayesian)...")
opt_xgb.fit(X_train, y_train)
evaluate_and_log("XGBoost", "Bayesian", opt_xgb, X_test, y_test)

# --- XGBoost: Grid Search ---
xgb_grid_params = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 6],
    'subsample': [0.8, 1.0]
}

grid_xgb = GridSearchCV(
    XGBRegressor(random_state=42, n_jobs=-1),
    xgb_grid_params,
    cv=3, n_jobs=-1
)

print("Training XGBoost (Grid Search)...")
grid_xgb.fit(X_train, y_train)
evaluate_and_log("XGBoost", "Grid Search", grid_xgb, X_test, y_test)

Training XGBoost (Bayesian)...
--- XGBoost (Bayesian) ---
  R² Score: 0.6419
  MAE:      855,567
  RMSE:     1,754,798
------------------------------
Training XGBoost (Grid Search)...
--- XGBoost (Grid Search) ---
  R² Score: 0.6416
  MAE:      866,433
  RMSE:     1,755,512
------------------------------
