In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
data = pd.read_csv("data/CleanedHomeSaleData.csv", sep=";")

In [50]:
columns = [
    "District",
    "Neighborhood",
    "Price_(TL)",
    "m2_(Net)",
    "Livingroom_number",
    "Room_number",
    "Building_Age",
    "Floor_location",
    "Number_of_floors",
    "Heating",
    "Number_of_bathrooms",
    "Available_for_Loan",
    "From_who",
    "Front_West",
    "Front_East",
    "Front_South",
    "Front_North",
    "Internet",
    "Security_Alarm",
    "Smart_House",
    "Elevator",
    "Balcony",
    "Car_Park",
    "Laminate_Floor",
    "Luxury_Facilities",
    "Airport",
    "Marmaray",
    "Metro",
    "Metrobus",
    "Minibus",
    "Bus_stop",
    "Tram",
    "Railway_station",
    "TEM",
    "E-5"
]

In [51]:
data_model = data[columns].copy()

data_model["m2_(Net)"] = np.log1p(data_model["m2_(Net)"])

X = data_model.drop("Price_(TL)", axis=1)
y = np.log1p(data_model["Price_(TL)"])

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)

In [53]:
catboost_cols = ["District", "Neighborhood", "Building_Age"]
onehot_cols = ["Heating", "Available_for_Loan", "From_who"]

numeric_cols = X.select_dtypes(exclude=["object", "category"]).columns.tolist()

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import CatBoostEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ("catboost", CatBoostEncoder(), catboost_cols),
        ("onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols)
    ],
    remainder="passthrough"
)

In [55]:
X_train_encoded = preprocessor.fit_transform(X_train, y_train)
X_test_encoded = preprocessor.transform(X_test)

In [56]:
X_train = pd.DataFrame(X_train_encoded, columns=preprocessor.get_feature_names_out(), index=X_train.index)
X_test = pd.DataFrame(X_test_encoded, columns=preprocessor.get_feature_names_out(), index=X_test.index)

In [57]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [58]:
def calculate_regression_metrics(true_log, predicted_log):
    true = np.expm1(true_log)
    predicted = np.expm1(predicted_log)

    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true, predicted)
    r2 = r2_score(true, predicted)
    return mse, rmse, mae, r2

In [59]:
tree_models = {
    "Gradient Boosting": GradientBoostingRegressor(
        subsample=0.6,
        n_estimators=800,
        max_depth=7,
        learning_rate=0.03
    ),
}

In [60]:
for i in range(len(list(tree_models))):
    model = list(tree_models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    modeltree_train_mse, modeltree_train_rmse, modeltree_train_mae, modeltree_train_r2 = calculate_regression_metrics(y_train, y_train_pred)
    modeltree_test_mse, modeltree_test_rmse, modeltree_test_mae, modeltree_test_r2 = calculate_regression_metrics(y_test, y_test_pred)

    print(list(tree_models.values())[i])

    print("Evaluation for Training Set")
    print("MSE :", modeltree_train_mse)
    print("RMSE :", modeltree_train_rmse)
    print("MAE :", modeltree_train_mae)
    print("R2 Score :", modeltree_train_r2)

    print("------------------------")

    print("Evaluation for Test Set")
    print("MSE :", modeltree_test_mse)
    print("RMSE :", modeltree_test_rmse)
    print("MAE :", modeltree_test_mae)
    print("R2 Score :", modeltree_test_r2)

    print("------------------------")
    print("\n")

GradientBoostingRegressor(learning_rate=0.03, max_depth=7, n_estimators=800,
                          subsample=0.6)
Evaluation for Training Set
MSE : 86549938698.22198
RMSE : 294193.7094810526
MAE : 110976.99542573793
R2 Score : 0.95293609444755
------------------------
Evaluation for Test Set
MSE : 394682270754.8098
RMSE : 628237.4318319546
MAE : 188625.18287659428
R2 Score : 0.7631623635434247
------------------------




In [61]:
preprocessor

0,1,2
,transformers,"[('catboost', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,
,a,1

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [62]:
tree_models

{'Gradient Boosting': GradientBoostingRegressor(learning_rate=0.03, max_depth=7, n_estimators=800,
                           subsample=0.6)}

In [63]:
model.score

<bound method RegressorMixin.score of GradientBoostingRegressor(learning_rate=0.03, max_depth=7, n_estimators=800,
                          subsample=0.6)>

In [64]:
import pickle

In [65]:
with open("istanbulHousePriceModel.pkl", "wb") as f:
    pickle.dump(
        {
            "model": model,                 # GradientBoostingRegressor
            "preprocessor": preprocessor,   # ColumnTransformer
            "target_transform": "log1p"     # for you information
        },f
    )

In [66]:
userInput = {
    "District": "Ataşehir",
    "Neighborhood": "İçerenköy Mah.",
    "m2_(Net)": 190,
    "Livingroom_number": 1,
    "Room_number": 4.5,
    "Building_Age": "4",
    "Floor_location": 19,
    "Number_of_floors": 19,
    "Heating": "Central Heating",
    "Number_of_bathrooms": 3,
    "Available_for_Loan": "Yes",
    "From_who": "From the real estate office",
    "Front_West": 1,
    "Front_East": 1,
    "Front_South": 1,
    "Front_North": 1,
    "Internet": 1,
    "Security_Alarm": 1,
    "Smart_House": 1,
    "Elevator": 1,
    "Balcony": 1,
    "Car_Park": 1,
    "Laminate_Floor": 1,
    "Luxury_Facilities": 1,
    "Airport": 1,
    "Marmaray": 1,
    "Metro": 1,
    "Metrobus": 1,
    "Minibus": 1,
    "Bus_stop": 1,
    "Tram": 1,
    "Railway_station": 1,
    "TEM": 1,
    "E-5": 1
}


In [67]:
input_df = pd.DataFrame([userInput])
input_df["m2_(Net)"] = np.log1p(input_df["m2_(Net)"])
input_encoded = preprocessor.transform(input_df)

pred_log_price = model.predict(input_encoded)
pred_price_tl = np.expm1(pred_log_price)

pred_price_tl[0]




np.float64(1924400.5687316228)

In [68]:
X_test

Unnamed: 0,catboost__District,catboost__Neighborhood,catboost__Building_Age,onehot__Heating_Absent,onehot__Heating_Central Heating,onehot__Heating_Climate,onehot__Heating_Natural Gas,onehot__Heating_Other Heating,onehot__Heating_Stove,onehot__Available_for_Loan_No,...,remainder__Airport,remainder__Marmaray,remainder__Metro,remainder__Metrobus,remainder__Minibus,remainder__Bus_stop,remainder__Tram,remainder__Railway_station,remainder__TEM,remainder__E-5
12272,12.824766,12.722939,13.020074,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
16810,12.797093,12.963999,12.914340,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
18441,13.101365,13.110851,13.088369,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
12338,13.155437,13.516859,13.088369,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
29666,14.408924,15.082679,13.133883,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,12.978508,12.779142,13.095925,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
30115,12.603775,12.364753,13.108805,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1730,12.421923,12.356193,13.028834,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12219,12.919566,13.024070,13.020074,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [69]:
pd.DataFrame(X_test).to_csv("data/CleanedHomeSaleData_Xtest.csv", index=False)