In [None]:
# Install the required libraries into the kernel's environment
%pip install pandas scikit-learn
%pip install matplotlib
%pip install numpy

In [None]:
#test ve train setlerimizi import ediyoruz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the training and test splits from their CSV files (paths are relative to the notebook)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("evler-train.csv", "evler-test.csv")
)

Verilerin yapısı "evNo,fiyat,brutM2,netM2,oda,salon,yas,kat,katMaks,dogalGaz,banyo,amerikan,balkon,asansor,otopark,esya,siteMi,aidat,guneyMi" şeklinde.

In [None]:
# Preprocessing: a value of -1 marks a missing entry; missing feature values are filled with KNN imputation
from sklearn.impute import KNNImputer
import numpy as np

# Columns that must stay out of the imputer:
#  - fiyat is the prediction target. If it were fed to the imputer, every test row's own
#    price would influence which neighbours fill its features (target leakage).
#  - evNo is dropped from the model features everywhere else in this notebook
#    (presumably a row identifier), so it should not influence neighbour distances either.
non_feature_cols = ["evNo", "fiyat"]
feature_cols = [col for col in train.columns if col not in non_feature_cols]

# Convert -1 to NaN so KNN Imputer can recognize them as missing values
train = train.replace(-1, np.nan)
test = test.replace(-1, np.nan)

# A row without a price cannot be used for fitting or scoring, so it is dropped
# instead of letting the imputer invent a target value for it.
train = train.dropna(subset=["fiyat"]).reset_index(drop=True)
test = test.dropna(subset=["fiyat"]).reset_index(drop=True)

# Fit the imputer on the training features only, then apply the same fitted imputer to the test features
imputer = KNNImputer(n_neighbors=5)
train_filled = pd.DataFrame(
    imputer.fit_transform(train[feature_cols]), columns=feature_cols, index=train.index
)
test_filled = pd.DataFrame(
    imputer.transform(test[feature_cols]), columns=feature_cols, index=test.index
)

# Re-attach the untouched evNo/fiyat columns and restore each frame's original column order
train = pd.concat([train[non_feature_cols], train_filled], axis=1)[list(train.columns)]
test = pd.concat([test[non_feature_cols], test_filled], axis=1)[list(test.columns)]

print("Missing values after imputation:")
print(f"Train: {train.isna().sum().sum()}")
print(f"Test: {test.isna().sum().sum()}")

In [None]:
# Feature scaling - standardized copies of the feature matrices (important for scale-sensitive models such as SVR)
from sklearn.preprocessing import StandardScaler

# Split each frame into its feature matrix and the price target
non_feature_columns = ["fiyat", "evNo"]
X_train, y_train = train.drop(columns=non_feature_columns), train["fiyat"]
X_test, y_test = test.drop(columns=non_feature_columns), test["fiyat"]

# Learn the scaling statistics from the training features only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

print("Feature scaling completed")
for split_name, scaled_frame in (("Train", X_train_scaled), ("Test", X_test_scaled)):
    print(f"{split_name} shape: {scaled_frame.shape}")

In [None]:
#4 farklı regresyon modeli deniyoruz
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
# Linear regression model: fit on the training features, then predict the test set.
# X_train / y_train / X_test are the feature/target splits built in the scaling cell.
lr_model = LinearRegression().fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

In [None]:
# DecisionTreeRegressor model (fixed random_state so repeated runs give the same tree)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

In [None]:
# RandomForestRegressor model (fixed random_state so repeated runs give the same forest)
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

In [None]:
# SVR model
# SVR is fitted on the standardized features built in the scaling cell. Previously it was
# trained on the raw columns, so the scaled matrices were computed but never used.
# NOTE(review): SVR's default C/epsilon are also sensitive to the scale of the target;
# consider scaling fiyat as well (e.g. sklearn.compose.TransformedTargetRegressor) - not changed here.
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)
svr_predictions = svr_model.predict(X_test_scaled)

# Collect training-set predictions for every model.
# Each model must be given the same feature representation it was fitted on:
# raw features for the linear/tree models, standardized features for SVR.
lr_train_predictions = lr_model.predict(X_train)
dt_train_predictions = dt_model.predict(X_train)
rf_train_predictions = rf_model.predict(X_train)
svr_train_predictions = svr_model.predict(X_train_scaled)

In [None]:
#4 farklı metrik ile modellerimizi değerlendiriyoruz (hem train hem test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

def evaluate_model(true_values, predictions):
    """Score predictions against the true values with four regression metrics.

    Args:
        true_values: Ground-truth target values.
        predictions: Model predictions, in the same order as ``true_values``.

    Returns:
        dict with the keys "MAE", "MSE", "R2" and "MedAE", each holding the
        value returned by the corresponding scikit-learn metric function.
    """
    metric_functions = (
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
        ("R2", r2_score),
        ("MedAE", median_absolute_error),
    )
    return {
        label: metric(true_values, predictions)
        for label, metric in metric_functions
    }

# Test set evaluation
lr_test_results = evaluate_model(test["fiyat"], lr_predictions)
dt_test_results = evaluate_model(test["fiyat"], dt_predictions)
rf_test_results = evaluate_model(test["fiyat"], rf_predictions)
svr_test_results = evaluate_model(test["fiyat"], svr_predictions)

# Train set evaluation
lr_train_results = evaluate_model(train["fiyat"], lr_train_predictions)
dt_train_results = evaluate_model(train["fiyat"], dt_train_predictions)
rf_train_results = evaluate_model(train["fiyat"], rf_train_predictions)
svr_train_results = evaluate_model(train["fiyat"], svr_train_predictions)

# Row labels (one per model, in a fixed order) and the metric columns of the result tables
model_labels = ["Linear Regression", "Decision Tree", "Random Forest", "SVR"]
num_cols = ["MAE", "MSE", "R2", "MedAE"]


def _results_table(per_model_results):
    """Build one metrics table from a list of evaluate_model() dicts ordered like model_labels."""
    columns = {"Model": model_labels}
    for metric in num_cols:
        columns[metric] = [model_result[metric] for model_result in per_model_results]
    return pd.DataFrame(columns)


def _formatted_copy(results_df):
    """Return a copy of a metrics table whose numeric columns are rendered as '1,234.56' strings."""
    display_df = results_df.copy()
    # Series.map is applied column by column so this also runs on pandas versions
    # that predate DataFrame.map (added in pandas 2.1).
    display_df[num_cols] = display_df[num_cols].apply(
        lambda column: column.map(lambda x: f"{x:,.2f}")
    )
    return display_df


# Show the test results as a table
print("=" * 60)
print("TEST SET RESULTS")
print("=" * 60)
test_results_df = _results_table(
    [lr_test_results, dt_test_results, rf_test_results, svr_test_results]
)
display_test_df = _formatted_copy(test_results_df)
print(display_test_df)

# Show the train results as a table
print("\n" + "=" * 60)
print("TRAIN SET RESULTS")
print("=" * 60)
train_results_df = _results_table(
    [lr_train_results, dt_train_results, rf_train_results, svr_train_results]
)
display_train_df = _formatted_copy(train_results_df)
print(display_train_df)

In [None]:
# NumPy summarizes arrays with more elements than this threshold when printing.
# 1000 is documented as NumPy's default, so this line keeps printing at its default behaviour.
np.set_printoptions(threshold=1000)

# Grouped bar chart comparing every model's test predictions with the actual prices
plt.figure(figsize=(16, 8))

# Actual values and one x position per test sample
actual = test["fiyat"].values
x_pos = np.arange(len(actual))

# Width of a single bar
bar_width = 0.15

# One (legend label, values, colour) entry per bar in each group, in drawing order
bar_series = [
    ('Gerçek Değer', actual, 'black'),
    ('Linear Regression', lr_predictions, 'blue'),
    ('Decision Tree', dt_predictions, 'green'),
    ('Random Forest', rf_predictions, 'orange'),
    ('SVR', svr_predictions, 'red'),
]

# Spread the bars symmetrically around each tick (offsets -2..+2 bar widths for five bars).
# The previous offsets (-1.5..+2.5) shifted every group half a bar to the right of its tick.
center_index = (len(bar_series) - 1) / 2
for series_index, (label, values, color) in enumerate(bar_series):
    offset = (series_index - center_index) * bar_width
    plt.bar(x_pos + offset, values, bar_width, label=label, color=color, alpha=0.7)

plt.xlabel('Test Örnekleri')
plt.ylabel('Fiyat')
plt.title('Model Tahminlerinin Gerçek Değerlerle Karşılaştırılması')
# Label the samples 1..N instead of 0..N-1
plt.xticks(x_pos, range(1, len(actual) + 1))
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Compare the results and pick the best model: the one with the lowest MAE on the test set
results = dict(
    zip(
        ["Linear Regression", "Decision Tree", "Random Forest", "SVR"],
        [lr_test_results, dt_test_results, rf_test_results, svr_test_results],
    )
)
# min() over the (name, metrics) pairs keeps the first entry on ties, like iterating the keys would
best_model, best_metrics = min(results.items(), key=lambda entry: entry[1]["MAE"])
print("En İyi Model:", best_model, "with Test MAE:", best_metrics["MAE"])

In [None]:
# Print the best model's intercept, coefficients and residual spread (std_err),
# for whichever of these the selected model actually defines.


# formatting helpers
def fmt_float(x):
    """Format a number with thousands separators and 2 decimals."""
    return f"{x:,.2f}"


def fmt_coef(x):
    """Format a number with thousands separators and 4 decimals."""
    return f"{x:,.4f}"


def fmt_array(arr, fmt=fmt_coef):
    """Format a scalar or 1-D array as a comma-separated string, one formatted value per element."""
    return ", ".join(fmt(float(v)) for v in np.atleast_1d(arr))


# Training-set predictions stored earlier, keyed by the model names used in `results`.
# Reusing them guarantees each model is scored on the same inputs it was given when
# those predictions were made, and avoids repeating predict() in every branch.
train_predictions_by_model = {
    "Linear Regression": lr_train_predictions,
    "Decision Tree": dt_train_predictions,
    "Random Forest": rf_train_predictions,
    "SVR": svr_train_predictions,
}

# "Standard error" here is the population standard deviation (ddof=0) of the training residuals
residuals = train_predictions_by_model[best_model] - train["fiyat"]
std_err = float(np.std(np.asarray(residuals)))

print("En İyi Model:", best_model)

if best_model == "Linear Regression":
    intercept = float(lr_model.intercept_)
    coefs = lr_model.coef_
    # get feature names from training dataframe (preserve column order)
    feature_names = list(train.drop(columns=["fiyat", "evNo"]).columns)
    print("Intercept:", fmt_float(intercept))
    print("Coefficients:")
    # print coefficient with sequence number and feature name for clarity (1-based index)
    for idx, (name, c) in enumerate(zip(feature_names, coefs), start=1):
        print(f"  {idx}. {name}: {fmt_coef(c)}")

elif best_model in ("Decision Tree", "Random Forest"):
    # Tree-based models have no intercept or coefficients to report
    print(f"Intercept: N/A for {best_model}")
    print(f"Coefficients: N/A for {best_model}")

elif best_model == "SVR":
    # Report the intercept and support vectors when the fitted model exposes them
    if hasattr(svr_model, "intercept_"):
        intercepts = np.atleast_1d(svr_model.intercept_)
        print("Intercept(s):", fmt_array(intercepts, fmt=fmt_float))
    else:
        print("Intercept: N/A for SVR")
    if hasattr(svr_model, "support_vectors_"):
        sv = svr_model.support_vectors_
        print("Support vectors count:", sv.shape[0])
        # Show only the first 5 support vectors
        n_show = min(5, sv.shape[0])
        print(f"First {n_show} support vectors:")
        for i in range(n_show):
            print(" ", fmt_array(sv[i], fmt=fmt_coef))
    else:
        print("Support Vectors: N/A")

# Printed for every model; the SVR branch used to skip this line
print("Standard Error (train residuals):", fmt_float(std_err))

In [None]:
# Bar chart of the linear-regression coefficients (drawn only when that model was selected)
if best_model != "Linear Regression":
    print("Coefficient plot is only available for Linear Regression model.")
else:
    coef_vals = lr_model.coef_
    feature_names = list(train.drop(columns=["fiyat", "evNo"]).columns)
    # numbered tick labels such as '1. brutM2', '2. netM2', ...
    numbered_labels = [
        f"{position}. {name}" for position, name in enumerate(feature_names, start=1)
    ]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(range(len(coef_vals)), coef_vals)
    ax.set_title("Linear Regression Coefficients")
    ax.set_xlabel("Feature")
    ax.set_ylabel("Coefficient Value")
    # one tick per feature; labels are rotated so long names stay readable
    ax.set_xticks(range(len(feature_names)))
    ax.set_xticklabels(numbered_labels, rotation=45, ha="right")
    fig.tight_layout()
    plt.show()


In [None]:
# Feature importance plots (for tree-based models)
feature_names = list(train.drop(columns=["fiyat", "evNo"]).columns)
numbered_labels = [f"{i+1}. {n}" for i, n in enumerate(feature_names)]


def _plot_feature_importance(model, model_label, names, labels, top_n=5):
    """Plot and print the feature importances of a fitted tree-based model.

    Draws the importances sorted in descending order twice (vertical bars on the
    left, horizontal bars on the right) and prints the ``top_n`` most important
    features.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        model_label: Model name used in the plot titles.
        names: Feature names, in the column order the model was fitted with.
        labels: Tick labels, parallel to ``names``.
        top_n: How many of the top features to print.

    Returns:
        Tuple ``(importances, indices)``: the raw importances and the index
        order that sorts them from most to least important.
    """
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    positions = list(range(len(importances)))
    ranked_labels = [labels[i] for i in indices]

    fig, (ax_sorted, ax_horizontal) = plt.subplots(1, 2, figsize=(12, 6))

    ax_sorted.bar(positions, importances[indices])
    ax_sorted.set_title(f"{model_label} - Feature Importance (Sorted)")
    ax_sorted.set_xlabel("Feature")
    ax_sorted.set_ylabel("Importance")
    ax_sorted.set_xticks(list(range(len(names))))
    ax_sorted.set_xticklabels(ranked_labels, rotation=45, ha="right")

    ax_horizontal.barh(positions, importances[indices])
    ax_horizontal.set_title(f"{model_label} - Feature Importance (Horizontal)")
    ax_horizontal.set_xlabel("Importance")
    ax_horizontal.set_ylabel("Feature")
    ax_horizontal.set_yticks(list(range(len(names))))
    ax_horizontal.set_yticklabels(ranked_labels)
    # put the most important feature at the top of the horizontal chart
    ax_horizontal.invert_yaxis()

    fig.tight_layout()
    plt.show()

    # Print top features
    print("\nTop 5 Most Important Features:")
    for rank in range(min(top_n, len(importances))):
        idx = indices[rank]
        print(f"{rank+1}. {names[idx]}: {importances[idx]:.4f}")

    return importances, indices


# Only the tree-based models expose feature_importances_
tree_models = {"Decision Tree": dt_model, "Random Forest": rf_model}

if best_model in tree_models:
    importances, indices = _plot_feature_importance(
        tree_models[best_model], best_model, feature_names, numbered_labels
    )

elif best_model == "Linear Regression":
    # Coefficient visualization already handled in previous cell
    print("\nFeature coefficients already displayed in the previous cell.")

elif best_model == "SVR":
    print("\nSVR does not provide direct feature importance.")
    print("Consider using permutation importance or SHAP values for interpretation.")