In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

In [3]:
# Load the pre-analysed listings dataset from the local data folder.
# index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv("./data/analysed_project.csv", index_col=False)
# Preview the first five rows.
df.head()

Unnamed: 0,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_bedrooms,apartment_bathrooms,apartment_total_area,price_in_USD,...,apartment_rooms_missing,apartment_bedrooms_missing,apartment_bathrooms_missing,price_in_USD_missing,building_construction_year_missing,apartment_total_area_m2,apartment_living_area_m2,area_from_title,area_from_title m²,property_type
0,turkey,"mediterranean region, turkey",2021.0,5.0,1.0,3.0,2.0,2.0,120 m²,315209.0,...,0,0,0,0,1,120.0,110,120.0,120.0,apartment
1,turkey,"mediterranean region, antalya, turkey",2021.0,5.0,2.0,2.0,1.0,1.0,65 m²,173211.0,...,0,0,0,0,1,65.0,60,65.0,65.0,apartment
2,thailand,"chon buri province, pattaya, thailand",2020.0,15.0,5.0,2.0,1.0,1.0,,99900.0,...,0,0,0,0,0,8800.0,40,,,apartment
3,thailand,"chon buri province, pattaya, thailand",2026.0,8.0,3.0,3.0,2.0,1.0,,67000.0,...,0,0,0,0,0,8800.0,36,,,apartment
4,georgia,"abkhazia, batumi, georgia",2026.0,5.0,4.0,1.0,2.0,1.0,28 m²,35622.0,...,0,1,0,0,0,28.0,4600,28.0,28.0,apartment


In [4]:
# Summary statistics of the numeric columns (count, mean, std, quartiles, min/max).
df.describe()

Unnamed: 0,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_bedrooms,apartment_bathrooms,price_in_USD,building_total_floors_missing,apartment_floor_missing,apartment_rooms_missing,apartment_bedrooms_missing,apartment_bathrooms_missing,price_in_USD_missing,building_construction_year_missing,apartment_total_area_m2,apartment_living_area_m2,area_from_title,area_from_title m²
count,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,118823.0,114507.0,114507.0
mean,2009.191529,6.986551,4.81969,2.2523,2.032006,1.113,180245.910825,0.500829,0.572078,0.436372,0.75153,0.57382,0.021671,0.513486,411.867736,3689.242899,95.702097,95.702097
std,109.527511,5.905022,3.655838,0.885689,10.183834,0.37033,114330.450494,0.500001,0.49478,0.495937,0.432128,0.494523,0.145607,0.49982,1630.212699,1819.153344,75.76567,75.76567
min,1.0,1.0,-2.0,-1.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2021.0,5.0,4.0,2.0,2.0,1.0,91253.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,53.0,4600.0,52.0,52.0
50%,2021.0,5.0,4.0,2.0,2.0,1.0,155415.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,76.0,4600.0,75.0,75.0
75%,2021.0,6.0,4.0,3.0,2.0,1.0,249885.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,120.0,4600.0,113.0,113.0
max,2316.0,115.0,202.0,60.0,2009.0,21.0,485777.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8800.0,4600.0,998.0,998.0


In [5]:
# Number of rows that are exact duplicates of an earlier row.
# NOTE(review): duplicates are counted here but not removed before the train/test split.
df.duplicated().sum()

np.int64(5636)

In [6]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118823 entries, 0 to 118822
Data columns (total 22 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   country                             118719 non-null  object 
 1   location                            118719 non-null  object 
 2   building_construction_year          118823 non-null  float64
 3   building_total_floors               118823 non-null  float64
 4   apartment_floor                     118823 non-null  float64
 5   apartment_rooms                     118823 non-null  float64
 6   apartment_bedrooms                  118823 non-null  float64
 7   apartment_bathrooms                 118823 non-null  float64
 8   apartment_total_area                114507 non-null  object 
 9   price_in_USD                        118823 non-null  float64
 10  building_total_floors_missing       118823 non-null  int64  
 11  apartment_floor_missing   

### Converting the prices to log form for the model to learn better

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(118823, 22)

In [8]:
# Target transformation: log(1 + price), which is defined even for a price of 0.
price_usd = df["price_in_USD"]
df["log_price"] = np.log1p(price_usd)

# Show the raw and transformed prices side by side for the first rows.
df.loc[:, ["price_in_USD", "log_price"]].head()

Unnamed: 0,price_in_USD,log_price
0,315209.0,12.660994
1,173211.0,12.062272
2,99900.0,11.511935
3,67000.0,11.112463
4,35622.0,10.480747


## Feature Engineering

**Real-estate prices are naturally skewed because a few apartments are extremely expensive while most are affordable. This skew affects model learning. To fix this, I applied a log transformation to the target variable. The log-transform reduces the impact of extreme values, stabilizes variance, and makes the pattern more linear. This helps the model learn better and improves prediction accuracy.**

In [9]:
# Model inputs, grouped by the kind of preprocessing they receive.
num_features = [
    'building_construction_year',
    'building_total_floors',
    'apartment_floor',
    'apartment_rooms',
    'apartment_bedrooms',
    'apartment_bathrooms',
    'apartment_total_area_m2',
    'apartment_living_area_m2',
]
cat_features = ['property_type', 'location']

# Feature matrix (numeric columns first, then categorical) and log-price target.
feature_columns = num_features + cat_features
X = df[feature_columns]
y = df['log_price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Report the size of each split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

X_train shape: (95058, 10)
X_test shape: (23765, 10)
y_train shape: (95058,)
y_test shape: (23765,)


### Creating the Preprocessing Steps

In [11]:
# Numeric columns: fill gaps with the median, then standardise.
num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps prediction from failing on categories unseen during fit.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

### Building the full pipeline

In [12]:
# End-to-end estimator: preprocessing followed by a 100-tree random forest.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)

### Training the model

In [None]:
# Fit the preprocessing steps and the random forest on the training split.
pipeline.fit(X_train, y_train)

### predicting on test set

In [None]:
# Predict on the held-out test set; predictions are on the log-price scale
# because the pipeline was trained on `log_price`.
y_pred = pipeline.predict(X_test)
y_pred

### Building the Random Forest model (default hyperparameters)

In [None]:
# Random forest with the same settings as `pipeline`, wrapped with the shared preprocessor.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf),
    ]
)

**Predicting on test set and then getting the metrics**

In [None]:
# Predictions of the first pipeline (log-price scale).
y_pred = pipeline.predict(X_test)

# Train the random-forest pipeline; fit() returns the same (now fitted) object.
rf_pipeline.fit(X_train, y_train)

y_pred_rf = rf_pipeline.predict(X_test)

# Error metrics on the log-price scale.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest RMSE:", rmse_rf)
print("Random Forest R2:", r2_rf)

**Converting the log-scale prices back to USD so that the RMSE and R² are reported in the original unit**

In [None]:
# Undo the log1p transform so the errors are expressed in USD.
y_test_usd, y_pred_usd = np.expm1(y_test), np.expm1(y_pred_rf)

mse_rf_usd = mean_squared_error(y_test_usd, y_pred_usd)
rmse_rf_usd = np.sqrt(mse_rf_usd)
r2_rf_usd = r2_score(y_test_usd, y_pred_usd)

print("Random Forest RMSE (USD):", rmse_rf_usd)
print("Random Forest R2 (USD):", r2_rf_usd)

### Build a dummy regressor

In [None]:
# Baseline: always predict the mean of the training target (log-price).
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
y_dummy_pred = dummy.predict(X_test)

# Baseline error metrics on the log-price scale.
mse_dummy = mean_squared_error(y_test, y_dummy_pred)
rmse_dummy = np.sqrt(mse_dummy)
r2_dummy = r2_score(y_test, y_dummy_pred)

print("Dumy RMSE: ", rmse_dummy)
print("R2 score: ", r2_dummy)

**I first built a Dummy Regressor as the baseline model.
This model does not learn from the data; instead, it simply predicts the average (log) price for every record.
The baseline achieved an RMSE of 1.15 and an R² close to 0, which is expected.
Any useful model must perform better than this baseline.**

### Getting the cross validation error

In [None]:
# 5-fold cross-validation on the training split only.
# The scorer returns negated RMSE (higher is better), so flip the sign afterwards.
neg_rmse_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
cv_rmse = -neg_rmse_scores

print("CV RMSE scores:", cv_rmse)
print("Mean CV RMSE:", np.mean(cv_rmse))

**This result suggests the model is performing well. The next step is to tune it, which should lower the mean RMSE further, i.e. make the model more accurate.**

### Fit the pipeline on the training set

In [None]:
# Refit the pipeline on the full training split.
# NOTE(review): `pipeline` was already fitted on the same data in an earlier cell — confirm this refit is needed.
pipeline.fit(X_train, y_train)

### Evaluate Linear Regression Model

In [None]:
# Linear regression behind the same preprocessing as the random forest.
lr = LinearRegression()
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", lr),
    ]
)

# Train, then predict on the held-out split.
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)

# Error metrics on the log-price scale.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression RMSE:", rmse_lr)
print("Linear Regression R²:", r2_lr)

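**This shows that the model still performs reasonably well with Linear Regression, since its RMSE (on the log-price scale) is below 0.5. Note that R² and RMSE are measured on different scales, so each should be compared with the baseline's value rather than with each other.**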

### Comparing the three models (DummyRegressor, Linear Regression, Random Forest)
**First, the results are stored in a dictionary to keep the data organized and grouped**

In [None]:
# One entry per model, in the same order across all three columns.
results = dict(
    Model=["Dummy Regressor", "Linear Regression", "Random Forest"],
    RMSE=[rmse_dummy, rmse_lr, rmse_rf],
    R2=[r2_dummy, r2_lr, r2_rf],
)

**Convert it to DataFrame for easy display**

In [None]:
# Tabulate the collected metrics: one row per model, columns Model / RMSE / R2.
results_df = pd.DataFrame(results)
results_df

**According to this comparison table, I have decided to choose Random Forest as the final model, because it has the lowest RMSE and the highest R². I will therefore perform hyperparameter tuning on the Random Forest, since it is the best of the three models.**

### Visualizing the result

In [None]:
# One horizontal bar chart per metric, each in its own figure.
for metric, title in (
    ("RMSE", "RMSE Comparison of models"),
    ("R2", "R2 Comparison of models"),
):
    plt.figure(figsize=(8, 5))
    sns.barplot(x=metric, y="Model", data=results_df)
    plt.title(title)
    plt.show()

**From this comparison, I discovered that Random Forest is the best model: its RMSE is 0.4 and its R² is 0.85, which is enough to choose it out of the three models that were built. So, I will perform hyperparameter tuning on the Random Forest to improve it further.**

### Creating the Final Comparison of the Three Models

**Identifying the features, working from the original DataFrame**

In [None]:
# Rebuild the feature matrix and target from the full frame.
# Both the log target and the raw price are dropped: `price_in_USD` is the
# un-transformed target, so leaving it in X (and therefore in `num_feat`)
# would leak the answer to any model that consumes all numeric columns.
X = df.drop(columns=["log_price", "price_in_USD"])
y = df["log_price"]

# Column names by dtype, for reference. The preprocessor defined in this
# notebook selects `num_features` / `cat_features`, not these two lists.
num_feat = X.select_dtypes(include=[np.number]).columns
cat_feat = X.select_dtypes(exclude=[np.number]).columns

### Preprocessor

In [None]:
# Numeric branch: median imputation followed by standardisation.
num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation followed by one-hot encoding
# that ignores categories not seen during fit.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Only the columns listed in num_features / cat_features are passed on.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

### Split to train-test split using the 20/80 % format for the splitting

In [None]:
# 80/20 split with a fixed seed for reproducibility.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# the targets must be unpacked into y_train / y_test, not into X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Creating a dictionary of the models to compare

In [None]:
# Candidate models, keyed by the name shown in the comparison table.
models = {
    "DummyRegressor": DummyRegressor(),
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=150, random_state=42)
}

# Start a fresh list of [name, rmse, r2] rows for the evaluation loop.
# `results` was previously a dict, which has no .append().
results = []

**At this stage I am convinced that Random Forest is still the best-performing model: it has the lowest RMSE and the highest R² of the three models.**

### training, predicting and evaluating the models

In [None]:
# Reset the collector so that re-running this cell does not duplicate rows
# (and so it is a list even if `results` held the earlier dict).
results = []

for name, model in models.items():
    # Each candidate gets the same preprocessing in front of it.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    # Metrics on the log-price scale.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    r2 = r2_score(y_test, pred)

    results.append([name, rmse, r2])

**Now Comparing The Table**

In [None]:
# Build the comparison table from the collected [name, rmse, r2] rows and print it.
comparison_df = pd.DataFrame(results, columns=["Model", "RMSE", "R2 score"])
print("\n=== MODEL PERFORMANCE COMPARISON ===")
print(comparison_df) 

In [None]:
# NOTE(review): repeats the print from the previous cell — consider removing.
print(comparison_df)

### Now create Hyperparameter Grid and RandomizedSearchCV

In [None]:
# Define the Hyperparameter grid into Dictionary
param_grid = {
    "model__n_estimators": [100, 200, 300, 400],
    "model__max_depth": [None, 10, 20, 30, 40],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["auto", "sqrt", "log2"]
}

**Randomized search with 5-fold CV**

In [None]:
# Randomised search over `param_grid`: 20 sampled combinations, each scored by
# R² with 5-fold cross-validation, run in a single process.
random_search = RandomizedSearchCV(
    estimator=rf_pipeline,           # preprocessing + random forest
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring="r2",
    random_state=42,
    n_jobs=1,
    verbose=2,
)

**Fitting the model on training data**

In [None]:
# Run the search on the training split only; the test set stays untouched.
random_search.fit(X_train, y_train)

**Getting the best parameters and scores**

In [None]:
# Best mean cross-validated score (R², per the search's scoring) and the parameters that achieved it.
print ("Best R2: ", random_search.best_score_)
print ("Best Parameters: ", random_search.best_params_)

**The reason for comparing the models was to choose the best of the three. Having found Random Forest to be the best, I tune its hyperparameters so that it performs even better than the untuned version.**

**Random Forest has many parameters; tuning improves performance. RandomizedSearchCV tries random combinations and picks the best. R2 and RMSE on test set after tuning show real performance. Always keep test set untouched until final evaluation.**

### Define hyperparameter Grid for Random Forest

In [None]:
# Search space for the tuned random forest ("model" step of the pipeline).
# Parameter names must match RandomForestRegressor exactly:
# min_samples_split / min_samples_leaf (plural "samples"), and "log2" for max_features.
param_dist = {
    "model__n_estimators": [100, 200, 300, 500],
    "model__max_depth": [None, 10, 20, 30, 40],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2"]
}

**Pipeline for tuning**

In [None]:
# Fresh, unfitted pipeline for tuning; the forest's hyperparameters are left
# at their defaults (apart from the seed) so the search can set them.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

**Running RandomizedSearchCV for the model**

In [None]:
# Randomised search over `param_dist`: 20 sampled combinations, 5-fold CV,
# using all available cores.
# Scored with negated RMSE (the same scorer used for cross_val_score in this
# notebook), so `-rf_search.best_score_` is directly an RMSE.
rf_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=42
)

In [None]:
# Run the tuning search on the training split only.
rf_search.fit(X_train, y_train)

**Printing out the best parameter**

In [None]:
# best_score_ is a negated error (higher is better), so flip the sign.
best_cv_rmse = -rf_search.best_score_
# If the search was scored with negated MSE rather than negated RMSE,
# take the square root so the printed value really is an RMSE.
if rf_search.scoring == "neg_mean_squared_error":
    best_cv_rmse = np.sqrt(best_cv_rmse)

print("Best Parameter: ", rf_search.best_params_)
print("Best CV scores (RMSE): ", best_cv_rmse)

**Evaluating the Model**

In [None]:
# Pipeline refitted on the whole training split with the best parameters found.
best_rf = rf_search.best_estimator_

# Single, final evaluation on the held-out test set (log-price scale).
preds = best_rf.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

# Names used by the default-vs-tuned comparison table.
rmse_best, r2_best = rmse, r2

print ("Final Tuned RMSE: ", rmse)
print ("Final Tuned R2: ", r2)

# Back-transform to USD for interpretation. Stored under new names so that
# `y_test` stays on the log scale and this cell can be re-run safely.
y_test_usd = np.expm1(y_test)
y_pred_usd_tuned = np.expm1(preds)

**Comparing the Random Forest before and after tuning**

In [None]:
# Default vs tuned random forest, both measured on the same test set (log-price scale).
# `rmse` / `r2` are the tuned-model metrics computed in the evaluation cell just above;
# `rmse_best` / `r2_best` were never defined in this notebook.
comparison = pd.DataFrame({
    "Model": ["Random Forest (Default)", "Random Forest (Tuned)"],
    "RMSE": [rmse_rf, rmse],
    "R2": [r2_rf, r2]
})

comparison

**Now analysing feature importance to see which features most influence the model's predictions of the price (price_in_USD)**
### Feature Importance

In [None]:
# Extract the random forest inside the pipeline
rf_model = best_model.named_steps["rf"]

# Get one-hot feature names
ohe = best_model.named_steps[
    "preprocessor"].named_transformers_[
    "cat"].named_steps[
    "onehot"]
ohe_features = ohe.get_feature_names_out(cat_features)

# Combine numerical + categorical names
all_features = list(num_features) + list(ohe_features)

# Match RF feature importances with names
importances = rf_model.feature_importances_

feature_importance_df = pd.DataFrame({
    "feature": all_features,
    "importance": importances
}).sort_values(by="importance", ascending=False)

print(feature_importance_df.head(15))

### Using a bar plot to visualize the feature importance

In [None]:
# Horizontal bar chart of the 15 most important features.
top_features = feature_importance_df.head(15)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    data=top_features,
    x='importance',
    y='feature',
    palette="magma",
    ax=ax,
)
ax.set_title("Top 15 Feature Importances (Random Forest)", fontsize=16)
ax.set_xlabel("Relative Importance", fontsize=12)
ax.set_ylabel("Feature Name", fontsize=12)
fig.tight_layout()  # keep long feature names from being clipped
plt.show()

#### For an easier and clearer visualisation, I also plot a donut chart

In [None]:
# Number of top features shown individually; the rest are grouped together.
N = 10 

# Copy the N most important rows (the frame is already sorted by importance).
plot_data = feature_importance_df.head(N).copy()

# Total importance of every feature outside the top N.
other_importance = feature_importance_df.iloc[N:]["importance"].sum()

# Append a single "Other Features" row holding that remainder.
other_row = pd.DataFrame([["Other Features", other_importance]], columns=["feature", "importance"])
plot_data = pd.concat([plot_data, other_row], ignore_index=True)

# Create the figure for the donut chart.
plt.figure(figsize=(10, 10))

# Draw the pie; the wedges are reused for the legend below.
wedges, texts, autotexts = plt.pie(
    plot_data["importance"],
    autopct="%1.1f%%", # percentage labels with one decimal (e.g. 12.3%)
    startangle=90,
    pctdistance=0.85, # radial position of the percentage labels
    colors=plt.cm.tab20c(plot_data.index) # one tab20c colour per row, picked by row position
)

# A white circle over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Equal axis scaling keeps the pie circular.
plt.axis('equal') 

# Legend listing the feature names, anchored to the right of the axes.
plt.legend(
    wedges, 
    plot_data["feature"], 
    title="Feature Groups",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1) # place the legend outside the plot area
)
plt.title(f"Contribution of Top {N} Features to Total Importance", fontsize=16)
plt.tight_layout()
plt.show()

**These are some of the summary of the modelling** 

**Show Test R2 and RMSE. Mention: We only evaluate on the test set once, after tuning, to ensure the performance is unbiased. Show a prediction for a single apartment as a practical example.**
**We trained the model on log-transformed prices to reduce skewness and stabilize variance. For interpretability, we convert the predictions back to USD using np.expm1().**
**Random Forest provides feature importance. From our model, living area and total area are the strongest predictors of apartment prices, followed by the construction year. Missing data indicators have minor effects, showing that missing values slightly influence predictions.**
**Living area and total apartment area are the two most important predictors, followed by construction year. Features related to missing values have minor influence.**
**This plot shows the correlation of numeric features with apartment prices. Larger living areas and total areas are strongly positively correlated with price, while features indicating missing data have minimal impact.**