# Machine Learning - Price Prediction

------

#### **Import libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, root_mean_squared_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
import lightgbm as lgb

## Load & Prepare Data for Price Prediction

In [2]:
df = pd.read_csv('./Divar.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

  df = pd.read_csv('./Divar.csv')


## Preprocessing for Sell data

##### Drop some features intuitively:
We decided to drop these features since we realizes they can't have a significant effect on our prediction.


In [3]:
price_data_no_null = df[df['price_value'].notna()]
price_data = price_data_no_null[['cat2_slug', 'cat3_slug', 'city_slug','neighborhood_slug',
                                          'land_size', 'building_size', 'deed_type', 'has_business_deed', 'floor',
                                          'rooms_count', 'total_floors_count', 'unit_per_floor', 'has_balcony',
                                          'has_elevator', 'has_warehouse', 'has_parking', 'construction_year',
                                          'is_rebuilt', 'has_warm_water_provider', 'has_heating_system', 
                                          'has_cooling_system', 'has_restroom', 'has_security_guard', 'has_barbecue',
                                          'building_direction', 'has_pool', 'has_jacuzzi', 'has_sauna',
                                          'floor_material', 'property_type', 'location_latitude', 
                                          'location_longitude', 'price_value']]
del df, price_data_no_null

### Missing Values

#### Feature_set_1: Select more important features

In [4]:
price_data_1 = price_data[['cat2_slug', 'cat3_slug', 'city_slug', 'building_size',
                            'deed_type', 'has_business_deed', 'floor','rooms_count', 'is_rebuilt',
                            'floor_material','construction_year','has_security_guard', 'has_barbecue',
                            'has_pool', 'has_jacuzzi', 'has_sauna', 'has_balcony', 'has_elevator', 'has_warehouse',
                            'has_parking', 'total_floors_count','location_latitude', 'location_longitude',
                            'price_value', 'neighborhood_slug'
                           ]]

Create `building_age` from `construction_year` feature:

In [5]:
# Create building_age columns from the Construction_year
# Digits conversion to english
def persian_to_english(sample_input: str):
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'
    english_digits = '0123456789'
    trans_table = str.maketrans(persian_digits, english_digits)
    return sample_input.translate(trans_table)

price_data_1['construction_year'] = price_data_1.loc[:, 'construction_year'].apply(
    lambda x: persian_to_english(x) if isinstance(x, str) else x)

# change to only digits
price_data_1['construction_year'] = price_data_1['construction_year'].replace('قبل از 1370', '1370')

# change to numeric type
price_data_1['construction_year'] = pd.to_numeric(price_data_1['construction_year'], errors='coerce')
price_data_1['building_age'] = 1404 - price_data_1['construction_year']

# Bin the building_age feature
bins = [0, 5, 10, 20, 30, np.inf]
labels = ["new", "relatively_new", "mid_age", "old", "very_old"]
price_data_1["building_age"] = pd.cut(price_data_1["building_age"], bins=bins, labels=labels)

# Handle missings
price_data_1.loc[:, "building_age"] = price_data_1["building_age"].cat.add_categories("unselect").fillna("unselect")
price_data_1.drop(columns=['construction_year'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_data_1['construction_year'] = price_data_1.loc[:, 'construction_year'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_data_1['construction_year'] = price_data_1['construction_year'].replace('قبل از 1370', '1370')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_data_1['constr

#### Luxury_score, Non_luxury_score

In [6]:
# Add Luxury_score and non_luxury_score columns to the dataframe
# Define luxury and non-luxury features

# First fix has_balcony column (map true to True, false to False, and unselect to False)
price_data_1.loc[:, "has_balcony"] = price_data_1["has_balcony"].replace({"unselect": False})
price_data_1.loc[:, "has_balcony"] = price_data_1["has_balcony"].replace({"true": True})
price_data_1.loc[:, "has_balcony"] = price_data_1["has_balcony"].replace({"false": False})

luxury_features = ['has_security_guard', 'has_barbecue', 'has_pool', 'has_jacuzzi', 'has_sauna']
non_luxury_features = ['has_balcony', 'has_elevator', 'has_warehouse', 'has_parking']

price_data_1.loc[:, "luxury_score"] = price_data_1[luxury_features].sum(axis=1).astype('int64')
price_data_1.loc[:, "non_luxury_score"] = price_data_1[non_luxury_features].sum(axis=1).astype('int64')

price_data_1 = price_data_1.drop(columns=luxury_features + non_luxury_features)

def bin_luxury_score(x):
    if pd.isna(x):
        return "unselect"
    elif x == 0:
        return "none"
    elif x <= 2:
        return "low"
    elif x <= 4:
        return "medium"
    else:
        return "high"

# Convert dtype to object to avoid FutureWarning when assigning strings
price_data_1["luxury_score"] = price_data_1["luxury_score"].astype('object')
price_data_1["non_luxury_score"] = price_data_1["non_luxury_score"].astype('object')

price_data_1.loc[:, "luxury_score"] = price_data_1["luxury_score"].apply(bin_luxury_score)
price_data_1.loc[:, "non_luxury_score"] = price_data_1["non_luxury_score"].apply(bin_luxury_score)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_data_1.loc[:, "luxury_score"] = price_data_1[luxury_features].sum(axis=1).astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_data_1.loc[:, "non_luxury_score"] = price_data_1[non_luxury_features].sum(axis=1).astype('int64')


#### rooms_count

In [7]:
# rooms_count column conversion
room_map = {
    "بدون اتاق": "0_rooms",
    "یک": "1_room",
    "دو": "2_rooms",
    "سه": "3_rooms",
    "چهار": "4_rooms",
    "پنج یا بیشتر": "5plus_rooms"
}

# Map the values
price_data_1["rooms_count"] = price_data_1["rooms_count"].map(room_map)

# Handle missing values (map to unselect)
price_data_1["rooms_count"] = price_data_1["rooms_count"].fillna("unselect")

#### cat2_slug and cat3_slug

In [8]:
# cat2_slug and cat3_slug column cleaning for sell data only
price_data_filtered = price_data_1[price_data_1['cat2_slug'].isin(['residential-sell', 'commercial-sell', 'real-estate-services'])].copy()

# handling missing
price_data_filtered.loc[:, "cat2_slug"] = price_data_filtered["cat2_slug"].fillna("unselect")

# cat3_slug cleaning for rent data removal
price_data_filtered = price_data_filtered[~price_data_filtered['cat3_slug'].str.contains('-rent', na=False)].copy()

# handling missing
price_data_filtered.loc[:, "cat3_slug"] = price_data_filtered["cat3_slug"].fillna("unselect")

#### deed_type

In [9]:
# handling missing values
price_data_filtered.loc[:, 'deed_type'] = price_data_filtered['deed_type'].fillna('unselect')

#### floor_material

In [10]:
# floor_material
price_data_filtered.loc[:, 'floor_material'] = price_data_filtered['floor_material'].fillna('unselect')


#### building_size and city_slug

In [11]:
# dropna for building_size because it is 
price_data_filtered = price_data_filtered.dropna(subset=['building_size', 'city_slug'])

### **- Clip Extreme Outliers** <br>
There are very large and very small, unreasonable values for total credit that might cause errors on model. We simply cut them from dataset using a threshhold.

**Chosen total credit range:** `500,000,000` to `80,000,000,000`<br>

**↪Why?**<br>

In Iran's property market, at this time, total prices are mostly below 80 billion, and above 500 million Tomans. As we checked in Divar, one of the largest online classified marketplace applications in Iran, most of advertisements are at this range. In additions, some prices are old and too low counted as noises, so we try to put them aside. <br>

In addition, when we go beyond this range, the values extremely increase and harm model predictions, plus we are using tree based models that act poorly when they get an input out of training range.

**Chosen building size range:** `20` to `3000`<br>

**↪Why?**<br>

According to Iran's market, most of properties published are above 20 and below 3000, as we checked at Divar application. When we go beyond this range, values become very large or low and prices become extreme<br>

In addition, when we go beyond this range, the values extremely increase and harm model predictions, plus we are using tree based models that act poorly when they get an input out of training range.

In [12]:
price_data_filtered = price_data_filtered[(price_data_filtered["building_size"] >= 20) &
                                           (price_data_filtered["building_size"] <= 3000)]
price_data_filtered = price_data_filtered[(price_data_filtered["price_value"] >= 500_000_000) & 
                            (price_data_filtered["price_value"] <= 80_000_000_000)]

In [13]:
# First we need to separate data to: train/validation/test 75/12.5/12.5
train_data = price_data_filtered.dropna(subset=['location_latitude', 'location_longitude'])
train_data, val_data = train_test_split(price_data_filtered, test_size=0.2, random_state=42)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")

Train size: 402607
Validation size: 100652


Binning `floor` and `total_floors_count`:

In [14]:
# Train data
train_data['floor'] = pd.to_numeric(train_data['floor'], errors='coerce')
train_data['floor'] = train_data['floor'].apply(lambda x: 0 if x < 0 else x)
floor_mode = train_data['floor'].mode()[0]
train_data['floor'] = train_data['floor'].fillna(floor_mode)
train_data['floor'] = train_data['floor'].astype(int)

train_data['floor_cat'] = pd.cut(train_data['floor'], 
                                 bins=[-1, 0, 5, 10, 20, float('inf')], 
                                 labels=['basement', 'low-height', 'mid-height', 'high-height', 'very-high'],
                                 right=True)

# Validation data
val_data['floor'] = pd.to_numeric(val_data['floor'], errors='coerce')
val_data['floor'] = val_data['floor'].apply(lambda x: 0 if x < 0 else x)
val_data['floor'] = val_data['floor'].fillna(floor_mode)
val_data['floor'] = val_data['floor'].astype(int)

val_data['floor_cat'] = pd.cut(val_data['floor'], 
                               bins=[-1, 0, 5, 10, 20, float('inf')], 
                               labels=['basement', 'low-height', 'mid-height', 'high-height', 'very-high'],
                               right=True)

# total_floors_count
# Train data
train_data['total_floors_count'] = pd.to_numeric(train_data['total_floors_count'], errors='coerce')
train_data['total_floors_count'] = train_data['total_floors_count'].apply(lambda x: 0 if x < 0 else x)
total_floor_mode = train_data['total_floors_count'].mode()[0]
train_data['total_floors_count'] = train_data['total_floors_count'].fillna(total_floor_mode)
train_data['total_floors_count'] = train_data['total_floors_count'].astype(int)

train_data['total_floors_cat'] = pd.cut(train_data['total_floors_count'], 
                                              bins=[0, 5, 15, 20, float('inf')], 
                                              labels=['low', 'medium', 'high', 'very-high'],
                                              right=True)

# Validation data
val_data['total_floors_count'] = pd.to_numeric(val_data['total_floors_count'], errors='coerce')
val_data['total_floors_count'] = val_data['total_floors_count'].apply(lambda x: 0 if x < 0 else x)
val_data['total_floors_count'] = val_data['total_floors_count'].fillna(total_floor_mode)
val_data['total_floors_count'] = val_data['total_floors_count'].astype(int)

val_data['total_floors_cat'] = pd.cut(val_data['total_floors_count'], 
                                            bins=[0, 5, 15, 20, float('inf')], 
                                            labels=['low', 'medium', 'high', 'very-high'],
                                            right=True)

# Remove floor and total_floors_count columns
train_data = train_data.drop(columns=['floor', 'total_floors_count'])
val_data = val_data.drop(columns=['floor', 'total_floors_count'])

#### Fill `location_latitude` and `location_longitude` Based on `city_slug` mean

In [15]:
# # Fill lat,long missing values based on their city_slug mean
# # Calculate mean of all locations
overall_lat_mean = train_data['location_latitude'].mean()
overall_long_mean = train_data['location_longitude'].mean()

# First fill missing based on their city mean
train_data['location_latitude'] = train_data.groupby('city_slug')['location_latitude'].transform(lambda x: x.fillna(x.mean()))
train_data['location_longitude'] = train_data.groupby('city_slug')['location_longitude'].transform(lambda x: x.fillna(x.mean()))

# Fill the rest of the missing values with overall mean
train_data['location_latitude'] = train_data['location_latitude'].fillna(overall_lat_mean)
train_data['location_longitude'] = train_data['location_longitude'].fillna(overall_long_mean)


In [16]:
# # Fill lat,long missing values on validation dataset
city_lat_means = train_data.groupby('city_slug')['location_latitude'].mean()
city_long_means = train_data.groupby('city_slug')['location_longitude'].mean()

val_data['location_latitude'] = val_data.apply(
    lambda row: city_lat_means.get(row['city_slug'], overall_lat_mean) 
    if pd.isna(row['location_latitude']) else row['location_latitude'],
    axis=1
)

val_data['location_longitude'] = val_data.apply(
    lambda row: city_long_means.get(row['city_slug'], overall_long_mean) 
    if pd.isna(row['location_longitude']) else row['location_longitude'],
    axis=1
)

## **Scaling**

As we dont have numerical features to scale, we will skip this phase.

## **Outlier Handling** <br>
Features like land_size, building_size and total_credit have large standard deviations and not normally distributed.

## **Encoding**

In [17]:
X_train = train_data.drop(columns=['price_value'])
y_train = train_data['price_value'] 
X_val = val_data.drop(columns=['price_value'])
y_val = val_data['price_value']

In [18]:
def encode_boolean_features(df, boolean_columns):
    """
    Encodes boolean columns as 0 or 1 (or -1 for missings).
    """
    df = df.copy()
    for col in boolean_columns:
        df[col] = (
            df[col]
            .map({True: 1, False: 0})
            .fillna(-1)
            .astype(int)
        )
    return df

boolean_cols = ["is_rebuilt", 'has_business_deed']

X_train = encode_boolean_features(X_train, boolean_cols)
X_val   = encode_boolean_features(X_val, boolean_cols)

Encode **Categorical Features**:

Apply Target Encoding on `neighborhood_slug` and `city_slug`:

In [19]:
te = TargetEncoder(
    cols=["neighborhood_slug", "city_slug"],
    smoothing=10
)

X_train_encoded = te.fit_transform(X_train, y_train)
X_val_encoded = te.transform(X_val)

Apply One-Hot Encoding to Categorical Features:

In [20]:
# One-hot encode categorical features

categorical_cols = ['cat2_slug', 'cat3_slug', 'deed_type', 'rooms_count', 'floor_material',
                    'luxury_score', 'non_luxury_score', 'building_age', 'floor_cat', 'total_floors_cat']

ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False
)

X_train_cat = ohe.fit_transform(X_train_encoded[categorical_cols])
X_val_cat = ohe.transform(X_val_encoded[categorical_cols])

# Convert encoded variables to pandas dataframe
X_train_cat = pd.DataFrame(X_train_cat, 
                            columns=ohe.get_feature_names_out(categorical_cols),
                            index=X_train_encoded.index)

X_val_cat = pd.DataFrame(X_val_cat, 
                           columns=ohe.get_feature_names_out(categorical_cols),
                           index=X_val_encoded.index)

# Recombine the numeric and categorical features
X_train_final = pd.concat([
    X_train_encoded.drop(columns=categorical_cols),
    X_train_cat
], axis=1)

X_val_final = pd.concat([
    X_val_encoded.drop(columns=categorical_cols),
    X_val_cat
], axis=1)


## Modeling and Prediction

### Train Random Forrest Model

In [21]:
# Best parameters: {'n_estimators': 355, 'max_depth': 22, 'min_samples_split': 13, 'min_samples_leaf': 5}

rf = RandomForestRegressor(n_estimators=355, max_depth=22, min_samples_split=13, 
                           min_samples_leaf=5, random_state=42, n_jobs=-1)
rf.fit(X_train_final, y_train)

0,1,2
,n_estimators,355
,criterion,'squared_error'
,max_depth,22
,min_samples_split,13
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
y_train_pred = rf.predict(X_train_final)
y_pred = rf.predict(X_val_final)


print("Training R2 Score:", r2_score(y_train, y_train_pred))
print("----------------------------------------")
print(f"Validation R2 Score: {r2_score(y_val, y_pred)}")
print(f"MAE: {mean_absolute_error(y_val, y_pred)}")
print(f"MSE: {np.sqrt(mean_squared_error(y_val, y_pred))}")

Training R2 Score: 0.7976371632139638
----------------------------------------
Validation R2 Score: 0.7004842408125016
MAE: 1863372662.5438197
MSE: 4394434838.369714


### Train LGBoost Model

In [23]:
best_params = {'n_estimators': 1147,
                'learning_rate': 0.013036971615551675,
                'num_leaves': 129,
                'max_depth': 22,
                'min_child_samples': 97,
                'subsample': 0.6410214697523549,
                'colsample_bytree': 0.6209832697195263,
                "random_state": 42,
                "n_jobs": -1
                }


lgb_model = lgb.LGBMRegressor(**best_params)
lgb_model.fit(X_train_final, y_train)

y_train_pred = lgb_model.predict(X_train_final)
y_pred = lgb_model.predict(X_val_final)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 402607, number of used features: 57
[LightGBM] [Info] Start training from score 5648548811.299476


In [24]:
print("Training R2 Score:", r2_score(y_train, y_train_pred))
print("----------------------------------------")
print(f"Validation R2 Score: {r2_score(y_val, y_pred)}")
print(f"MAE: {mean_absolute_error(y_val, y_pred)}")
print(f"MSE: {np.sqrt(mean_squared_error(y_val, y_pred))}")

Training R2 Score: 0.7420853657876887
----------------------------------------
Validation R2 Score: 0.7125351828460385
MAE: 1871345692.3176231
MSE: 4305122757.128823


In [None]:
# delete variables to free up memory
del price_data, price_data_1, price_data_filtered, train_data, val_data, X_train
del X_val_cat, y_train, y_train_pred, y_val, X_train_cat, X_train_encoded
del X_train_final, X_val, X_val_encoded, X_val_final, y_pred