# Machine Learning - Total Credit Prediction

---

#### **Import libraries**

In [96]:
import dtale
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Load & Prepare Data for Total Credit Prediction

In [131]:
data = pd.read_csv('../Divar Dataset/Divar.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)

data.head()


Columns (11,27,29,53) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,cat2_slug,cat3_slug,city_slug,neighborhood_slug,created_at_month,user_type,description,title,rent_mode,rent_value,...,property_type,regular_person_capacity,extra_person_capacity,cost_per_extra_person,rent_price_on_regular_days,rent_price_on_special_days,rent_price_at_weekends,location_latitude,location_longitude,location_radius
0,temporary-rent,villa,karaj,mehrshahr,2024-08-01 00:00:00,مشاور املاک,۵۰۰متر\n۲۰۰متر بنا دوبلکس\n۳خواب\nاستخر آبگرم ...,باغ ویلا اجاره روزانه استخر داخل لشکرآباد سهیلیه,,,...,,4.0,6.0,350000.0,1500000.0,3500000000.0,3500000.0,35.811684,50.9366,500.0
1,residential-sell,apartment-sell,tehran,gholhak,2024-05-01 00:00:00,مشاور املاک,دسترسی عالی به مترو و شریعتی \nمشاعات تمیز \nب...,۶۰ متر قلهک فول امکانات,,,...,,,,,,,,,,500.0
2,residential-rent,apartment-rent,tehran,tohid,2024-10-01 00:00:00,,تخلیه پایان ماه,آپارتمان ۳ خوابه ۱۳۲ متر,مقطوع,26000000.0,...,,,,,,,,35.703865,51.373459,
3,commercial-rent,office-rent,tehran,elahiyeh,2024-06-01 00:00:00,,فرشته تاپ لوکیشن\n۹۰ متر موقعیت اداری\nیک اتاق...,فرشته ۹۰ متر دفتر کار مدرن موقعیت اداری,مقطوع,95000000.0,...,,,,,,,,,,
4,residential-sell,apartment-sell,mashhad,emamreza,2024-05-01 00:00:00,مشاور املاک,هلدینگ ساختمانی اکبری\n\nهمراه شما هستیم برای ...,۱۱۵ متری/شمالی رو به آفتاب/اکبری,,,...,,,,,,,,,,


**Extract records with rent/credit values:**

In [265]:
# Extract rows where price_value is NaN
credit_data = data.loc[data['price_value'].isna()]
credit_data = credit_data.drop('price_value', axis=1) # Drop the price_value column

credit_data.shape

(431654, 59)

In [266]:
def calculate_total_credit(rent, credit):
    """
    Calculate the total credit based on the given rent and credit values.
    """
    if rent < 0 or credit < 0:
        return 0

    credit_total = credit + (rent * 100) / 3
    transformed_price = credit_total * 6
    
    return transformed_price

In [267]:
# Delete rows where either credit_value or rent_value is NaN
credit_data = credit_data.dropna(subset=["rent_value", "credit_value"])

# Apply the function to calculate the total credit
credit_data["total_credit"] = credit_data.apply(
    lambda row: calculate_total_credit(row["rent_value"], row["credit_value"]),
    axis=1
)

# Drop rent_value and credit_value columns
credit_data = credit_data.drop(["rent_value", "credit_value"], axis=1)

credit_data.total_credit.head()

2     9.700000e+09
3     2.470000e+10
5     2.700000e+09
6     4.100000e+09
11    7.200000e+09
Name: total_credit, dtype: float64

In [268]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 351192 entries, 2 to 999999
Data columns (total 58 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   cat2_slug                   351192 non-null  object 
 1   cat3_slug                   351192 non-null  object 
 2   city_slug                   351191 non-null  object 
 3   neighborhood_slug           177341 non-null  object 
 4   created_at_month            351192 non-null  object 
 5   user_type                   102689 non-null  object 
 6   description                 351192 non-null  object 
 7   title                       351161 non-null  object 
 8   rent_mode                   351192 non-null  object 
 9   rent_to_single              15 non-null      object 
 10  rent_type                   102217 non-null  object 
 11  price_mode                  0 non-null       object 
 12  credit_mode                 351192 non-null  object 
 13  rent_credit_transfo

## Preprocessing

### **- Feature Engineering**

##### Drop some features intuitively:
We decided to drop these features since we realizes they can't have a significant effect on our prediction.


In [269]:
credit_data = credit_data.drop(['title', 'description', 'created_at_month', 'user_type', 'location_radius'], axis=1)

Drop the unnecessary features related to price:

In [270]:
credit_data = credit_data.drop(
    ['rent_mode', 'rent_to_single', 'rent_type', 'price_mode', 'credit_mode', 'rent_credit_transform',
        'transformable_price', 'transformable_credit', 'transformed_credit', 'transformable_rent', 'transformed_rent'], axis=1)

Drop features with too many missing values:

In [271]:
credit_data = credit_data.drop(
    ['has_water', 'has_electricity', 'has_gas', 'regular_person_capacity', 'extra_person_capacity',
    'cost_per_extra_person', 'rent_price_on_regular_days', 'rent_price_on_special_days', 'rent_price_at_weekends',
    'property_type', 'deed_type', 'has_business_deed'], axis=1)

In [272]:
# Only numeric features
credit_data.describe()

Unnamed: 0,land_size,building_size,location_latitude,location_longitude,total_credit
count,64398.0,351190.0,234791.0,234791.0,351192.0
mean,4622.749,3298.652,34.883997,51.765549,8500659000000.0
std,119472.6,121169.9,2.34877,3.196032,779275500000000.0
min,1.0,1.0,23.636976,43.305359,206.0
25%,93.0,68.0,34.437214,50.877888,1640000000.0
50%,140.0,92.0,35.719067,51.37101,2900000000.0
75%,210.0,130.0,35.854118,51.716091,5720000000.0
max,10000000.0,10000000.0,39.808537,74.51162,2.06e+17


**Numeric Features:**

land size and building size??
latitude and longitude??

Binning `floor` and `total_floors_count`:

In [273]:
# First clean the features
import numpy as np

def clean_floor(x):
    if pd.isna(x):
        return np.nan
    if x == "unselect":
        return np.nan
    x = str(x).strip()
    if x.endswith("+"):
        return float(x.replace("+", ""))
    return float(x)

credit_data["floor"] = credit_data["floor"].apply(clean_floor)
credit_data["total_floors_count"] = credit_data["total_floors_count"].apply(clean_floor)

In [274]:
# Bin the floor and total_floors_count columns
floor_bins = [-1, 0, 3, 7, 12, 20, 30, np.inf]
floor_labels = ["underground", "low_floors", "mid_low_floors", "mid_floors","high_floors","very_high", "top_floors"]

total_floor_bins = [0, 3, 7, 12, 20, 30, np.inf]
total_floor_labels = ["low_floors", "mid_low_floors", "mid_floors","high_floors","very_high", "top_floors"]

credit_data["floor"] = pd.cut(credit_data["floor"], bins=floor_bins, labels=floor_labels)
credit_data["total_floors_count"] = pd.cut(credit_data["total_floors_count"], bins=total_floor_bins, labels=total_floor_labels)

In [275]:
# Handle their missings under the unselect category
credit_data["floor"] = credit_data["floor"].cat.add_categories("unselect")
credit_data["floor"] = credit_data["floor"].fillna("unselect")

credit_data["total_floors_count"] = credit_data["total_floors_count"].cat.add_categories("unselect")
credit_data["total_floors_count"] = credit_data["total_floors_count"].fillna("unselect")

In [276]:
credit_data['floor'].value_counts()

floor
low_floors        153061
unselect          122668
mid_low_floors     49331
underground        20561
mid_floors          3914
high_floors         1161
very_high            496
top_floors             0
Name: count, dtype: int64

Make the `rooms_count` values English:

In [277]:
room_map = {
    "بدون اتاق": "0_rooms",
    "یک": "1_room",
    "دو": "2_rooms",
    "سه": "3_rooms",
    "چهار": "4_rooms",
    "پنج یا بیشتر": "5plus_rooms"
}

# Map the values
credit_data["rooms_count"] = credit_data["rooms_count"].map(room_map)

In [278]:
# Handle missing values (map to unselect)
credit_data["rooms_count"] = credit_data["rooms_count"].fillna("unselect")

In [279]:
credit_data.rooms_count.value_counts()

rooms_count
2_rooms        154435
1_room          90030
3_rooms         50309
0_rooms         45345
4_rooms          6193
5plus_rooms      4580
unselect          300
Name: count, dtype: int64

Bin `unit_per_floor` feature:

In [280]:
# First convert everything to numeric if possible
def clean_unit_per_floor(x):
    if pd.isna(x):
        return np.nan

    x = str(x).strip()
    
    if x == "unselect":
        return np.nan
    
    if x == "more_than_8":
        return 9.0   # Arbitrary value for binning
    
    try:
        return float(x)
    except ValueError:
        return np.nan

credit_data["unit_per_floor"] = credit_data["unit_per_floor"].apply(clean_unit_per_floor)

In [281]:
# Bin the unit_per_floor columns
bins = [0, 1, 2, 3, 4, 6, 8, np.inf]
labels = ["1", "2", "3", "4", "5-6", "7-8", "9+"]

credit_data["unit_per_floor"] = pd.cut(credit_data["unit_per_floor"], bins=bins, labels=labels, include_lowest=True)

In [282]:
# Handle missings
credit_data["unit_per_floor"] = (credit_data["unit_per_floor"].cat.add_categories("unselect").fillna("unselect"))

In [283]:
credit_data.unit_per_floor.value_counts()

unit_per_floor
unselect    234386
2            44562
1            43515
4            12190
3            10880
5-6           3486
7-8           1526
9+             647
Name: count, dtype: int64

Create `building_age` from `construction_year` feature:

In [284]:
# First convert construction_year to building_age

# Digits conversion to english
def persian_to_english(sample_input: str):
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'
    english_digits = '0123456789'
    trans_table = str.maketrans(persian_digits, english_digits)
    return sample_input.translate(trans_table)

credit_data['construction_year'] = credit_data.loc[:, 'construction_year'].apply(
    lambda x: persian_to_english(x) if isinstance(x, str) else x)

# change to only digits
credit_data['construction_year'] = credit_data['construction_year'].replace('قبل از 1370', '1370')
# change to numeric type
credit_data['construction_year'] = pd.to_numeric(credit_data['construction_year'], errors='coerce')

# Create building_age column
credit_data['building_age'] = 1404 - credit_data['construction_year']
credit_data = credit_data.drop(columns=['construction_year'])

In [285]:
credit_data.building_age.describe()

count    350800.000000
mean         10.837041
std           7.848751
min           1.000000
25%           4.000000
50%           9.000000
75%          15.000000
max          34.000000
Name: building_age, dtype: float64

In [286]:
# Bin the building_age feature
bins = [0, 5, 10, 20, 30, np.inf]
labels = ["new", "relatively_new", "mid_age", "old", "very_old"]

credit_data["building_age"] = pd.cut(credit_data["building_age"], bins=bins, labels=labels)

In [287]:
# Handle missings
credit_data["building_age"] = (credit_data["building_age"].cat.add_categories("unselect").fillna("unselect"))

In [288]:
credit_data.building_age.value_counts()

building_age
mid_age           116359
new               103506
relatively_new     92336
old                27016
very_old           11583
unselect             392
Name: count, dtype: int64

**Categorical Features:**

neighborhood_slug & city_slug have too many unique values ->. Encoding Issue <br>

Handle missings of categorical features: Map **NaNs** to '**unselect**'

In [289]:
credit_data["has_warm_water_provider"] = credit_data["has_warm_water_provider"].fillna("unselect")
credit_data.has_warm_water_provider.value_counts()

has_warm_water_provider
unselect        201274
package          69992
water_heater     65930
powerhouse       13996
Name: count, dtype: int64

In [290]:
credit_data["has_heating_system"] = credit_data["has_heating_system"].fillna("unselect")
credit_data.has_heating_system.value_counts()

has_heating_system
unselect         207195
shoofaj           71830
heater            55312
duct_split         8666
floor_heating      2362
split              2089
fan_coil           2005
fireplace          1733
Name: count, dtype: int64

In [291]:
credit_data["has_cooling_system"] = credit_data["has_cooling_system"].fillna("unselect")
credit_data.has_cooling_system.value_counts()

has_cooling_system
unselect           214362
water_cooler        94807
air_conditioner     16571
split               12645
duct_split          10820
fan_coil             1987
Name: count, dtype: int64

In [292]:
credit_data["has_restroom"] = credit_data["has_restroom"].fillna("unselect")
credit_data.has_restroom.value_counts()

has_restroom
unselect      191778
squat          80845
squat_seat     75207
seat            3362
Name: count, dtype: int64

In [293]:
credit_data["building_direction"] = credit_data["building_direction"].fillna("unselect")
credit_data.building_direction.value_counts()

building_direction
unselect    233748
south        58556
north        51781
east          4421
west          2686
Name: count, dtype: int64

In [294]:
credit_data["floor_material"] = credit_data["floor_material"].fillna("unselect")
credit_data.floor_material.value_counts()

floor_material
unselect            192603
ceramic             116366
stone                16156
carpet                9472
mosaic                6735
wood_parquet          5482
laminate_parquet      3670
floor_covering         708
Name: count, dtype: int64

In [263]:
# Remove 1 missing value from city_slug
credit_data["city_slug"] = credit_data["city_slug"].fillna("unselect")
credit_data.city_slug.value_counts()

city_slug
tehran           83595
mashhad          28301
karaj            18843
shiraz           14771
isfahan          13200
                 ...  
aalasht              1
nokandeh             1
arjmand              1
unselect             1
khoshroud-pey        1
Name: count, Length: 415, dtype: int64

In [264]:
credit_data["neighborhood_slug"] = credit_data["neighborhood_slug"].fillna("unselect")
credit_data.neighborhood_slug.value_counts()

neighborhood_slug
unselect                173851
saadat-abad               2671
poonak                    2334
ghasemabad                2310
elahiyehblvd              2100
                         ...  
shahrak-pardisan             1
shahid-avini-lahijan         1
zeynabeiyeh                  1
moallem                      1
janbazan-rasht               1
Name: count, Length: 1111, dtype: int64

**Boolean Features:**

Create two new columns called `has_luxury` & `has_non_luxury`:

In [252]:
# First fix has_balcony column (map true to True, false to False, and unselect to False)
credit_data["has_balcony"] = credit_data["has_balcony"].replace({"true": True})
credit_data["has_balcony"] = credit_data["has_balcony"].replace({"false": False})
credit_data["has_balcony"] = credit_data["has_balcony"].replace({"unselect": False})

In [253]:
# Create a columns called has_luxury
luxury_cols = ["has_pool", "has_sauna", "has_barbecue", "has_jacuzzi", "has_security_guard"]
non_luxury_cols = ['has_balcony', 'has_elevator', 'has_warehouse', 'has_parking']

credit_data["luxury_score"] = credit_data[luxury_cols].sum(axis=1) #! It can also convert to boolean
credit_data["non_luxury_score"] = credit_data[non_luxury_cols].sum(axis=1)

credit_data = credit_data.drop(luxury_cols, axis=1)
credit_data = credit_data.drop(non_luxury_cols, axis=1)

### **- Handle Missing & Unselect Values**

**Numeric Features:**

**Categorical Features**

**Boolean Features:**

### **- Handle Extreme Outliers** <br>
There are very large and very small, unreasonable values for total credit that might cause errors on model. We simply cut them from dataset.

### **- Train/Val/Test Split** <br>


Split off the test set:

In [57]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X = credit_data.drop(columns=["total_credit"])
y = credit_data["total_credit"]

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42
)

Split training into train and validation:

In [58]:
# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1765,
    random_state=42
)

## Train the Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

## Evaluate the Model

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

y_pred = rf.predict(X_val)

print("R2:", r2_score(y_val, y_pred))
print("MAE:", mean_absolute_error(y_val, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_val, y_pred)))