# Machine Learning - Total Credit Prediction

---

#### **Import libraries**

In [2430]:
import dtale
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Load & Prepare Data for Total Credit Prediction

In [2431]:
# Load the raw Divar export and discard its 'Unnamed: 0' column.
# NOTE(review): pandas warns about mixed dtypes in several columns on this
# read; consider passing an explicit `dtype=` mapping once the intended
# column types are confirmed.
data = pd.read_csv('../Divar Dataset/Divar.csv').drop(columns='Unnamed: 0')

data.head()


Columns (11,27,29,53) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,cat2_slug,cat3_slug,city_slug,neighborhood_slug,created_at_month,user_type,description,title,rent_mode,rent_value,...,property_type,regular_person_capacity,extra_person_capacity,cost_per_extra_person,rent_price_on_regular_days,rent_price_on_special_days,rent_price_at_weekends,location_latitude,location_longitude,location_radius
0,temporary-rent,villa,karaj,mehrshahr,2024-08-01 00:00:00,مشاور املاک,۵۰۰متر\n۲۰۰متر بنا دوبلکس\n۳خواب\nاستخر آبگرم ...,باغ ویلا اجاره روزانه استخر داخل لشکرآباد سهیلیه,,,...,,4.0,6.0,350000.0,1500000.0,3500000000.0,3500000.0,35.811684,50.9366,500.0
1,residential-sell,apartment-sell,tehran,gholhak,2024-05-01 00:00:00,مشاور املاک,دسترسی عالی به مترو و شریعتی \nمشاعات تمیز \nب...,۶۰ متر قلهک فول امکانات,,,...,,,,,,,,,,500.0
2,residential-rent,apartment-rent,tehran,tohid,2024-10-01 00:00:00,,تخلیه پایان ماه,آپارتمان ۳ خوابه ۱۳۲ متر,مقطوع,26000000.0,...,,,,,,,,35.703865,51.373459,
3,commercial-rent,office-rent,tehran,elahiyeh,2024-06-01 00:00:00,,فرشته تاپ لوکیشن\n۹۰ متر موقعیت اداری\nیک اتاق...,فرشته ۹۰ متر دفتر کار مدرن موقعیت اداری,مقطوع,95000000.0,...,,,,,,,,,,
4,residential-sell,apartment-sell,mashhad,emamreza,2024-05-01 00:00:00,مشاور املاک,هلدینگ ساختمانی اکبری\n\nهمراه شما هستیم برای ...,۱۱۵ متری/شمالی رو به آفتاب/اکبری,,,...,,,,,,,,,,


##### **Extract records with rent/credit values:**

In [2432]:
# Keep only the listings whose price_value is missing, then remove the
# price_value column, which is entirely NaN in that subset.
has_no_price = data['price_value'].isna()
credit_data = data.loc[has_no_price].drop(columns='price_value')

credit_data.shape

(431654, 59)

In [2433]:
def calculate_total_credit(rent, credit):
    """
    Merge a rent value and a credit value into a single total-credit figure.

    The rent is scaled by 100 / 3 and added to the credit; the sum is then
    multiplied by 6. If either input is negative the result is 0.
    """
    any_negative = rent < 0 or credit < 0
    if any_negative:
        return 0

    # Same operation order as the formula above, so floating-point results
    # stay bit-for-bit stable.
    return (credit + (rent * 100) / 3) * 6

In [2434]:
# A total can only be computed when both inputs are present, so rows missing
# either rent_value or credit_value are discarded first.
rows_with_both = credit_data.dropna(subset=["rent_value", "credit_value"])

# Row-wise total credit, aligned on the original index.
total_credit = rows_with_both.apply(
    lambda listing: calculate_total_credit(listing["rent_value"], listing["credit_value"]),
    axis=1
)

# Attach the target and drop the two raw inputs it was derived from.
credit_data = (
    rows_with_both
    .assign(total_credit=total_credit)
    .drop(columns=["rent_value", "credit_value"])
)

credit_data.total_credit.head()

2     9.700000e+09
3     2.470000e+10
5     2.700000e+09
6     4.100000e+09
11    7.200000e+09
Name: total_credit, dtype: float64

In [2435]:
# Per-column overview (non-null counts and dtypes) of the rent/credit subset.
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 351192 entries, 2 to 999999
Data columns (total 58 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   cat2_slug                   351192 non-null  object 
 1   cat3_slug                   351192 non-null  object 
 2   city_slug                   351191 non-null  object 
 3   neighborhood_slug           177341 non-null  object 
 4   created_at_month            351192 non-null  object 
 5   user_type                   102689 non-null  object 
 6   description                 351192 non-null  object 
 7   title                       351161 non-null  object 
 8   rent_mode                   351192 non-null  object 
 9   rent_to_single              15 non-null      object 
 10  rent_type                   102217 non-null  object 
 11  price_mode                  0 non-null       object 
 12  credit_mode                 351192 non-null  object 
 13  rent_credit_transfo

## Preprocessing

### **- Feature Engineering**

##### Drop some features intuitively:
We decided to drop these features because we concluded they are unlikely to have a significant effect on our prediction.


In [2436]:
# Free-text, date, user and radius columns that are excluded from modelling.
intuitive_drop_cols = ['title', 'description', 'created_at_month', 'user_type', 'location_radius']
credit_data = credit_data.drop(columns=intuitive_drop_cols)

Drop the unnecessary features related to price:

In [2437]:
# Price/rent bookkeeping columns that are not used as predictors.
price_related_cols = [
    'rent_mode', 'rent_to_single', 'rent_type', 'price_mode', 'credit_mode',
    'rent_credit_transform', 'transformable_price', 'transformable_credit',
    'transformed_credit', 'transformable_rent', 'transformed_rent',
]
credit_data = credit_data.drop(columns=price_related_cols)

Drop features with too many missing values:

In [2438]:
# Columns dropped because too many of their values are missing.
sparse_cols = [
    'has_water', 'has_electricity', 'has_gas',
    'regular_person_capacity', 'extra_person_capacity', 'cost_per_extra_person',
    'rent_price_on_regular_days', 'rent_price_on_special_days', 'rent_price_at_weekends',
    'property_type', 'deed_type', 'has_business_deed',
]
credit_data = credit_data.drop(columns=sparse_cols)

In [2439]:
# Summary statistics of the remaining columns; the output covers only the
# numeric features.
credit_data.describe()

Unnamed: 0,land_size,building_size,location_latitude,location_longitude,total_credit
count,64398.0,351190.0,234791.0,234791.0,351192.0
mean,4622.749,3298.652,34.883997,51.765549,8500659000000.0
std,119472.6,121169.9,2.34877,3.196032,779275500000000.0
min,1.0,1.0,23.636976,43.305359,206.0
25%,93.0,68.0,34.437214,50.877888,1640000000.0
50%,140.0,92.0,35.719067,51.37101,2900000000.0
75%,210.0,130.0,35.854118,51.716091,5720000000.0
max,10000000.0,10000000.0,39.808537,74.51162,2.06e+17


**Numeric Features:**

Delete land_size since it has many missing values;

In [2440]:
# land_size is present for only a small share of rows, so it is removed
# rather than imputed.
credit_data = credit_data.drop(columns='land_size')

Binning `floor` and `total_floors_count`:

In [2441]:
# First clean the features
import numpy as np

def clean_floor(x):
    """
    Convert a raw floor value to a float.

    Missing values and the "unselect" placeholder become NaN. A value ending
    in "+" (e.g. "30+") is parsed from its numeric part. Anything else is
    passed to float(), which raises ValueError for non-numeric text.
    """
    if pd.isna(x):
        return np.nan

    # Strip before the placeholder check so that a padded " unselect " is
    # recognised instead of reaching float() and raising.
    text = str(x).strip()
    if text == "unselect":
        return np.nan

    if text.endswith("+"):
        text = text.replace("+", "")
    return float(text)

# Both floor columns share the same raw encoding, so they go through the
# same cleaner.
for floor_col in ("floor", "total_floors_count"):
    credit_data[floor_col] = credit_data[floor_col].apply(clean_floor)

In [2442]:
# Bin the floor and total_floors_count columns
floor_bins = [-1, 0, 3, 7, 12, 20, 30]
floor_labels = ["underground", "low_floors", "mid_low_floors", "mid_floors","high_floors","very_high"]

total_floor_bins = [0, 3, 7, 12, 20, 30]
total_floor_labels = ["low_floors", "mid_low_floors", "mid_floors","high_floors","very_high"]

# pd.cut builds right-closed intervals, so without include_lowest the first
# floor bin is (-1, 0] and a floor of exactly -1 falls outside every bin and
# becomes NaN. include_lowest=True closes that first interval on the left so
# -1 is labelled "underground" together with 0.
credit_data["floor"] = pd.cut(
    credit_data["floor"], bins=floor_bins, labels=floor_labels, include_lowest=True
)
credit_data["total_floors_count"] = pd.cut(credit_data["total_floors_count"], bins=total_floor_bins, labels=total_floor_labels)

In [2443]:
# Values that did not land in any bin are NaN; give them their own
# "unselect" category. The label must be registered on the categorical
# before it can be used as a fill value.
for binned_col in ("floor", "total_floors_count"):
    with_placeholder = credit_data[binned_col].cat.add_categories("unselect")
    credit_data[binned_col] = with_placeholder.fillna("unselect")

In [2444]:
# Distribution of the binned floor categories, including the "unselect" bucket.
credit_data['floor'].value_counts()

floor
low_floors        153061
unselect          122668
mid_low_floors     49331
underground        20561
mid_floors          3914
high_floors         1161
very_high            496
Name: count, dtype: int64

Make the `rooms_count` values English:

In [2445]:
# Translation table from the Persian rooms_count labels to English category
# names (from "no rooms" up to "five or more").
room_map = {
    "بدون اتاق": "0_rooms",
    "یک": "1_room",
    "دو": "2_rooms",
    "سه": "3_rooms",
    "چهار": "4_rooms",
    "پنج یا بیشتر": "5plus_rooms"
}

# Map the values
# Series.map turns any value that is not a key of room_map into NaN.
credit_data["rooms_count"] = credit_data["rooms_count"].map(room_map)

In [2446]:
# Handle missing values (map to unselect): rows whose rooms_count is NaN
# after the mapping step are collected in a single "unselect" category.
credit_data["rooms_count"] = credit_data["rooms_count"].fillna("unselect")
credit_data.rooms_count.value_counts()

rooms_count
2_rooms        154435
1_room          90030
3_rooms         50309
0_rooms         45345
4_rooms          6193
5plus_rooms      4580
unselect          300
Name: count, dtype: int64

Bin `unit_per_floor` feature:

In [2447]:
# First convert everything to numeric if possible
def clean_unit_per_floor(x):
    """
    Convert a raw unit_per_floor value to a float.

    Missing values, the "unselect" placeholder and unparsable text become
    NaN. The open-ended "more_than_8" label is represented by 9.0 so that it
    falls into the top bin later on.
    """
    if pd.isna(x):
        return np.nan

    token = str(x).strip()

    # Non-numeric labels with a fixed meaning.
    special_tokens = {"unselect": np.nan, "more_than_8": 9.0}
    if token in special_tokens:
        return special_tokens[token]

    try:
        parsed = float(token)
    except ValueError:
        parsed = np.nan
    return parsed

# Replace the raw unit_per_floor values with their numeric form (NaN where
# no number could be derived).
credit_data["unit_per_floor"] = credit_data["unit_per_floor"].apply(clean_unit_per_floor)

In [2448]:
# Bin unit_per_floor into labelled ranges. Intervals are right-closed, and
# include_lowest=True additionally lets a value of exactly 0 fall into the
# first bin.
bins = [0, 1, 2, 3, 4, 6, 8, np.inf]
labels = ["1", "2", "3", "4", "5-6", "7-8", "9+"]

units_binned = pd.cut(
    credit_data["unit_per_floor"],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
credit_data["unit_per_floor"] = units_binned

In [2449]:
# Handle missings: register "unselect" as a category on the binned column,
# then use it for every row that has no bin.
credit_data["unit_per_floor"] = (credit_data["unit_per_floor"].cat.add_categories("unselect").fillna("unselect"))
credit_data.unit_per_floor.value_counts()

unit_per_floor
unselect    234386
2            44562
1            43515
4            12190
3            10880
5-6           3486
7-8           1526
9+             647
Name: count, dtype: int64

Create `building_age` from `construction_year` feature:

In [2450]:
# First convert construction_year to building_age

# Digits conversion to english
def persian_to_english(sample_input: str):
    """Return `sample_input` with the Persian digits ۰-۹ replaced by 0-9.

    All other characters are left untouched.
    """
    digit_table = str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789')
    return sample_input.translate(digit_table)

# Normalise digits in the string entries only; non-string values (e.g. NaN)
# pass through unchanged.
construction_year = credit_data['construction_year'].apply(
    lambda value: persian_to_english(value) if isinstance(value, str) else value
)

# The "before 1370" label is collapsed onto 1370 so that it parses as a year.
construction_year = construction_year.replace('قبل از 1370', '1370')

# Anything that still is not a number becomes NaN.
construction_year = pd.to_numeric(construction_year, errors='coerce')

# Age is measured relative to the year 1404; the source column is then dropped.
credit_data['building_age'] = 1404 - construction_year
credit_data = credit_data.drop(columns=['construction_year'])

credit_data.building_age.describe()

count    350800.000000
mean         10.837041
std           7.848751
min           1.000000
25%           4.000000
50%           9.000000
75%          15.000000
max          34.000000
Name: building_age, dtype: float64

In [2451]:
# Bin building_age into labelled age ranges (right-closed intervals; the
# last bin is open-ended).
bins = [0, 5, 10, 20, 30, np.inf]
labels = ["new", "relatively_new", "mid_age", "old", "very_old"]

age_binned = pd.cut(credit_data["building_age"], bins=bins, labels=labels)
credit_data["building_age"] = age_binned

In [2452]:
# Handle missings: register "unselect" as a category on the binned column,
# then use it for every row without an age bin.
credit_data["building_age"] = (credit_data["building_age"].cat.add_categories("unselect").fillna("unselect"))
credit_data.building_age.value_counts()

building_age
mid_age           116359
new               103506
relatively_new     92336
old                27016
very_old           11583
unselect             392
Name: count, dtype: int64

**Categorical Features:**

Group rare categories as `Other` in `cat3_slug` and `cat2_slug`:

In [2453]:
def group_rare(series, min_freq=0.01):
    """
    Collapse infrequent categories of `series` into a single "Other" label.

    A category is considered rare when its share of the non-missing values is
    strictly below `min_freq`. Returns a new Series.
    """
    shares = series.value_counts(normalize=True)
    rare_labels = shares.index[shares < min_freq]
    return series.replace(rare_labels, "Other")

# Merge cat3_slug categories below the default 1% share into "Other" and
# inspect the resulting distribution.
credit_data["cat3_slug"] = group_rare(credit_data["cat3_slug"])
credit_data.cat3_slug.value_counts()

cat3_slug
apartment-rent                        210859
house-villa-rent                       64405
shop-rent                              45744
office-rent                            21157
industry-agriculture-business-rent      9016
Other                                     11
Name: count, dtype: int64

In [2454]:
# Same rare-category grouping for cat2_slug (default 1% threshold).
credit_data["cat2_slug"] = group_rare(credit_data["cat2_slug"])
credit_data.cat2_slug.value_counts()

cat2_slug
residential-rent    275264
commercial-rent      75917
Other                   11
Name: count, dtype: int64

**Boolean Features:**

Create two new columns called `luxury_items` & `non_luxury_items`:

In [2455]:
# Normalise has_balcony: the strings "true"/"false" become real booleans and
# the "unselect" placeholder is treated as False.
balcony_map = {"true": True, "false": False, "unselect": False}
credit_data["has_balcony"] = credit_data["has_balcony"].replace(balcony_map)

In [2456]:
# Count, per listing, how many luxury and how many ordinary amenities are
# present, then replace the individual flag columns with those two counts.
luxury_cols = ["has_pool", "has_sauna", "has_barbecue", "has_jacuzzi", "has_security_guard"]
non_luxury_cols = ['has_balcony', 'has_elevator', 'has_warehouse', 'has_parking']

amenity_counts = {
    "luxury_items": credit_data[luxury_cols].sum(axis=1),
    "non_luxury_items": credit_data[non_luxury_cols].sum(axis=1),
}

credit_data = (
    credit_data
    .assign(**amenity_counts)
    .drop(columns=luxury_cols + non_luxury_cols)
)

In [2457]:
def bin_luxury_items(x):
    """
    Turn an amenity count into a coarse label.

    NaN -> "unselect", 0 -> "none", up to 2 -> "low", up to 4 -> "medium",
    anything larger -> "high".
    """
    if pd.isna(x):
        return "unselect"
    if x == 0:
        return "none"

    # Ascending upper bounds; the first one that fits decides the label.
    for upper_bound, label in ((2, "low"), (4, "medium")):
        if x <= upper_bound:
            return label
    return "high"

In [2458]:
# Bin both amenity counts into coarse labels.
# TODO: You might be able to map NaNs to -1
credit_data["luxury_items"] = credit_data["luxury_items"].apply(bin_luxury_items)
credit_data["non_luxury_items"] = credit_data["non_luxury_items"].apply(bin_luxury_items)

In [2459]:
# Distribution of the binned non-luxury amenity counts.
credit_data['non_luxury_items'].value_counts()

non_luxury_items
medium    152365
low       118172
none       80655
Name: count, dtype: int64

### **- Clip Extreme Outliers** <br>
Total credit contains unreasonably large and unreasonably small values that could hurt the model. We simply remove those rows from the dataset using thresholds.

In [2460]:
# Numeric summary before outlier removal, to see how extreme the tails are.
credit_data.describe()

Unnamed: 0,building_size,location_latitude,location_longitude,total_credit
count,351190.0,234791.0,234791.0,351192.0
mean,3298.652,34.883997,51.765549,8500659000000.0
std,121169.9,2.34877,3.196032,779275500000000.0
min,1.0,23.636976,43.305359,206.0
25%,68.0,34.437214,50.877888,1640000000.0
50%,92.0,35.719067,51.37101,2900000000.0
75%,130.0,35.854118,51.716091,5720000000.0
max,10000000.0,39.808537,74.51162,2.06e+17


**Chosen total credit range:** `50,000,000` to `80,000,000,000`<br>

**↪Why?**<br>

In Iran's property market at this time, total rent values are mostly below 50 billion and above 50 million Tomans. We checked this on Divar, one of the largest online classified-ads marketplaces in Iran, where most advertisements fall within this range. In addition, some listed prices are outdated and unrealistically low; we treat them as noise and set them aside. <br>

In addition, when we go beyond this range, the values extremely increase and harm model predictions, plus we are using tree based models that act poorly when they get an input out of training range.

In [2461]:
# Keep only rows whose total_credit lies inside the chosen range; both
# bounds are inclusive.
credit_in_range = credit_data["total_credit"].between(50_000_000, 80_000_000_000)
credit_data = credit_data[credit_in_range]

**Chosen building size range:** `20` to `3000`<br>

**↪Why?**<br>

According to Iran's market, most published properties have a building size above 20 and below 3000, as we checked in the Divar application. Beyond this range, sizes become implausibly large or small and the corresponding prices become extreme.<br>

In addition, when we go beyond this range, the values extremely increase and harm model predictions, plus we are using tree based models that act poorly when they get an input out of training range.

In [2462]:
# Keep only rows whose building_size lies inside the chosen range; both
# bounds are inclusive. Rows with a missing building_size fail the
# comparison and are removed as well.
size_in_range = credit_data["building_size"].between(20, 3000)
credit_data = credit_data[size_in_range]

In [2463]:
# Numeric summary after outlier removal, to confirm the new value ranges.
credit_data.describe()

Unnamed: 0,building_size,location_latitude,location_longitude,total_credit
count,331164.0,222258.0,222258.0,331164.0
mean,119.76274,34.889715,51.781805,5424158000.0
std,152.081575,2.341752,3.188716,7824053000.0
min,20.0,23.636976,43.305359,50000000.0
25%,70.0,34.544466,50.887169,1760000000.0
50%,95.0,35.719765,51.371387,3000000000.0
75%,130.0,35.854941,51.723539,5800000000.0
max,3000.0,39.673721,74.51162,80000000000.0


### **- Train/Val/Test Split** <br>


Split off the **test set**:

In [2464]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Separate the target from the predictors.
target_col = "total_credit"
y = credit_data[target_col]
X = credit_data.drop(columns=[target_col])

# Hold out 15% of the rows as the final test set; the fixed seed keeps the
# split repeatable.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

Split training into **train** and **validation**:

In [2465]:
# Split the training data into training and validation sets.
# test_size here is a fraction of the remaining 85%, so the validation set is
# 0.85 * 0.1275 ~= 10.8% of the full dataset and training is ~= 74.2%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1275,
    random_state=42
)

### **- Handle Missing Values with Imputation**

In [2466]:
# Concat data sets: put features and target of each split side by side so
# that row-level operations (e.g. dropping rows) keep X and y aligned.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

**Numeric Features:**

Remove rows with missing **building_size**, since there are only two such rows:

In [2467]:
# Remove rows where building size is missing, in every split.
train_data, val_data, test_data = (
    split.dropna(subset=['building_size'])
    for split in (train_data, val_data, test_data)
)

Impute coordinate features (`location_latitude`, `location_longitude`) by grouping on `city_slug`:

In [2468]:
# Impute missing coordinates in the training set.
# The overall means are taken first, before any filling, so that they
# reflect observed coordinates only.
overall_lat_mean = train_data['location_latitude'].mean()
overall_long_mean = train_data['location_longitude'].mean()

coordinate_fallbacks = (
    ('location_latitude', overall_lat_mean),
    ('location_longitude', overall_long_mean),
)
for coord_col, overall_mean in coordinate_fallbacks:
    # Step 1: within each city, fill gaps with that city's mean.
    city_filled = (
        train_data
        .groupby('city_slug')[coord_col]
        .transform(lambda city_values: city_values.fillna(city_values.mean()))
    )
    # Step 2: whatever is still missing falls back to the overall mean.
    train_data[coord_col] = city_filled.fillna(overall_mean)



In [2469]:
# Fill lat/long missing values on the validation set using statistics
# computed from the training set only (per-city means, then the overall
# training mean), so no validation information leaks into the imputation.
city_lat_means = train_data.groupby('city_slug')['location_latitude'].mean()
city_long_means = train_data.groupby('city_slug')['location_longitude'].mean()

# Vectorised lookup instead of a Python-level apply over every row: map each
# row's city to its training mean, use it only where the coordinate is
# missing, and fall back to the overall mean for cities absent from training.
val_data['location_latitude'] = (
    val_data['location_latitude']
    .fillna(val_data['city_slug'].map(city_lat_means))
    .fillna(overall_lat_mean)
)

val_data['location_longitude'] = (
    val_data['location_longitude']
    .fillna(val_data['city_slug'].map(city_long_means))
    .fillna(overall_long_mean)
)

In [2470]:
# Impute missing test-set coordinates with the per-city means computed from
# the training set, falling back to the overall training mean for cities that
# are absent from training. Vectorised map + fillna replaces the per-row apply.

# location_lat
test_data['location_latitude'] = (
    test_data['location_latitude']
    .fillna(test_data['city_slug'].map(city_lat_means))
    .fillna(overall_lat_mean)
)

# location_long
test_data['location_longitude'] = (
    test_data['location_longitude']
    .fillna(test_data['city_slug'].map(city_long_means))
    .fillna(overall_long_mean)
)

**Categorical Features**

Remove rows with missing `city_slug`:

In [2471]:
# Drop the rows themselves. Assigning `df['city_slug'].dropna()` back to the
# column realigns on the index and restores the NaN, so it removes nothing.
train_data = train_data.dropna(subset=['city_slug'])
val_data = val_data.dropna(subset=['city_slug'])
test_data = test_data.dropna(subset=['city_slug'])

Handle missings of categorical features: Map **NaNs** to '**unselect**'

In [2472]:
# Categorical columns whose missing values get the explicit "unselect" label.
unselect_fill_cols = [
    "has_warm_water_provider",
    "has_heating_system",
    "has_cooling_system",
    "building_direction",
    "floor_material",
    "has_restroom",
]

# The fill value is a constant, so each split can be treated independently.
for split in (train_data, val_data, test_data):
    for fill_col in unselect_fill_cols:
        split[fill_col] = split[fill_col].fillna("unselect")

##### Split train, val, test back into features and target:

In [2473]:
# Separate each imputed split back into its feature matrix and target vector.
X_train = train_data.drop(columns=["total_credit"])
y_train = train_data["total_credit"]

X_val = val_data.drop(columns=["total_credit"])
y_val = val_data["total_credit"]

X_test = test_data.drop(columns=["total_credit"])
y_test = test_data["total_credit"]

### **- Feature Encoding** <br>


Encoding **Boolean Features**:

In [2474]:
def encode_boolean_features(df, boolean_columns):
    """
    Return a copy of `df` in which every column listed in `boolean_columns`
    is integer-coded: True -> 1, False -> 0, anything else (including
    missing values) -> -1. The input frame is not modified.
    """
    bool_codes = {True: 1, False: 0}
    encoded = df.copy()
    for name in boolean_columns:
        # Values outside the mapping come back as NaN and receive the -1 code.
        coded = encoded[name].map(bool_codes)
        encoded[name] = coded.fillna(-1).astype(int)
    return encoded

In [2475]:
# Columns to integer-code as 1 / 0 / -1 (see encode_boolean_features).
boolean_cols = ["is_rebuilt"]

X_train = encode_boolean_features(X_train, boolean_cols)
X_val   = encode_boolean_features(X_val, boolean_cols)
X_test  = encode_boolean_features(X_test, boolean_cols)

Encode **Categorical Features**:

Apply Target Encoding on `neighborhood_slug` and `city_slug`:

In [2476]:
from category_encoders import TargetEncoder

# Target-encode the two location columns.
te = TargetEncoder(
    cols=["neighborhood_slug", "city_slug"],
    smoothing=10
)

# The encoder is fitted on the training split only; validation and test are
# transformed with the statistics learned there.
X_train_encoded = te.fit_transform(X_train, y_train)
X_val_encoded = te.transform(X_val)
X_test_encoded = te.transform(X_test)

# Inspect the encoded columns (now numeric).
X_train_encoded[["neighborhood_slug", "city_slug"]].describe()

Unnamed: 0,neighborhood_slug,city_slug
count,245599.0,245599.0
mean,5437989000.0,5435972000.0
std,5037214000.0,3239365000.0
min,1254402000.0,1231083000.0
25%,3187169000.0,3017770000.0
50%,3187169000.0,4315107000.0
75%,5604635000.0,5972301000.0
max,34622820000.0,16511880000.0


Apply One-Hot Encoding to Categorical Features:

In [2477]:
# One-hot encode categorical features
from sklearn.preprocessing import OneHotEncoder


categorical_cols = ["cat2_slug",'cat3_slug', 'luxury_items', 'non_luxury_items', 'floor', 'total_floors_count', 'building_age',
                    'rooms_count', 'unit_per_floor', 'has_warm_water_provider', 'has_heating_system', 'has_cooling_system', 'has_restroom',
                    'building_direction', 'floor_material']

# Categories are learned from the training split only; categories that are
# first seen in validation/test are ignored by the encoder.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(X_train_encoded[categorical_cols])
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)


def _one_hot_frame(features):
    """One-hot encode the categorical columns of `features`, keeping its index."""
    return pd.DataFrame(
        ohe.transform(features[categorical_cols]),
        columns=ohe_feature_names,
        index=features.index,
    )


def _with_one_hot(features, one_hot):
    """Swap the raw categorical columns of `features` for their one-hot columns."""
    return pd.concat([features.drop(columns=categorical_cols), one_hot], axis=1)


# Convert encoded variables to pandas dataframes
X_train_cat = _one_hot_frame(X_train_encoded)
X_val_cat = _one_hot_frame(X_val_encoded)
X_test_cat = _one_hot_frame(X_test_encoded)

# Recombine the numeric and categorical features
X_train_final = _with_one_hot(X_train_encoded, X_train_cat)
X_val_final = _with_one_hot(X_val_encoded, X_val_cat)
X_test_final = _with_one_hot(X_test_encoded, X_test_cat)

In [2478]:
# Rows and total feature count of the fully encoded training matrix.
X_train_final.shape

(245599, 91)

## Train Random Forest Model

#### Hyperparameter tuning using Optuna:

In [2479]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    """
    Optuna objective for the random forest.

    Builds a RandomForestRegressor from the hyperparameters suggested by
    `trial`, fits it on the training split and returns the RMSE on the
    validation split (lower is better).
    """
    # Integer search ranges, inclusive on both ends.
    search_ranges = {
        "n_estimators": (200, 400),
        "max_depth": (5, 30),
        "min_samples_split": (2, 20),
        "min_samples_leaf": (1, 20),
    }
    params = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_ranges.items()
    }
    params.update(random_state=42, n_jobs=-1)

    forest = RandomForestRegressor(**params)
    forest.fit(X_train_final, y_train)

    val_predictions = forest.predict(X_val_final)
    return root_mean_squared_error(y_val, val_predictions)

In [None]:
# Seed the sampler so that the sequence of suggested hyperparameters, and
# therefore the best parameters found, is the same on every run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Hyperparameters of the trial with the lowest validation RMSE.
best_params = study.best_params
print(f"Best parameters: {best_params}")

Best parameters: {'n_estimators': 355, 'max_depth': 22, 'min_samples_split': 13, 'min_samples_leaf': 5}


#### Train the model:

In [2482]:
from sklearn.ensemble import RandomForestRegressor

# Final random forest, using the best hyperparameters reported by the
# tuning study above.
rf_params = {
    "n_estimators": 355,
    "max_depth": 22,
    "min_samples_split": 13,
    "min_samples_leaf": 5,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train_final, y_train)

0,1,2
,n_estimators,355
,criterion,'squared_error'
,max_depth,22
,min_samples_split,13
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### Evaluate Model on Training and Validation Set

In [2485]:
# Evaluate model on training & validation set
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Predictions on the data the forest was trained on and on the validation split.
y_train_pred = rf.predict(X_train_final)
y_pred = rf.predict(X_val_final)

print("Training R2 Score:", r2_score(y_train, y_train_pred))
print("----------------------------------------")
print(f"Validation R2 Score: {r2_score(y_val, y_pred)}")
print(f"MAE: {mean_absolute_error(y_val, y_pred)}")
# The square root of the MSE is reported, so the value is an RMSE.
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred))}")

Training R2 Score: 0.8402003466645869
----------------------------------------
Validation R2 Score: 0.7411609071015335
MAE: 1591485567.2407506
MSE: 3957320608.0626574


#### Evaluate the Model on Test Set

In [2486]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Final, one-off evaluation of the random forest on the held-out test split.
y_pred = rf.predict(X_test_final)

print("===== FINAL TEST PERFORMANCE =====")
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
# The square root of the MSE is reported, so the value is an RMSE.
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

===== FINAL TEST PERFORMANCE =====
R2 Score: 0.7481218609374374
MAE: 1567528851.181402
MSE: 3911627743.749535


## Train LightGBM Model

#### Hyperparameter tuning using Optuna:

In [2487]:
import lightgbm as lgb
from sklearn.metrics import root_mean_squared_error
def objective(trial):
    """
    Optuna objective for LightGBM.

    Builds an LGBMRegressor from the hyperparameters suggested by `trial`,
    fits it on the training split (with the validation split as eval set) and
    returns the RMSE on the validation split (lower is better).
    """
    # NOTE(review): per the LightGBM parameter docs, row subsampling
    # (`subsample`) only takes effect when `subsample_freq` > 0, which is not
    # set here - confirm whether tuning `subsample` is intended to matter.
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", -1, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
    }
    fixed = {"random_state": 42, "n_jobs": -1}

    booster = lgb.LGBMRegressor(**tuned, **fixed)
    booster.fit(
        X_train_final,
        y_train,
        eval_set=[(X_val_final, y_val)],
        eval_metric="rmse",
    )

    val_predictions = booster.predict(X_val_final)
    return root_mean_squared_error(y_val, val_predictions)

In [None]:
# Seed the sampler so that the sequence of suggested hyperparameters, and
# therefore the best parameters found, is the same on every run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

#### Train the Model

In [2490]:
import lightgbm as lgb

# Final LightGBM model with fixed hyperparameter values.
# NOTE(review): presumably these are the best values from the tuning study;
# that output is not shown in the notebook - confirm.
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes
# effect when `subsample_freq` > 0, which is not set here - confirm.
lgb_model = lgb.LGBMRegressor(
    n_estimators=1438, learning_rate=0.026898330387707108, num_leaves=141,
    max_depth=15, min_child_samples=41, subsample=0.7476375984955851, 
    colsample_bytree=0.6303192344346847, random_state=42, n_jobs=-1)

# The validation split is passed as eval set with RMSE as the tracked metric.
lgb_model.fit(
    X_train_final, y_train, eval_set=[(X_val_final, y_val)], 
    eval_metric="rmse"
    )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1315
[LightGBM] [Info] Number of data points in the train set: 245599, number of used features: 88
[LightGBM] [Info] Start training from score 5422392437.376488


0,1,2
,boosting_type,'gbdt'
,num_leaves,141
,max_depth,15
,learning_rate,0.026898330387707108
,n_estimators,1438
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


#### Evaluate Model on Training and Validation Set

In [2491]:
# Evaluate model on training & validation set
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Predictions on the data the model was trained on and on the validation split.
y_train_pred = lgb_model.predict(X_train_final)
y_pred = lgb_model.predict(X_val_final)

print("Training R2 Score:", r2_score(y_train, y_train_pred))
print("----------------------------------------")
print(f"Validation R2 Score: {r2_score(y_val, y_pred)}")
print(f"MAE: {mean_absolute_error(y_val, y_pred)}")
# The square root of the MSE is reported, so the value is an RMSE.
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred))}")

Training R2 Score: 0.8362424948926881
----------------------------------------
Validation R2 Score: 0.7467391038203293
MAE: 1545942398.4524536
MSE: 3914446589.816684


#### Evaluate the Model on Test Set

In [2492]:
# Final, one-off evaluation of the LightGBM model on the held-out test split.
y_pred = lgb_model.predict(X_test_final)

print("===== FINAL TEST PERFORMANCE =====")
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
# The square root of the MSE is reported, so the value is an RMSE.
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

===== FINAL TEST PERFORMANCE =====
R2 Score: 0.7549091038515141
MAE: 1539790556.5525806
MSE: 3858565435.7501316


In [2493]:
# Delete variables: remove the raw data, the feature matrices, the split
# sets, the fitted models and the predictions from the kernel namespace.
del X_train_final, X_val_final, X_test_final, X_train_encoded, X_val_encoded, X_test_encoded
del X_train, X_val, X_test, y_train, y_val, y_test
del data, credit_data, rf, lgb_model
del y_train_pred, y_pred, X_train_cat, X_val_cat, X_test_cat