In [9]:
# Import modules

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import time
import optuna

In [10]:
optimization_time_in_seconds = -time.time()

In [11]:
# Load and prepare data

df = pd.read_csv("houses_to_rent_v2.csv")

df['floor'] = df['floor'].replace('-',0)
df['floor'] = df['floor'].astype('int64')

num_var = []
for i in df.select_dtypes(include=['int64']):
    num_var.append(i)
    
cat_var = []
for i in df.select_dtypes(include=['object']):
    cat_var.append(i)

df = df.rename(columns={'hoa (R$)':'hoa','rent amount (R$)':'rent_amount','property tax (R$)':'property_tax','fire insurance (R$)':'fire_insurance','total (R$)':'target','parking spaces':'parking_spaces'})

dummies_cols = ['city', 'furniture', 'animal']
df= pd.get_dummies(df, columns = dummies_cols, drop_first=True)
df.drop(['hoa', 'rent_amount', 'property_tax', 'fire_insurance'], axis=1, inplace=True)

In [12]:
# Split data to train and test sets

df=df.copy()

def preprocessing_(df):
    df = df.sample(frac=1.0, random_state=0).reset_index(drop=True)
    X = df.drop('target', axis=1)
    y = df.target
    
    X = pd.DataFrame(X, index=X.index, columns=X.columns)
    
    return X,y

X, y = preprocessing_(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [13]:
# Defining objective function

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 40)
    min_samples_split = trial.suggest_int('min_samples_split', 4, 10)
    
    random_forest_regressor = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
    random_forest_regressor.fit(X_train, y_train)
    
    return np.sqrt(mean_squared_error(y_test, random_forest_regressor.predict(X_test)))

In [14]:
# Hyperparameter optimization

study = optuna.create_study(study_name='RandomForestRegressor')

#optimization_time_in_seconds = -time.time()
study.optimize(objective, n_trials=1000)

[I 2024-08-10 17:49:06,123] A new study created in memory with name: RandomForestRegressor
[I 2024-08-10 17:49:16,263] Trial 0 finished with value: 7907.0375989392305 and parameters: {'n_estimators': 966, 'max_depth': 35, 'min_samples_split': 8}. Best is trial 0 with value: 7907.0375989392305.
[I 2024-08-10 17:49:22,241] Trial 1 finished with value: 6372.897458158431 and parameters: {'n_estimators': 607, 'max_depth': 18, 'min_samples_split': 9}. Best is trial 1 with value: 6372.897458158431.
[I 2024-08-10 17:49:24,373] Trial 2 finished with value: 9202.195285451904 and parameters: {'n_estimators': 197, 'max_depth': 25, 'min_samples_split': 6}. Best is trial 1 with value: 6372.897458158431.
[I 2024-08-10 17:49:35,523] Trial 3 finished with value: 9132.993764074155 and parameters: {'n_estimators': 902, 'max_depth': 23, 'min_samples_split': 4}. Best is trial 1 with value: 6372.897458158431.
[I 2024-08-10 17:49:39,450] Trial 4 finished with value: 6729.116275258216 and parameters: {'n_esti

In [15]:
optimization_time_in_seconds += time.time()

In [16]:
# Results

print("Best params:", study.best_params)
print("Best value:", study.best_value)
minutes = int(optimization_time_in_seconds) // 60
seconds = int(optimization_time_in_seconds)
seconds = seconds % 60 + (optimization_time_in_seconds - int(optimization_time_in_seconds))    
print("Optimization time: {} m {} s".format(minutes, seconds))

Best params: {'n_estimators': 180, 'max_depth': 5, 'min_samples_split': 10}
Best value: 4351.007539862343
Optimization time: 25 m 41.835272550582886 s
