In [1]:
# Import modules

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import time
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and prepare data

df = pd.read_csv("houses_to_rent_v2.csv")

df['floor'] = df['floor'].replace('-',0)
df['floor'] = df['floor'].astype('int64')

num_var = []
for i in df.select_dtypes(include=['int64']):
    num_var.append(i)
    
cat_var = []
for i in df.select_dtypes(include=['object']):
    cat_var.append(i)

df = df.rename(columns={'hoa (R$)':'hoa','rent amount (R$)':'rent_amount','property tax (R$)':'property_tax','fire insurance (R$)':'fire_insurance','total (R$)':'target','parking spaces':'parking_spaces'})

dummies_cols = ['city', 'furniture', 'animal']
df= pd.get_dummies(df, columns = dummies_cols, drop_first=True)
df.drop(['hoa', 'rent_amount', 'property_tax', 'fire_insurance'], axis=1, inplace=True)

In [3]:
# Split data to train and test sets

df=df.copy()

def preprocessing_(df):
    df = df.sample(frac=1.0, random_state=0).reset_index(drop=True)
    X = df.drop('target', axis=1)
    y = df.target
    
    X = pd.DataFrame(X, index=X.index, columns=X.columns)
    
    return X,y

X, y = preprocessing_(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [4]:
# Defining objective function

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 40)
    min_samples_split = trial.suggest_int('min_samples_split', 4, 10)
    
    random_forest_regressor = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
    random_forest_regressor.fit(X_train, y_train)
    
    return np.sqrt(mean_squared_error(y_test, random_forest_regressor.predict(X_test)))

In [5]:
# Hyperparameter optimization

study = optuna.create_study(study_name='RandomForestRegressor')

optimization_time_in_seconds = -time.time()
study.optimize(objective, n_trials=10)
optimization_time_in_seconds += time.time()

[I 2024-08-10 09:51:48,459] A new study created in memory with name: RandomForestRegressor
[I 2024-08-10 09:52:15,389] Trial 0 finished with value: 8508.558933915529 and parameters: {'n_estimators': 688, 'max_depth': 15, 'min_samples_split': 7}. Best is trial 0 with value: 8508.558933915529.
[I 2024-08-10 09:52:28,148] Trial 1 finished with value: 5858.479717646554 and parameters: {'n_estimators': 395, 'max_depth': 23, 'min_samples_split': 9}. Best is trial 1 with value: 5858.479717646554.
[I 2024-08-10 09:52:46,834] Trial 2 finished with value: 5644.259484774211 and parameters: {'n_estimators': 572, 'max_depth': 17, 'min_samples_split': 10}. Best is trial 2 with value: 5644.259484774211.
[I 2024-08-10 09:53:20,217] Trial 3 finished with value: 8669.151372678885 and parameters: {'n_estimators': 825, 'max_depth': 40, 'min_samples_split': 7}. Best is trial 2 with value: 5644.259484774211.
[I 2024-08-10 09:53:51,353] Trial 4 finished with value: 5412.379838270166 and parameters: {'n_estim

In [6]:
# Results

print("Best params", study.best_params)
print("Best value", study.best_value)
print("Optimization time in seconds", optimization_time_in_seconds)

Best params {'n_estimators': 942, 'max_depth': 21, 'min_samples_split': 10}
Best value 5412.379838270166
Optimization time in seconds 182.71023416519165
