## Model Development

### Pre-Development

#### Install libraries


In [1]:
pip install scikit-learn numpy pandas google.colab tensorflow keras-tuner bayesian-optimization

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting jedi>=0.16 (from ipython==7.34.0->google.colab)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: kt-legacy, jedi, colorama, keras-tuner, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6 jedi-0.19.1 keras-tuner-1.4.7 kt-legacy-1.0.5


#### Import Packages

In [2]:
from google.colab import drive
import pandas
import numpy
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from bayes_opt import UtilityFunction
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

### Data Splitting

In [3]:
# Mount Google Drive and load the property sales table from it.
drive.mount('/content/drive')
data = pandas.read_csv('/content/drive/My Drive/modified_NYC_property_sales.csv')

# The model is trained on the natural log of the sale price.
# NOTE(review): numpy.log gives -inf/NaN for prices <= 0 — presumably the
# "modified" CSV no longer contains such rows; confirm against the data prep.
data['log_sale_price'] = numpy.log(data['sale_price'])

# Features are every column except the raw and log-transformed target.
X = data.drop(['sale_price', 'log_sale_price'], axis=1)
Y = data['log_sale_price']

# Replace each object-dtype column with integer codes. The same encoder
# instance is refit for every column, so after the loop it only holds the
# classes of the last column processed.
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    X[column] = label_encoder.fit_transform(X[column])

# 60% train, then the remaining 40% is halved into test and validation
# (20% each of the full data set). Fixed seeds keep the split reproducible.
X_train, X_sec, Y_train, Y_sec = train_test_split(X, Y, test_size=0.4, random_state=0)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_sec, Y_sec, test_size=0.5, random_state=0)

Mounted at /content/drive


### Hyperparameter Tuning

In [4]:
def rf_cv(n_estimators, min_samples_split, max_features, max_depth, data, targets):
    """Score one random-forest configuration with 3-fold cross-validation.

    The optimizer proposes every hyperparameter as a float, so the
    integer-valued ones are truncated with ``int()`` and ``max_features``
    is capped at 0.999 before the estimator is built.

    Args:
        n_estimators: Number of trees (truncated to int).
        min_samples_split: Minimum samples needed to split a node
            (truncated to int).
        max_features: Fraction of features considered per split
            (capped at 0.999).
        max_depth: Maximum tree depth (truncated to int).
        data: Feature matrix passed to ``cross_val_score``.
        targets: Target values passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold ``neg_mean_squared_error`` scores; values
        closer to zero indicate a better fit.
    """
    forest_settings = {
        'n_estimators': int(n_estimators),
        'min_samples_split': int(min_samples_split),
        'max_features': min(max_features, 0.999),
        'max_depth': int(max_depth),
    }
    forest = RandomForestRegressor(random_state=42, **forest_settings)
    fold_scores = cross_val_score(
        forest,
        data,
        targets,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    return fold_scores.mean()


# Search space for the optimizer. All bounds are continuous; rf_cv
# truncates the integer-valued hyperparameters with int().
param_bounds = {
    'n_estimators': (100, 300),
    'min_samples_split': (2, 20),
    'max_features': (0.1, 0.999),
    'max_depth': (10, 50)
}

# Number of suggest -> evaluate -> register rounds to run.
N_TUNING_ITERATIONS = 30


def rf_objective(n_estimators, min_samples_split, max_features, max_depth):
    """Cross-validated score of one configuration on the training split.

    Binds the training data to ``rf_cv`` so the objective's parameters
    match the keys of ``param_bounds`` exactly. This keeps the function
    registered with the optimizer callable by the optimizer itself, not
    only by the manual loop below.
    """
    return rf_cv(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        max_depth=max_depth,
        data=X_train,
        targets=Y_train,
    )


optimizer = BayesianOptimization(
    f=rf_objective,
    pbounds=param_bounds,
    random_state=1,
    verbose=2
)

# Acquisition function used to pick the next point to evaluate.
utility = UtilityFunction(kind="ei", kappa=2.576, xi=0.0)

# Manual optimization loop: ask for a point, score it, report the score back.
for _ in range(N_TUNING_ITERATIONS):
    next_point = optimizer.suggest(utility)
    target = rf_objective(**next_point)
    optimizer.register(params=next_point, target=target)


### Model Set Up

In [5]:
# Build the final forest from the best-scoring point the optimizer has
# registered. The optimizer stores every value as a float, so the
# integer-valued hyperparameters are truncated with int() here.
best_params = optimizer.max['params']
final_settings = {
    'n_estimators': int(best_params['n_estimators']),
    'min_samples_split': int(best_params['min_samples_split']),
    'max_features': best_params['max_features'],
    'max_depth': int(best_params['max_depth']),
}
model = RandomForestRegressor(random_state=42, **final_settings)

### Training

In [6]:

# Fit on the training split, then predict the held-out test split.
# The targets are log sale prices, so the reported error is in log units.
predictions = model.fit(X_train, Y_train).predict(X_test)
mae = mean_absolute_error(y_true=Y_test, y_pred=predictions)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.29354711134676675
