In [59]:
from math import sqrt
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [60]:

# Load the housing dataset from the CSV hosted in the project's GitHub
# repository (requires network access).
url = "https://raw.githubusercontent.com/Mateo486/Housing-Market-Capstone/main/miami-housing%5B1%5D.csv"
print("Loading data...")
data = pd.read_csv(url)

# Display the first rows as the cell output so the loaded columns can be
# checked at a glance and the preview is regenerated on every fresh run.
data.head()


Unnamed: 0,LATITUDE,LONGITUDE,PARCELNO,SALE_PRC,LND_SQFOOT,TOT_LVG_AREA,SPEC_FEAT_VAL,RAIL_DIST,OCEAN_DIST,WATER_DIST,CNTR_DIST,SUBCNTR_DI,HWY_DIST,age,avno60plus,month_sold,structure_quality
0,25.891031,-80.160561,622280070620,440000.0,9375,1753,0,2815.9,12811.4,347.6,42815.3,37742.2,15954.9,67,0,8,4
1,25.891324,-80.153968,622280100460,349000.0,9375,1715,0,4359.1,10648.4,337.8,43504.9,37340.5,18125.0,63,0,9,4
2,25.891334,-80.15374,622280100470,800000.0,9375,2276,49206,4412.9,10574.1,297.1,43530.4,37328.7,18200.5,61,0,2,4
3,25.891765,-80.152657,622280100530,988000.0,12450,2058,10033,4585.0,10156.5,0.0,43797.5,37423.2,18514.4,63,0,9,4
4,25.891825,-80.154639,622280100200,755000.0,12800,1684,16681,4063.4,10836.8,326.6,43599.7,37550.8,17903.4,42,0,7,4


In [61]:
# Columns used as model inputs; SALE_PRC is the value being predicted.
feature_columns = [
    'LND_SQFOOT',
    'TOT_LVG_AREA',
    'SPEC_FEAT_VAL',
    'RAIL_DIST',
    'OCEAN_DIST',
    'WATER_DIST',
    'CNTR_DIST',
    'SUBCNTR_DI',
    'HWY_DIST',
    'age',
    'structure_quality',
]
X = data[feature_columns]
y = data['SALE_PRC']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [62]:
# Assemble the modelling pipeline: missing values are replaced by the
# column mean, then a random forest regressor is trained on the result.
print("Setting up the model pipeline...")
model_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', RandomForestRegressor(random_state=42)),  # seeded for repeatable fits
]
pipeline = Pipeline(steps=model_steps)

# Train the pipeline with its default hyperparameters on the training split.
# Left as the cell's last expression so the fitted pipeline is displayed.
print("Fitting the model...")
pipeline.fit(X_train, y_train)

In [63]:
# Hyperparameter candidates for the grid search. Every key is prefixed with
# 'regressor__' so it targets the pipeline step named 'regressor'.
param_grid = {
    f'regressor__{option}': candidates
    for option, candidates in [
        ('n_estimators', [100, 120]),    # number of trees in the forest
        ('max_depth', [None, 7]),        # maximum tree depth (None = unlimited)
        ('min_samples_split', [2, 5]),   # minimum samples required to split a node
        ('min_samples_leaf', [1, 2]),    # minimum samples required at a leaf
    ]
}


In [None]:
# Tune the pipeline with an exhaustive grid search: every combination in
# param_grid is scored with 5-fold cross-validation on the training split,
# and candidates are ranked by negated mean squared error (higher is better).
print("Starting grid search...")
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores;
                # the regressor's seed is fixed, so the scores are unaffected
)
grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# pipeline retrained on the full training split.
best_model = grid_search.best_estimator_
print("Grid search complete.")

# Save into the current working directory rather than a user-specific
# absolute path, so the cell runs on any machine and operating system.
model_path = Path.cwd() / 'best_model3.joblib'
print(f"Saving the model to {model_path}...")
dump(best_model, model_path)
print("Model saved.")