In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

standardScaler = joblib.load('../pipeline/StandardScalerPipeline.pkl')

In [41]:
X = pd.read_csv('../dataset/X.csv')
y = pd.read_csv('../dataset/y.csv')

In [42]:
X.head()

Unnamed: 0,Size,Bedrooms,Bathrooms,Lat,Long,Amenities_Score,BedtoBath_Ratio,Furnished,Apartment,Duplex,Penthouse,Room,Studio,TotalRooms,Level_category,accessibility_score
0,165,3,3,30.093319,31.637916,8,1.0,1,1,0,0,0,0,6,0,0
1,225,3,3,30.023628,31.304425,5,1.0,1,1,0,0,0,0,6,0,0
2,280,4,3,30.052118,31.342205,7,0.75,0,1,0,0,0,0,7,0,0
3,120,3,2,30.052118,31.342205,6,0.666667,1,1,0,0,0,0,5,0,0
4,197,3,3,30.051086,31.537079,8,1.0,0,1,0,0,0,0,6,0,0


In [43]:
y = np.array(y).ravel()
y[:5]

array([35000, 77000, 30000,  1500, 22000], dtype=int64)

In [None]:
stdX = standardScaler.transform(X)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(stdX, y, test_size=0.1, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9548, 16)
(9548,)
(1061, 16)
(1061,)


In [57]:
def print_metrics(y_true, preds):
    mae = mean_absolute_error(y_true, preds)
    mse = mean_squared_error(y_true, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, preds)
    print(f'MAE: {mae:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(f'R2:{r2:.4f}\n')

def cv_evaluate(model):
    # Check CV Scores
    scores = cross_val_score(model, X_train, y_train, cv=5)
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    print('Cross Validation Scores:')
    print(f'CV_scores = {scores}')
    print('CV mean:', "{:.4f}".format(np.mean(scores)))
    print('CV Std:', "{:.4f}".format(np.std(scores)), '\n')

    # print Accuracy Scores
    print("Train Scores:")
    print_metrics(y_train, train_preds)
    print("Test Scores:")
    print_metrics(y_test, test_preds)

In [79]:
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=15,
    max_features='sqrt',
    bootstrap=True,
)
rf_model.fit(X_train, y_train)
cv_evaluate(rf_model)

Cross Validation Scores:
CV_scores = [0.6097911  0.60774522 0.6433173  0.60973638 0.60498382]
CV mean: 0.6151
CV Std: 0.0142 

Train Scores:
MAE: 5258.64
RMSE: 8034.89
R2:0.6940

Test Scores:
MAE: 5840.10
RMSE: 8736.15
R2:0.6149



In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_leaf': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'max_features': ['sqrt', 'log2']
}

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
print(f'Best parameters found: {grid_search.best_params_}')

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 200}


In [96]:
best_rf_model = grid_search.best_estimator_
cv_evaluate(best_rf_model)

Cross Validation Scores:
CV_scores = [0.6069467  0.60466436 0.64711822 0.61329866 0.60594205]
CV mean: 0.6156
CV Std: 0.0160 

Train Scores:
MAE: 5167.73
RMSE: 7911.88
R2:0.7033

Test Scores:
MAE: 5825.16
RMSE: 8695.94
R2:0.6184



## Train on all dataset

In [102]:
best_rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=5,
    max_features='sqrt',
    bootstrap=True,
)
best_rf_model.fit(stdX, y)
print_metrics(y, best_rf_model.predict(stdX))

MAE: 5139.55
RMSE: 7870.47
R2:0.7046



## Save the model to file

In [103]:
joblib.dump(best_rf_model, '../pipeline/rf_model.pkl')

['../pipeline/rf_model.pkl']