In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the two cleaned CSV files into DataFrames in one pass:
# `train` holds the rows that include the `price` column used for fitting below;
# `predict` is loaded alongside it (presumably the rows to score later — TODO confirm).
train, predict = map(
    pd.read_csv,
    ("output/train_clean.csv", "output/predict_clean.csv"),
)

##### Train Test Split

In [3]:
# Separate the regression target (`price`) from the remaining feature columns.
X = train.drop(columns='price')
y = train['price']

# Hold out 20% of the rows for evaluation. `random_state` is pinned so that the
# partition -- and therefore every metric computed from it further down -- is
# identical after Restart Kernel -> Run All. Without it each run shuffles the
# rows differently and the reported numbers cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: show the shapes of the four resulting pieces.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(32364, 10)
(32364,)
(8091, 10)
(8091,)


In [4]:
print("-------RandomForestRegressor-------")

# Baseline: a random forest with default hyper-parameters. The forest is
# stochastic, so the seed is fixed to make the numbers below reproducible
# from a fresh kernel.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)  # fit() returns the estimator itself; no rebinding needed

# Hold-out error, reported as RMSE (same units as `price`).
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test, y_pred)), 2))

# 10-fold cross-validation over the full data set (X, y).
# `scoring="r2"` spells out the metric that a regressor is scored with by
# default, so the value printed below is the mean R^2 across folds -- it is
# not a classification accuracy, even though the label says "Accuracy".
# NOTE(review): the label text is left unchanged so it still matches the
# recorded output; consider renaming it to "mean CV R^2".
scores = cross_val_score(model, X, y, cv=10, scoring="r2", n_jobs=-1)
print(f"RandomForest Accuracy {np.mean(scores)}")

-------RandomForestRegressor-------
RMSE 552.79
RandomForest Accuracy 0.9796128879278792


##### GridSearchCV

In [5]:
# Hyper-parameter grid handed to GridSearchCV in the next cell.
#
# max_features: the original grid contained 0, which is not a valid value for
#   RandomForestRegressor (an integer max_features must be >= 1), so every
#   candidate using it would fail to fit. It is replaced by 3, a small-subset
#   option; 10 equals the number of feature columns shown by X_train.shape
#   above, i.e. "consider every feature at each split".
# NOTE(review): with min_samples_leaf >= 10 a node needs at least 20 samples
#   before it can be split, so min_samples_split values of 10/15/20 never
#   bind -- this axis looks redundant and triples the search cost. Confirm
#   and either drop it or use larger values.
parameters = {
    'max_depth': [10, 20, 40, 60, 80, 100],
    'max_features': [3, 5, 10],
    'min_samples_leaf': [10, 20, 30],
    'min_samples_split': [10, 15, 20],
    'n_estimators': [50, 100, 200, 400, 800, 1000, 1600],
}

In [None]:
# Exhaustive search over the `parameters` grid, scored with 5-fold
# cross-validation on the training split only (the hold-out rows stay unseen).
# The grid has 6 * 3 * 3 * 3 * 7 = 1134 combinations, i.e. 5670 forest fits,
# so this cell is expensive to run.
#
# The base estimator is seeded: without a fixed `random_state` the forest's
# randomness makes the CV scores -- and therefore `best_params_` -- vary
# from one run to the next.
# (`rfc` is kept as the variable name for compatibility, although the
# estimator is a regressor.)
rfc = RandomForestRegressor(random_state=42)
grid = GridSearchCV(rfc, parameters, verbose=1, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination that obtained the best mean
# cross-validation score in the grid search above.
print(grid.best_params_)

In [9]:
# Refit a forest with hand-entered hyper-parameters (presumably the result of
# the grid search above -- TODO confirm, the search cells show no execution
# count). The seed is fixed so the metrics below are reproducible.
# NOTE(review): in the recorded run this configuration scores worse than the
# default forest (RMSE 629.77 vs 552.79); the constraints on depth and leaf
# size may be too restrictive for this data.
model = RandomForestRegressor(
    max_depth=10,
    max_features=10,
    min_samples_leaf=20,
    min_samples_split=15,
    n_estimators=50,
    random_state=42,
)
model.fit(X_train, y_train)  # fit() returns the estimator itself; no rebinding needed

# Hold-out error, reported as RMSE (same units as `price`).
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test, y_pred)), 2))

# 10-fold cross-validation over the full data set (X, y).
# `scoring="r2"` spells out the metric that a regressor is scored with by
# default: the printed value is the mean R^2 across folds, not an accuracy.
# The label text is left unchanged so it still matches the recorded output.
scores = cross_val_score(model, X, y, cv=10, scoring="r2", n_jobs=-1)
print(f"RandomForest Accuracy {np.mean(scores)}")

RMSE 629.77
RandomForest Accuracy 0.9735143215550446
