In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv("output/train_clean.csv", index_col=0)
predict = pd.read_csv("output/predict_clean.csv", index_col=0)

##### Train Test Split

In [3]:
X = train.drop(columns='price')
y = train.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(32364, 9)
(32364,)
(8091, 9)
(8091,)


In [4]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor()
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")

-------ExtraTreesRegressor-------
RMSE 578.66
ExtraTrees Accuracy 0.9801052965821431


##### GridSearchCV

In [5]:
params = {"n_estimators":[150,250,300],
        "max_depth":[10,25,50],
        "min_samples_split":[5,10,20]
       }

efr = ExtraTreesRegressor()
grid = GridSearchCV(efr,scoring="neg_root_mean_squared_error",verbose=1,n_jobs=-1,param_grid=params,cv=5)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 11.9min finished


GridSearchCV(cv=5, estimator=ExtraTreesRegressor(), n_jobs=-1,
             param_grid={'max_depth': [10, 25, 50],
                         'min_samples_split': [5, 10, 20],
                         'n_estimators': [150, 250, 300]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [6]:
grid.best_params_

{'max_depth': 50, 'min_samples_split': 10, 'n_estimators': 300}

In [12]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=300, min_samples_split=10, max_depth= 25) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")

#subm_1

-------ExtraTreesRegressor-------
RMSE 539.32
ExtraTrees Accuracy 0.9815047637902312


In [13]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=350, min_samples_split=10, max_depth= 25) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")

#subm_3

-------ExtraTreesRegressor-------
RMSE 538.76
ExtraTrees Accuracy 0.9815266495008279


In [16]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=1000, min_samples_split=10, max_depth= 50) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")

#subm_4

-------ExtraTreesRegressor-------
RMSE 539.14
ExtraTrees Accuracy 0.9815658668434859


In [25]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=1800, min_samples_split=10, max_depth= 50) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")

#subm_5

-------ExtraTreesRegressor-------
RMSE 538.47
ExtraTrees Accuracy 0.9815571670615825
