In [1]:
## Bibliotecas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
import missingno as msn
import seaborn as sns



housing = fetch_california_housing()
data_housing = pd.DataFrame(data=housing.data, columns=housing.feature_names)
data_housing_target = pd.DataFrame(data=housing.target, columns=housing.target_names)
data_housing.head()
data_housing_target.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


# Import Sklearn 

In [25]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor

# Split Data - Trainning and Tests

In [3]:
X_train, X_test, y_train, y_test = train_test_split(data_housing, data_housing_target, test_size=0.3,random_state=109) 

# Regressão Linear

In [10]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

lin_mse = mean_squared_error(y_test, y_pred)

print("MSE = ", lin_mse)
print("RMSE = ", np.sqrt(lin_mse))

MSE =  0.554866775996053
RMSE =  0.7448938018241614


# Random Forest

In [16]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train.values.ravel())
y_pred = rf_reg.predict(X_test)

lin_mse = mean_squared_error(y_test, y_pred)

print("MSE = ", lin_mse)
print("RMSE = ", np.sqrt(lin_mse))

MSE =  0.25188588101992576
RMSE =  0.5018823378242412


## Grid Search

In [19]:
param_grid = [
{'n_estimators': [3, 10, 30, 60], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train, y_train.values.ravel())




forest_reg_opt = RandomForestRegressor()
forest_reg_opt.set_params(**grid_search.best_params_)



RandomForestRegressor(max_features=4, n_estimators=60)

In [20]:
forest_reg_opt.fit(X_train, y_train.values.ravel())
y_pred = forest_reg_opt.predict(X_test)

lin_mse = mean_squared_error(y_test, y_pred)

print("MSE = ", lin_mse)
print("RMSE = ", np.sqrt(lin_mse))

MSE =  0.24689040325746214
RMSE =  0.4968806730568841


# SVM - Support Vector Machines

In [24]:
svm_reg = svm.SVR()
svm_reg.fit(X_train, y_train.values.ravel())
y_pred = svm_reg.predict(X_test)

lin_mse = mean_squared_error(y_test, y_pred)

print("MSE = ", lin_mse)
print("RMSE = ", np.sqrt(lin_mse))

MSE =  1.3799652101364526
RMSE =  1.1747192048044726


# Decision Tree

In [26]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train.values.ravel())
y_pred = tree_reg.predict(X_test)

lin_mse = mean_squared_error(y_test, y_pred)

print("MSE = ", lin_mse)
print("RMSE = ", np.sqrt(lin_mse))

MSE =  0.512067449461434
RMSE =  0.7155888829917874


## Grid Search

In [32]:
param_grid = [
{'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"], 'splitter': ['best', 'random'], 'max_depth': [0, 5, 10], 'min_samples_split': [2, 10], 'min_samples_leaf': [1, 2, 4]},]


dt_reg = DecisionTreeRegressor()
grid_search = GridSearchCV(dt_reg, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train, y_train.values.ravel())
dt_reg_opt = DecisionTreeRegressor()
dt_reg_opt.set_params(**grid_search.best_params_)

240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Matheus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Matheus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 1315, in fit
    super().fit(
  File "C:\Users\Matheus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 306, in fit
    raise ValueError("max_depth must be greater than zero. ")
ValueError: max_depth must be greater than zero

DecisionTreeRegressor(criterion='friedman_mse', max_depth=10,
                      min_samples_leaf=4, min_samples_split=10)

In [33]:
dt_reg_opt.fit(X_train, y_train.values.ravel())
y_pred = dt_reg_opt.predict(X_test)

lin_mse = mean_squared_error(y_test, y_pred)

print("MSE = ", lin_mse)
print("RMSE = ", np.sqrt(lin_mse))

MSE =  0.3808632458395553
RMSE =  0.617141187929922


# Notes

## Why using Ravel on Random Forest 

Porque usar values.ravel()

Explanation:

.values will give the values in a numpy array (shape: (n,1))

.ravel will convert that array shape to (n, ) (i.e. flatten it)