Estamos tendo problemas com datasets (train.csv e test.csv) que estão com features diferentes por conta do processamento. Vou tentar aplicar um pipeline igual nos dois para que as features não fiquem diferentes

### Imports

In [32]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

## Dataset intocado

In [33]:
df = pd.read_csv('../train.csv')

In [34]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

Decidindo os melhores parâmetros:

In [35]:
from sklearn.model_selection import GridSearchCV

SEED = 42
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [0.2, 0.5]
}

rfr = RandomForestRegressor(random_state=SEED)

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Melhores parâmetros: {best_params}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tota

ValueError: 
All the 810 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1273, in check_X_y
    X = check_array(
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'RL'

--------------------------------------------------------------------------------
648 fits failed with the following error:
Traceback (most recent call last):
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1273, in check_X_y
    X = check_array(
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/home/educg550/.local/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'RM'


In [None]:
best_rfr = RandomForestRegressor(**best_params, random_state=SEED)
best_rfr.fit(X_train, y_train)

In [None]:
test_data = pd.read_csv('../test.csv')
X_test = test_data

In [None]:
# Make predictions using the best model
y_test_pred = best_rfr.predict(X_test)

# Output the predictions if the target variable is not available
print("Predictions:", y_test_pred)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Exterior1st_None
- Exterior2nd_None
- Functional_None
- Id
- KitchenQual_None
- ...
Feature names seen at fit time, yet now missing:
- Condition2_RRAe
- Condition2_RRAn
- Condition2_RRNn
- Exterior1st_ImStucc
- Exterior1st_Stone
- ...


In [None]:
# Salvar submission com ID da linha como coluna 'Id' e SalePrice como coluna 'SalePrice'

y_test_pred_df = pd.DataFrame(y_test_pred, columns=['SalePrice'])
y_test_pred_df['Id'] = y_test_pred_df.index + 1
y_test_pred_df = y_test_pred_df[['Id', 'SalePrice']]
y_test_pred_df.to_csv('submission_intocado_rfr.csv', index=False)

## Dataset sem normalizar

In [2]:
df = pd.read_csv('../train_cleaned_manual.csv')

In [24]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

Decidindo os melhores parâmetros:

In [25]:
from sklearn.model_selection import GridSearchCV

SEED = 42
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [0.2, 0.5]
}

rfr = RandomForestRegressor(random_state=SEED)

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Melhores parâmetros: {best_params}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tota

In [29]:
best_rfr = RandomForestRegressor(**best_params, random_state=SEED)
best_rfr.fit(X_train, y_train)

In [30]:
test_data = pd.read_csv('../test_cleaned_manual.csv')
X_test = test_data

In [31]:
# Make predictions using the best model
y_test_pred = best_rfr.predict(X_test)

# Output the predictions if the target variable is not available
print("Predictions:", y_test_pred)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Exterior1st_None
- Exterior2nd_None
- Functional_None
- Id
- KitchenQual_None
- ...
Feature names seen at fit time, yet now missing:
- Condition2_RRAe
- Condition2_RRAn
- Condition2_RRNn
- Exterior1st_ImStucc
- Exterior1st_Stone
- ...


In [None]:
# Salvar submission com ID da linha como coluna 'Id' e SalePrice como coluna 'SalePrice'

y_test_pred_df = pd.DataFrame(y_test_pred, columns=['SalePrice'])
y_test_pred_df['Id'] = y_test_pred_df.index + 1
y_test_pred_df = y_test_pred_df[['Id', 'SalePrice']]
y_test_pred_df.to_csv('submission.csv', index=False)

## Dataset normalizado

In [None]:
df = pd.read_csv('../train_cleaned_manual_normalized.csv')

In [None]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

Decidindo os melhores parâmetros:

In [None]:
from sklearn.model_selection import GridSearchCV

SEED = 42
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [0.2, 0.5]
}

rfr = RandomForestRegressor(random_state=SEED)

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Melhores parâmetros: {best_params}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END max_depth=10, max_features=0.2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tota

In [None]:
best_rfr = RandomForestRegressor(**best_params, random_state=SEED)
best_rfr.fit(X_train, y_train)

In [None]:
test_data = pd.read_csv('../test_cleaned_manual.csv')
X_test = test_data

In [None]:
# Make predictions using the best model
y_test_pred = best_rfr.predict(X_test)

# Output the predictions if the target variable is not available
print("Predictions:", y_test_pred)

Predictions: [292047.40233706 232056.06438547 258404.70625698 226986.95625698
 138151.21666667 106860.37666667 203582.30581006  87457.60333333
 270586.11128492 106133.16666667 108822.43333333 227864.94666667
  88777.47       164133.44        85267.87       135827.77292365
 107298.52666667 128579.25       108576.77333333 247899.94812849
 215713.55397579 194598.57333333 135575.43       261802.36625698
 223419.69918063 117650.13       228477.20438547 247470.49356611
 135935.6        229022.08333333 300240.73631285 148942.79333333
 302057.65192737 149885.57       138025.91666667 114556.33333333
 135951.51812849 135938.56333333 179177.08333333 238081.66771881
 203561.98105214 142192.08       252480.45625698 211743.91812849
 189261.08914339 106566.22730912 219546.3358473  307174.0375419
 136710.         185026.96666667 157901.19479516 216387.3467784
 126521.49666667 194058.1958473  190522.65513966 192943.27292365
 359726.64608007 203620.28       213281.15       129892.12666667
 130197.496666

In [None]:
# Salvar submission com ID da linha como coluna 'Id' e SalePrice como coluna 'SalePrice'

y_test_pred_df = pd.DataFrame(y_test_pred, columns=['SalePrice'])
y_test_pred_df['Id'] = y_test_pred_df.index + 1
y_test_pred_df = y_test_pred_df[['Id', 'SalePrice']]
y_test_pred_df.to_csv('submission_normalized_rfr.csv', index=False)