In [1]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [39]:
import pandas as pd
import numpy as np
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

In [10]:
train_processed_path = r'C:\Users\cardo\Projeto House Prices\data\processed\train_processed.csv'
test_processed_path = r'C:\Users\cardo\Projeto House Prices\data\processed\test_processed.csv'

In [11]:
df_train_processed = pd.read_csv(train_processed_path)
df_test_processed = pd.read_csv(test_processed_path)

In [12]:
X = df_train_processed.drop('SalePrice', axis=1)
y = df_train_processed['SalePrice']

In [17]:
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_normalized = scaler_X.fit_transform(X)
y_normalized = scaler_y.fit_transform(y.values.reshape(-1, 1))
y_normalized = y_normalized.ravel()

# dividing data in test and train

X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_normalized, test_size=0.2, random_state=42)

In [46]:
# Definindo os modelos de primeiro nível
lasso = joblib.load(r'C:\\Users\cardo\Projeto House Prices\models\lasso_model.pkl')
svm = joblib.load(r'C:\\Users\cardo\Projeto House Prices\models\svm_model.pkl')


# Definindo o modelo de segundo nível (meta-modelo)
# Nesse caso, usaremos um regressor linear, mas poderia ser qualquer modelo
meta_model = RandomForestRegressor()

# Definindo o ensemble de stacking
stack = StackingCVRegressor(regressors=(svm, rf),
                            meta_regressor=meta_model, 
                            cv=5,
                            use_features_in_secondary=True)

# Treinando o ensemble de stacking
stack.fit(X_train, y_train)

y_pred = stack.predict(X_test)

# model perfomance

mse = mean_squared_error(y_test, y_pred)
print('Mean Square Error:', mse)
rmse = np.sqrt(mse)
print('Root Mean Square Error:', rmse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolut Error:", mae)

Mean Square Error: 0.0008623477060157421
Root Mean Square Error: 0.029365757371737276
Mean Absolut Error: 0.0196903804883915


In [47]:
X_test_normalized = scaler_X.transform(df_test_processed.drop('Id',axis = 1))

In [48]:
predictions_normalized = lasso.predict(X_test_normalized)
predictions_denormalized = scaler_y.inverse_transform(predictions_normalized.reshape(-1, 1))

In [49]:
df_results = pd.DataFrame({'Id': df_test_processed['Id'], 'SalePrice': predictions_denormalized.flatten()})

In [50]:
df_results.to_csv(r'C:\\Users\cardo\Projeto House Prices\data\final\ensemble_model.csv', index = False)

In [51]:
df_results

Unnamed: 0,Id,SalePrice
0,1461,104864.564607
1,1462,160909.052812
2,1463,173493.179092
3,1464,191085.723794
4,1465,194595.894494
...,...,...
1454,2915,67023.044350
1455,2916,71573.066811
1456,2917,156997.791803
1457,2918,124799.952523
