In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Folder that holds the preprocessed train/test CSV files.
_processed_dir = r'C:\Users\cardo\Projeto House Prices\data\processed'

# Full paths are the folder plus a single backslash separator and the file name.
train_processed_path = _processed_dir + '\\' + 'train_processed.csv'
test_processed_path = _processed_dir + '\\' + 'test_processed.csv'

In [4]:
# Read the preprocessed training and test tables from disk, in that order.
df_train_processed, df_test_processed = (
    pd.read_csv(csv_path)
    for csv_path in (train_processed_path, test_processed_path)
)

In [5]:
# Target column first, then every remaining column as the feature matrix.
y = df_train_processed['SalePrice']
X = df_train_processed.drop(columns=['SalePrice'])

In [6]:
# Hold out 20% of the rows for evaluation BEFORE fitting any scaler, so the
# min/max statistics are learned from the training rows only. Fitting the
# scalers on the full dataset first would leak information about the
# held-out rows into the preprocessing step.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Fit on the training split only; the held-out split is merely transformed,
# so some of its scaled values may fall slightly outside [0, 1].
X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)

# MinMaxScaler expects 2-D input, so the target is reshaped to a single
# column for scaling and flattened back to 1-D afterwards.
y_train = scaler_y.fit_transform(y_train_raw.values.reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test_raw.values.reshape(-1, 1)).ravel()

# Scaled versions of the complete training table, kept under their original
# names for any cell that still refers to them.
X_normalized = scaler_X.transform(X)
y_normalized = scaler_y.transform(y.values.reshape(-1, 1)).ravel()

In [26]:
# Build and train a 200-tree random forest; `fit` returns the fitted
# estimator, so construction and training are chained.
model = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train)

# Predictions for the held-out split.
y_pred = model.predict(X_test)

# Error metrics on the held-out split, expressed in the same units as y_test.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Report the metrics in a fixed order: MSE, RMSE, MAE.
for _label, _value in (
    ('Mean Square Error:', mse),
    ('Root Mean Square Error:', rmse),
    ("Mean Absolut Error:", mae),
):
    print(_label, _value)

Mean Square Error: 0.0009003868804941122
Root Mean Square Error: 0.030006447315437265
Mean Absolut Error: 0.02045863248756496


In [27]:
# Scale the test-set features with the already-fitted feature scaler.
# 'Id' is dropped first; it is reused later as the key column of the results.
_test_features = df_test_processed.drop(columns='Id')
X_test_normalized = scaler_X.transform(_test_features)

In [28]:
# Predict in the scaled target space.
predictions_normalized = model.predict(X_test_normalized)

# The target scaler works on 2-D input, so view the predictions as a single
# column before undoing the scaling.
_predictions_column = predictions_normalized.reshape(-1, 1)
predictions_denormalized = scaler_y.inverse_transform(_predictions_column)

In [29]:
# Pair each test-set Id with its predicted SalePrice; the (n, 1) prediction
# array is flattened to 1-D so it can serve as a single column.
_result_columns = {
    'Id': df_test_processed['Id'],
    'SalePrice': predictions_denormalized.flatten(),
}
df_results = pd.DataFrame(_result_columns)

In [30]:
# Write the predictions to CSV without the DataFrame index.
# The path is a raw string, so a single backslash is already literal; writing
# `C:\\Users` inside a raw string would yield a doubled separator.
df_results.to_csv(r'C:\Users\cardo\Projeto House Prices\data\final\random_forest_model.csv', index=False)

In [31]:
# Show the results frame (Id, SalePrice) as this cell's output.
df_results

Unnamed: 0,Id,SalePrice
0,1461,126386.000
1,1462,152849.745
2,1463,178494.950
3,1464,182760.465
4,1465,197767.180
...,...,...
1454,2915,84410.250
1455,2916,91761.305
1456,2917,154398.270
1457,2918,107887.500


In [32]:
# Saving model

import joblib

# NOTE(review): only the estimator is persisted here. It was trained on
# inputs/targets scaled by scaler_X / scaler_y, so those scalers are presumably
# needed to reuse the saved model — consider saving them alongside it.
# The path is a raw string, so a single backslash is already literal; writing
# `C:\\Users` inside a raw string would yield a doubled separator.
joblib.dump(model, r'C:\Users\cardo\Projeto House Prices\models\random_forest_model.pkl')

['C:\\\\Users\\cardo\\Projeto House Prices\\models\\random_forest_model.pkl']