<a href="https://www.kaggle.com/code/marcelozagonellevek/kc-house-sales-predict?scriptVersionId=144092884" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the kc_house_data CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary of column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the 'date' column; it is not used as a model feature.
df = df.drop(columns='date')

In [None]:
# Pairwise correlation between all remaining columns.
df.corr()

In [None]:
# Drop the less important features.
df = df.drop(
    columns=['id', 'sqft_lot', 'condition', 'yr_built', 'zipcode', 'long', 'sqft_lot15'],
)
df.head()


In [None]:
# Replace the 0 values in the bedrooms and bathrooms columns with the mean of
# that column's non-zero values.
cols_to_replace = ['bedrooms', 'bathrooms']
for col in cols_to_replace:
    # The mean is taken over non-zero entries only, so the zeros being
    # replaced do not pull the fill value down.
    mean_value = df.loc[df[col] != 0, col].mean()
    df[col] = df[col].replace(0, mean_value)

# Remove the other columns that contain zero values.
# Only columns are dropped here: the previous `index=1` argument also deleted
# the row labelled 1 from the data, which looks like a mix-up with `axis=1`.
cols_to_drop = ['waterfront', 'view', 'sqft_basement', 'yr_renovated']
df = df.drop(columns=cols_to_drop)
df.head()

<h1>LINEAR REGRESSION</h1>

Predictor variables: every column remaining after the target (including sqft_living)

target variable: price

In [None]:
# Column 0 of the frame is the target; every later column is a feature.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

In [None]:
# Sanity check: the feature matrix and target vector must have the same row count.
print(f'{X.shape} {y.shape}')

Splitting into training and test

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

Create and fit the model

In [None]:
# Ordinary least-squares regression with default settings, fitted on the training split.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

* Predicting

In [None]:
# Predict the target for the held-out test rows and display the result.
y_pred = linear_model.predict(X_test)
y_pred

Metrics

In [None]:
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R2 score of the linear-regression predictions on the test split.
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse:.2f}')
print(f'R2_Score: {r2:.2f}')

Predicting with new data and based on sqft_living

In [None]:
# Living area of the hypothetical house to price.
sqft_living_to_predict = 2000
# One row of eight feature values, in the same column order as X
# (NOTE(review): confirm the order against df.columns[1:]).
input_data = np.array([[3, 2.0, sqft_living_to_predict, 1.0, 7, 2170, 47.7210, 1690]])
predicted_price = linear_model.predict(input_data)
print(f'The estimated price of the house based on {sqft_living_to_predict} sqft_living is $ {predicted_price[0]:.2f}')

<h1>HOUSE PRICE PREDICT - LASSO METHOD</h1>

In [None]:
from sklearn.linear_model import Lasso

Create the Lasso model

In [None]:
# Lasso (L1-regularised) regression. With selection='random' the coordinate
# updates are drawn at random, so a fixed seed is needed for repeatable results.
lasso_model = Lasso(alpha = 1.0, selection='random', random_state=42)

Training the model

In [None]:
# Fit the Lasso model on the training split.
lasso_model.fit(X_train, y_train)

Predicting

In [None]:
# Predict the target for the test rows (overwrites the previous model's y_pred).
y_pred = lasso_model.predict(X_test)

Metrics

In [None]:
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R2 score of the Lasso predictions on the test split.
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse:.2f}')
print(f'R2: {r2:.2f}')

<h1>HOUSE PRICE PREDICT - RANDOM FOREST</h1>

In [None]:
from sklearn.ensemble import RandomForestRegressor


Create and fit the model

In [None]:
# Random forest with default hyperparameters. The forest is built from random
# bootstrap samples, so the seed is fixed to make the reported scores repeatable.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

Predict

In [None]:
# Predict the target for the test rows (overwrites the previous model's y_pred).
y_pred = rf_model.predict(X_test)

Metrics

In [None]:
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R2 score of the random-forest predictions on the test split.
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse:.2f}')
print(f'R2: {r2:.2f}')

<h1>HOUSE PRICE PREDICT - XGBoost</h1>

In [None]:
from xgboost import XGBRegressor

Create and fit the model

In [None]:
# XGBoost regressor with default hyperparameters, fitted on the training split.
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

Predict

In [None]:
# Predict the target for the test rows (overwrites the previous model's y_pred).
y_pred = xgb_model.predict(X_test)

Metrics

In [None]:
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R2 score of the default XGBoost predictions on the test split.
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse:.2f}')
print(f'R2: {r2:.2f}')

# Create a GridSearchCV object to find the best hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyperparameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
)
# Exhaustive search over the grid with 5-fold cross-validation, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search

Train the model with different hyperparameters

In [None]:
# Run the cross-validated search over the hyperparameter grid on the training split.
grid_search.fit(X_train, y_train)

Get the best hyperparameters found

In [None]:
# Hyperparameter combination that scored best in the grid search.
best_params = grid_search.best_params_

Create a new model with the best hyperparameters

In [None]:
# Fresh XGBoost regressor configured with the hyperparameters selected by the grid search.
xgb_model_best = XGBRegressor(**best_params)

Train the final model with the best hyperparameters

In [None]:
# Fit the tuned model on the full training split.
xgb_model_best.fit(X_train, y_train)

Predictions with the optimized model

In [None]:
# Test-set predictions from the tuned model, kept separate from y_pred.
y_pred_best = xgb_model_best.predict(X_test)

Metrics

In [None]:
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which newer scikit-learn releases no longer accept.
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
# R2 score of the tuned XGBoost predictions on the test split.
r2_best = r2_score(y_test, y_pred_best)

print(f'Best Hiperparameters: {best_params}')
print(f'RMSE (optimized model): {rmse_best:.2f}')
print(f'R2 Score (optimized model): {r2_best:.2f}')

<h2>Comparing the R2_score results of home price forecasting with the algorithms used:</h2>
<ol>
    <li>Linear Regression r2_score: 0.60</li>
    <li>Random Forest r2_score: 0.78</li>
    <li>XGBoost r2_score: 0.76</li>
    <li>Optimized XGBoost r2_score: 0.78</li>
</ol>

### Using the best model to predict the house price for new data. Two models tie for the best R2 score (Random Forest and Optimized XGBoost); we will use the Optimized XGBoost.

In [None]:
# Living area of the hypothetical house to price.
sqft_living_to_predict = 2000
# One row of eight feature values, in the same column order as X
# (NOTE(review): confirm the order against df.columns[1:]).
input_data = np.array([[3, 2.0, sqft_living_to_predict, 1.0, 7, 2170, 47.7210, 1690]])
predicted_price = xgb_model_best.predict(input_data)
print(f'The estimated price of the house based on {sqft_living_to_predict} sqft_living is $ {predicted_price[0]:.2f}')