<center><h1 style="font-size:35px; font-family: 'Times New Roman'; letter-spacing: 0.1em;">House Price Prediction 🏡</h1></center>

<center><img src="https://images.unsplash.com/photo-1516156008625-3a9d6067fab5?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=870&q=80"></center>

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Importing the Essential Libraries, Metrics</h1>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Loading the Data</h1>

In [6]:
df = pd.read_csv("data.csv")

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Exploratory Data Analysis</h1>

***Taking a look at the first 5 rows of the dataset***

In [7]:
df

Unnamed: 0,Jumlah Kamar Tidur,Jumlah Kamar Mandi,Luas Bangunan,Luas Tanah,Harga
0,7,4,300.0,270.0,Rp 1.500.000.000
1,2,1,36.0,98.0,Rp 355.500.000
2,3,2,200.0,300.0,Rp 800.000.000
3,2,1,40.0,175.0,Rp 800.000.000
4,2,1,62.0,97.5,Rp 475.000.000
...,...,...,...,...,...
595,2,1,27.0,72.0,Rp 168.000.000
596,4,3,470.0,529.0,Rp 3.000.000.000
597,5,5,240.0,280.0,Rp 2.300.000.000
598,2,1,45.0,100.0,Rp 450.000.000


***Checking the shape—i.e. size—of the data***

In [None]:
df.shape

***Learning the dtypes of columns' and how many non-null values are there in those columns***

In [None]:
df.info()

***Getting the statistical summary of dataset***

In [None]:
df.describe().T

***Checking for the missing values***

In [None]:
print("Missing Values by Column")
print("-"*30)
print(df.isna().sum())
print("-"*30)
print("TOTAL MISSING VALUES:", df.isna().sum().sum())

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">X, y Split</h1>

***Splitting the data into X and y chunks***

In [None]:
X = df.drop(["NO", "NAMA RUMAH", "HARGA"], axis=1)
y = df["HARGA"]

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Standardizing the Data</h1>

***Standardizing the numerical columns in X dataset. StandardScaler() adjusts the mean of the features as 0 and standard deviation of features as 1. Formula that StandardScaler() uses is as follows:***

<center><img width="300px" src="https://www.thoughtco.com/thmb/gItmqGd5HlnhyPIiLm1YHXOlTnw=/330x242/filters:fill(auto,1)/zscore-56a8fa785f9b58b7d0f6e87b.GIF"></center>

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
X

In [None]:
X_df = pd.DataFrame(X)

X_df.columns = ['LB', 'LT', 'KT', 'KM', 'GRS']
X_df

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Train-Test Split</h1>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

***Defining several evaluation functions for convenience***

In [None]:
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)).mean()
    return rmse
    

def evaluation(y, predictions):
    rmse = round(np.sqrt(mean_squared_error(y, predictions)), 2)
    r_squared = round(r2_score(y, predictions), 4)
    return rmse, r_squared

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Machine Learning Models</h1>

In [None]:
models = pd.DataFrame(columns=["Model","RMSE","R2 Score","RMSE (Cross-Validation)"])

<h2 style="font-family: 'Times New Roman'; letter-spacing: 0.05em;">Linear Regression</h2>

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_test)

rmse, r_squared = evaluation(y_test, predictions)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(lin_reg)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "MultipleLinearRegression","RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

<h2 style="font-family: 'Times New Roman'; letter-spacing: 0.05em;">Ridge Regression</h2>

In [None]:
ridge = Ridge()
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_test)

rmse, r_squared = evaluation(y_test, predictions)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(ridge)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "Ridge", "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

<h2 style="font-family: 'Times New Roman'; letter-spacing: 0.05em;">Lasso Regression</h2>

In [None]:
lasso = Lasso()
lasso.fit(X_train, y_train)
predictions = lasso.predict(X_test)

rmse, r_squared = evaluation(y_test, predictions)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(lasso)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "Lasso", "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

<h2 style="font-family: 'Times New Roman'; letter-spacing: 0.05em;">Random Forest Regressor</h2>

In [None]:
random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)

rmse, r_squared = evaluation(y_test, predictions)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(random_forest)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "RandomForestRegressor", "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

<h2 style="font-family: 'Times New Roman'; letter-spacing: 0.05em;">Polynomial Regression (Degree=2)</h2>

In [None]:
poly_reg = PolynomialFeatures(degree=2)
X_train_2d = poly_reg.fit_transform(X_train)
X_test_2d = poly_reg.transform(X_test)

lin_reg = LinearRegression()
lin_reg.fit(X_train_2d, y_train)
predictions = lin_reg.predict(X_test_2d)

rmse, r_squared = evaluation(y_test, predictions)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(lin_reg)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "Polynomial Regression (degree=2)", "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

<h1 style="font-family: 'Times New Roman'; letter-spacing: 0.08em;">Model Comparison</h1>

***The less the Root Mean Squared Error (RMSE), The better the model is.***

In [None]:
models.sort_values(by="RMSE (Cross-Validation)")

In [None]:
plt.figure(figsize=(12,8))
sns.lineplot(data= models, x="Model", y="RMSE (Cross-Validation)")
plt.title("Models' RMSE Scores (Cross-Validated)", size=20)
plt.xticks(rotation=30, size=15)
plt.yticks(rotation=30, size=15)
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.lineplot(x=models["Model"], y=models["RMSE"])
plt.title("Models' RMSE Scores", size=15)
plt.xticks(rotation=30, size=15)
plt.yticks(rotation=30, size=15)
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.lineplot(x=models["Model"], y=models["R2 Score"])
plt.title("Models' R2 Scores", size=15)
plt.xticks(rotation=30, size=15)
plt.yticks(rotation=30, size=15)
plt.show()

In [None]:
models