In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import hvplot.pandas
from sklearn.linear_model import LinearRegression


In [3]:
# Load the sales CSV from the Resources folder into a DataFrame
file_path = Path("../Resources/kingcountysales_2020_2023.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the load
df.head(5)

Unnamed: 0,sale_id,sale_date,sale_price,sale_nbr,latitude,longitude,city,zoning,subdivision,land_val,...,grade,fbsmt_grade,condition,stories,beds,bath_full,bath_3qtr,bath_half,garb_sqft,gara_sqft
0,2020..204,2020-01-05,411900,1.0,47.468289,-122.348669,BURIEN,R-6,DASHLEYS ADD NO. 02,50000,...,5,0,3,1.0,2,1,0,0,0,0
1,2020..532,2020-01-03,625000,,47.704662,-122.319315,SEATTLE,SF 5000,MAPLE LEAF TO GREEN LAKE CIRCLE POR OF,80000,...,6,0,4,1.0,2,1,0,0,0,0
2,2020..643,2020-01-06,460000,2.0,47.403445,-122.296944,DES MOINES,RM1.8,EAST DES MOINES FIVE-ACRE TRACTS,25000,...,6,0,3,1.0,3,1,0,0,0,0
3,2020..1508,2020-01-15,930000,3.0,47.413624,-122.287447,SEA-TAC,RS15000,,75000,...,6,0,3,1.0,3,1,0,0,0,0
4,2020..1681,2020-01-22,815000,,47.708265,-122.308711,SEATTLE,SF 7200,VICTORY HEIGHTS ADD,61000,...,7,0,4,2.0,4,2,0,1,0,480


In [17]:
# Scatter plot of the number of bedrooms (x) against the observed sale price (y).
# The plot is kept in a variable so it can be overlaid with the best-fit line later.
# Title fixed: the points are actual sale prices (not expected ones) and "beds" is a
# bedroom count, not a size.
price_br_plot = df.hvplot.scatter(
    x="beds",
    y="sale_price",
    title="Sale Price by Number of Bedrooms"
)
price_br_plot

# Data Preparation

In [4]:
# Build the feature matrix: selecting "beds" with a list keeps it two-dimensional,
# giving the single-column (n_samples, 1) array that model.fit() is given below
X = df[["beds"]].to_numpy()

# Peek at the first five rows
X[:5]

array([[2],
       [2],
       [3],
       [3],
       [4]])

In [5]:
# The shape of X is (n_samples, 1): one row per record in df and a single
# feature column (beds)
X.shape

(72963, 1)

In [6]:
# Select the dependent variable y: the sale_price column, kept as a pandas Series
y = df["sale_price"]

# Build a Linear Regression Model

In [7]:
# Create a scikit-learn linear regression model with default settings
model = LinearRegression()

In [8]:
# Fit the model: learn the slope and intercept that relate beds (X) to sale_price (y)
model.fit(X, y)

In [9]:
# Display the slope; coef_ is an array holding one coefficient per feature,
# so it contains a single value here
print(f"Model's slope: {model.coef_}")

Model's slope: [186767.93868706]


In [10]:
# Display the y-intercept (the model's predicted sale price when beds = 0)
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 431165.4523213053


In [11]:
# Display the model's best fit line formula in the form y = intercept + slope * X
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 431165.4523213053 + 186767.93868706154X


In [12]:
# Number of bedrooms to predict for. Defined once so the printed formula, the
# computed prediction and the final message cannot drift apart.
bedrooms = 7

# Display the formula with the bedroom count substituted for X
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {bedrooms}")

# Evaluate the fitted line by hand: y = intercept + slope * X
y_7 = model.intercept_ + model.coef_[0] * bedrooms

# Display the prediction, formatted to two decimal places
print(f"Predicted sale price for a home with {bedrooms} bedrooms: ${y_7:.2f}")

Model's formula: y = 431165.4523213053 + 186767.93868706154 * 7
Predicted sale price for a home with 7 bedrooms: $1738541.02


In [13]:
# Predict a sale price for every row of X (the same data the model was fitted on)
predicted_y_values = model.predict(X)

In [14]:
# Attach the predicted sale prices as a new "sale_predicted" column.
# assign() returns a new DataFrame, so the original df is left untouched.
df_predicted = df.assign(sale_predicted=predicted_y_values)

# Preview the first rows, including the new column
df_predicted.head()

Unnamed: 0,sale_id,sale_date,sale_price,sale_nbr,latitude,longitude,city,zoning,subdivision,land_val,...,fbsmt_grade,condition,stories,beds,bath_full,bath_3qtr,bath_half,garb_sqft,gara_sqft,sale_predicted
0,2020..204,2020-01-05,411900,1.0,47.468289,-122.348669,BURIEN,R-6,DASHLEYS ADD NO. 02,50000,...,0,3,1.0,2,1,0,0,0,0,804701.3
1,2020..532,2020-01-03,625000,,47.704662,-122.319315,SEATTLE,SF 5000,MAPLE LEAF TO GREEN LAKE CIRCLE POR OF,80000,...,0,4,1.0,2,1,0,0,0,0,804701.3
2,2020..643,2020-01-06,460000,2.0,47.403445,-122.296944,DES MOINES,RM1.8,EAST DES MOINES FIVE-ACRE TRACTS,25000,...,0,3,1.0,3,1,0,0,0,0,991469.3
3,2020..1508,2020-01-15,930000,3.0,47.413624,-122.287447,SEA-TAC,RS15000,,75000,...,0,3,1.0,3,1,0,0,0,0,991469.3
4,2020..1681,2020-01-22,815000,,47.708265,-122.308711,SEATTLE,SF 7200,VICTORY HEIGHTS ADD,61000,...,0,4,2.0,4,2,0,1,0,480,1178237.0


In [16]:
# Create a line plot of the number of bedrooms versus the predicted sale price values.
# Every row with the same bedroom count has the same prediction, so keep one point per
# distinct "beds" value and sort by it. A line plot connects points in row order, so
# feeding it every unsorted row would retrace the same line once per record.
best_fit_line = (
    df_predicted[["beds", "sale_predicted"]]
    .drop_duplicates(subset="beds")
    .sort_values("beds")
    .hvplot.line(
        x="beds",
        y="sale_predicted",
        color="red"
    )
)
best_fit_line

In [18]:
# Overlay the best fit line on the scatter plot of the original data
# (the * operator combines the two plots into a single figure)
price_br_plot * best_fit_line

# Linear Regression Model Assessment

In [19]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
# Evaluate the fit on the same data the model was trained on.
# model.score() is computed alongside r2_score so the two can be compared
# (the output shows them to be equal).
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
# Population standard deviation (ddof=0) of the observed sale prices
std = y.std(ddof=0)

# Report each metric on its own line
metrics = {
    "score": score,
    "r2": r2,
    "mean squared error": mse,
    "root mean squared error": rmse,
    "standard deviation": std,
}
for label, value in metrics.items():
    print(f"The {label} is {value}.")

The score is 0.04485178285824887.
The r2 is 0.04485178285824887.
The mean squared error is 647636855327.5693.
The root mean squared error is 804758.8802415102.
The standard deviation is 823437.0293276046.
