In [16]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import hvplot.pandas
from sklearn.linear_model import LinearRegression

In [17]:
# Load the sold-property listings into a DataFrame.
# NOTE(review): the file name has a space before ".csv" — confirm it matches the file on disk.
file = Path('Resources/st_paul_sold_properties .csv')
df = pd.read_csv(filepath_or_buffer=file)

In [18]:
# Peek at the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,list_date,list_price,sold_date,sold_price,beds,baths,sqft,lot_sqft,city,street,zip,latitude,longitude
0,2025-01-31,209900.0,2025-03-31,209900,2.0,1.5,904.0,4704.0,Saint Paul,868 Algonquin Ave,55119,44.967562,-93.017718
1,2024-11-15,374900.0,2024-12-13,391000,3.0,1.5,1534.0,4966.0,Saint Paul,1246 Bayard Ave,55116,44.922167,-93.152861
2,2024-12-20,275000.0,2025-01-10,266800,3.0,1.0,1297.0,6534.0,Saint Paul,967 California Ave W,55117,44.991249,-93.140914
3,2025-01-23,500000.0,2025-02-28,505003,3.0,2.0,1636.0,5009.0,Saint Paul,2098 Pinehurst Ave,55116,44.918628,-93.189055
4,2025-03-18,425000.0,2025-04-08,475000,4.0,1.0,1750.0,7362.0,Saint Paul,2129 Sargent Ave,55105,44.935472,-93.190323


## Prepare the data

In [42]:
# Review the data type pandas assigned to each column.
# NOTE(review): the saved output shows sold_date as datetime64[ns] although the
# conversion happens in the following cell — this cell appears to have been
# re-run afterwards; Restart & Run All to refresh the output.
print(df.dtypes)

list_date             object
list_price           float64
sold_date     datetime64[ns]
sold_price             int64
beds                 float64
baths                float64
sqft                 float64
lot_sqft             float64
city                  object
street                object
zip                    int64
latitude             float64
longitude            float64
dtype: object


In [None]:
# Parse the sold_date strings into pandas datetimes, then store them back on the frame
sold_dates = pd.to_datetime(df['sold_date'])
df['sold_date'] = sold_dates

# Confirm the new dtype
print(df.dtypes)

list_date             object
list_price           float64
sold_date     datetime64[ns]
sold_price             int64
beds                 float64
baths                float64
sqft                 float64
lot_sqft             float64
city                  object
street                object
zip                    int64
latitude             float64
longitude            float64
dtype: object


In [49]:
# Keep only the rows that have a bedroom count (beds is the model's single feature)
df = df.dropna(axis="index", how="any", subset=["beds"])

In [50]:
# Preview the frame again after removing rows with no bedroom count
df.head(n=5)

Unnamed: 0,list_date,list_price,sold_date,sold_price,beds,baths,sqft,lot_sqft,city,street,zip,latitude,longitude
0,2025-01-31,209900.0,2025-03-31,209900,2.0,1.5,904.0,4704.0,Saint Paul,868 Algonquin Ave,55119,44.967562,-93.017718
1,2024-11-15,374900.0,2024-12-13,391000,3.0,1.5,1534.0,4966.0,Saint Paul,1246 Bayard Ave,55116,44.922167,-93.152861
2,2024-12-20,275000.0,2025-01-10,266800,3.0,1.0,1297.0,6534.0,Saint Paul,967 California Ave W,55117,44.991249,-93.140914
3,2025-01-23,500000.0,2025-02-28,505003,3.0,2.0,1636.0,5009.0,Saint Paul,2098 Pinehurst Ave,55116,44.918628,-93.189055
4,2025-03-18,425000.0,2025-04-08,475000,4.0,1.0,1750.0,7362.0,Saint Paul,2129 Sargent Ave,55105,44.935472,-93.190323


In [51]:
# Scatter plot of the observed sale price against the number of bedrooms.
# The title describes what is actually plotted (observed prices by bedroom
# count), and explicit axis labels let the figure stand on its own.
price_br_plot = df.hvplot.scatter(
    x="beds",
    y="sold_price",
    xlabel="Bedrooms",
    ylabel="Sold price ($)",
    title="Sale Price by Number of Bedrooms"
)
price_br_plot


In [52]:
# scikit-learn expects a 2-D feature matrix, so turn the "beds" column
# into an array with one row per sample and a single column
beds_column = df["beds"].to_numpy()
X = beds_column.reshape(-1, 1)

# Show the first five rows
X[:5]

array([[2.],
       [3.],
       [3.],
       [3.],
       [4.]])

In [53]:
# X is 2-D: one row per property and a single feature column
# (the row count depends on how many rows survive the NaN filter on "beds")
X.shape

(1299, 1)

In [54]:
# Select the dependent variable y (sale price); this is a pandas Series, not a NumPy array
y = df["sold_price"]

## Build a Linear Regression Model

In [55]:
# Instantiate an ordinary least-squares regressor; the intercept is fitted (scikit-learn's default)
model = LinearRegression(fit_intercept=True)

In [56]:
# Train the model: learn the slope and intercept relating bedrooms (X) to sale price (y)
model.fit(X=X, y=y)

In [57]:
# Display the slope. coef_ holds one coefficient per feature, so with the
# single "beds" feature it prints as a one-element array.
print(f"Model's slope: {model.coef_}")

Model's slope: [118652.18671307]


In [58]:
# Display the y-intercept: the fitted line's predicted sale price at zero bedrooms
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: -22688.850005480694


In [59]:
# Display the model's best fit line formula (intercept + slope * X);
# coef_[0] picks the single slope out of the coefficient array
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -22688.850005480694 + 118652.18671306953X


In [60]:
# Number of bedrooms to price. Change this one value to predict for a
# different home; the printed formula, the calculation and the message
# all follow it (previously the 7 was repeated in each place).
n_bedrooms = 7

# Display the formula used for the prediction
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {n_bedrooms}")

# Apply the fitted line by hand: intercept + slope * bedrooms.
# The name y_7 is kept in case a later cell refers to it.
y_7 = model.intercept_ + model.coef_[0] * n_bedrooms

# Display the prediction
print(f"Predicted sale price for a home with {n_bedrooms} bedrooms: ${y_7:.2f}")

Model's formula: y = -22688.850005480694 + 118652.18671306953 * 7
Predicted sale price for a home with 7 bedrooms: $807876.46


In [61]:
# Run the fitted model over the training inputs to get one predicted price per row
predicted_y_values = model.predict(X=X)

In [62]:
# Build a copy of the data that carries the model's predicted sale price
# alongside the original columns (df itself is left untouched)
df_predicted = df.assign(sale_predicted=predicted_y_values)

# Display sample data
df_predicted.head(n=5)

Unnamed: 0,list_date,list_price,sold_date,sold_price,beds,baths,sqft,lot_sqft,city,street,zip,latitude,longitude,sale_predicted
0,2025-01-31,209900.0,2025-03-31,209900,2.0,1.5,904.0,4704.0,Saint Paul,868 Algonquin Ave,55119,44.967562,-93.017718,214615.523421
1,2024-11-15,374900.0,2024-12-13,391000,3.0,1.5,1534.0,4966.0,Saint Paul,1246 Bayard Ave,55116,44.922167,-93.152861,333267.710134
2,2024-12-20,275000.0,2025-01-10,266800,3.0,1.0,1297.0,6534.0,Saint Paul,967 California Ave W,55117,44.991249,-93.140914,333267.710134
3,2025-01-23,500000.0,2025-02-28,505003,3.0,2.0,1636.0,5009.0,Saint Paul,2098 Pinehurst Ave,55116,44.918628,-93.189055,333267.710134
4,2025-03-18,425000.0,2025-04-08,475000,4.0,1.0,1750.0,7362.0,Saint Paul,2129 Sargent Ave,55105,44.935472,-93.190323,451919.896847


In [63]:
# Draw the fitted regression line: predicted sale price against bedroom count.
# Rows are sorted by "beds" first because a line plot joins points in row order;
# with unsorted x-values the path doubles back on itself between bedroom counts.
best_fit_line = df_predicted.sort_values("beds").hvplot.line(
    x = "beds",
    y = "sale_predicted",
    color = "red"
)
best_fit_line

In [64]:
# Overlay the fitted line on the scatter of observed sales; `*` combines the two plots into one figure
price_br_plot * best_fit_line

## Linear Regression Model Assessment

In [66]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [67]:
# Assess the fit on the same data the model was trained on (in-sample metrics).
# Error-based measures first, then the goodness-of-fit measures.
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)  # back in the units of sold_price
r2 = r2_score(y, predicted_y_values)
score = model.score(X, y)  # reported next to r2 for comparison
std = np.std(y)  # NumPy's default ddof=0 (population standard deviation)

# Report each metric on its own line
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.2606587395050354.
The r2 is 0.2606587395050354.
The mean squared error is 38861183584.60544.
The root mean squared error is 197132.4011536547.
The standard deviation is 229263.82827385992.
