In [None]:

#What we did on Wednesday
#import needed libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training data: six houses, with floor area as the only feature and sale price as the target.
# The areas are written as a 6x1 column (one row per house, one feature per row)
# because they are used as the feature matrix for model.fit / model.predict.
square_feet = np.array([[1000], [1500], [2000], [2500], [3000], [3500]])
prices = np.array([200_000, 280_000, 350_000, 430_000, 500_000, 580_000])

# Exploratory Data Analysis (EDA): look at the raw data before modelling it.
fig, ax = plt.subplots()
ax.scatter(square_feet, prices, s=100, color="blue")
ax.set_xlabel("Square Feet")
ax.set_ylabel("Price ($)")
ax.set_title("House Prices vs Square Footage")
plt.show()

# Create a scikit-learn LinearRegression model and fit it to the training data:
# square_feet is the feature matrix, prices is the target.
model = LinearRegression()
model.fit(square_feet,prices)

# Predict on the same houses the model was just trained on, then print each prediction
# next to the actual price. These are training-set predictions, not predictions on unseen data.
predicted_prices = model.predict(square_feet)
# Each `area` is a one-element row of square_feet, hence area[0] to print the number itself.
for area,p_price, price in zip(square_feet,predicted_prices,prices):
  print(f"Home area: {area[0]}ft^2 Predicted Price: ${p_price:,.0f} Actual Price: ${price:,}")

# Visualize the model: overlay the fitted line on the training points.
fig, ax = plt.subplots()
ax.scatter(square_feet, prices, s=100, color="blue", label="Actual")
ax.plot(square_feet, predicted_prices, linewidth=2, color="red", label="Model")
ax.set(xlabel="Square Feet", ylabel="Price ($)", title="Our ML Model")
ax.legend()
plt.show()

# Make a prediction for a house that is not in the training data: 2750 sq ft.
# The input is a 1x1 2-D array (one house, one feature), the same column layout used for training.
new_house = np.array([[2750]])
predicted = model.predict(new_house)
# predicted is indexed with [0] to pull out the single predicted price.
print(f"Predicted price for 2750 sq ft house: ${predicted[0]:,.0f}")

# View and consider what the model learned -- is this a highly "interpretable" model?
# model.coef_[0] is printed as the price per square foot (the slope) and
# model.intercept_ as the constant term of the fitted line.
print(f"The model learned: Price = ${model.coef_[0]:.2f} x square feet + ${model.intercept_:.2f}")


In [None]:

# Calculate how wrong we are: error = actual price - predicted price for each house,
# so a positive error means the model predicted too low and a negative error too high.
errors = prices - predicted_prices
# Print one line per house (numbered from 1) with its area, actual price, prediction and error.
for i, (sf, actual, pred, err) in enumerate(zip(square_feet, prices, predicted_prices, errors)):
    print(f"House {i+1}: {sf[0]} sq ft - Actual: ${actual:,}, Predicted: ${pred:,.0f}, Error: ${err:,.0f}")

# Mean squared error (MSE): the average of the squared errors. Squaring changes the
# units to dollars squared, so the value is not a dollar amount and gets no "$" prefix.
mse = np.mean(errors**2)
print(f"Mean Squared Error: {mse:,.0f} (squared dollars)")

# Root mean squared error (RMSE): taking the square root brings the metric back to
# dollars, so it can be read on the same scale as the house prices.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: ${rmse:,.0f}")


# Visualize errors in the left-hand panel of a 1x2 grid; the right-hand panel is left free.
# plt.subplot also makes this panel the current axes for any later pyplot calls.
error_ax = plt.subplot(1, 2, 1)
# zorder=3 keeps the actual-price markers drawn on top of the lines.
error_ax.scatter(square_feet, prices, s=100, color="blue", label="Actual", zorder=3)
error_ax.plot(square_feet, predicted_prices, linewidth=2, color="red", label="Predicted")
# Draw error lines: one dashed vertical segment per house, from actual price to predicted price.
for (sqft,), actual_price, predicted_price in zip(square_feet, prices, predicted_prices):
    error_ax.plot([sqft, sqft], [actual_price, predicted_price], "k--", alpha=0.5, linewidth=3)
error_ax.set_xlabel("Square Feet")
error_ax.set_ylabel("Price ($)")
error_ax.set_title("Prediction Errors")
error_ax.legend()

#now let's plot a bar plot with actual vs predicted ...




In [None]:
#Training vs Testing

# Simulate a larger dataset for the demo. Seeding NumPy's global generator first
# means every run draws the same 30 houses.
np.random.seed(42)
# 30 integer floor areas in [800, 4000), drawn directly as a 30x1 feature column.
all_square_feet = np.random.randint(low=800, high=4000, size=(30, 1))
# True relationship with some noise: 170 per square foot plus a 50,000 base price,
# with Gaussian noise (mean 0, standard deviation 25,000) added to each house.
all_prices = 170 * all_square_feet.ravel() + 50000 + np.random.normal(loc=0, scale=25000, size=30)

# Visualize all our data; alpha=0.6 makes the markers semi-transparent so overlapping points stay visible.
fig, ax = plt.subplots()
ax.scatter(all_square_feet, all_prices, alpha=0.6)
ax.set(
    xlabel="Square Feet",
    ylabel="Price ($)",
    title="Our Full Dataset (30 houses)",
)
plt.show()

# now we'll get into train/test split

In [None]:

# Now with multiple features: one column per feature, one row per house.
X_multi = np.column_stack((
    [1000, 1500, 2000, 2500, 3000, 3500],  # square feet
    [2, 3, 3, 4, 4, 5],                    # bedrooms
    [30, 20, 10, 5, 2, 1],                 # age (years)
))
# Sale prices for the same six houses, in the same row order as X_multi.
prices = np.array([200_000, 280_000, 350_000, 430_000, 500_000, 580_000])

#Train a multi-feature model and make predictions


In [None]:
#getting tricked by our model, i.e. overfitting


from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Generate a small, noisy dataset. The seed is fixed so the same samples come out on every run.
np.random.seed(42)
n_samples = 30


def true_fun(X):
    """Return the noise-free target curve cos(1.5 * pi * X)."""
    return np.cos(1.5 * np.pi * X)


# Inputs: n_samples uniform draws from [0, 1), sorted ascending and stored as a single-feature column.
X = np.sort(np.random.rand(n_samples))[:, np.newaxis]
# Targets: the true curve at each input plus Gaussian noise scaled to a standard deviation of 0.1.
y = true_fun(X).ravel() + 0.1 * np.random.randn(n_samples)


#Now let's trick our model
