# Task 1

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

fake = Faker()

n = 500  

data = {
    "square_feet": np.random.randint(300, 2000, n),
    "bedrooms": np.random.randint(1, 5, n),
    "age_years": np.random.randint(0, 40, n)
}

df = pd.DataFrame(data)


df["rent_price"] = (
    df["square_feet"] * 1.8 +
    df["bedrooms"] * 150 +
    (40 - df["age_years"]) * 20 +      
    np.random.normal(0, 250, n)       
)

print(df.head())


X = df[["square_feet", "bedrooms", "age_years"]]
y = df["rent_price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("=="*40)
print("R^2 Score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


   square_feet  bedrooms  age_years   rent_price
0         1069         3         27  2931.364678
1         1717         3         35  3579.991396
2         1216         4         30  2956.599961
3         1751         3         14  3764.080433
4          374         4         22  2077.724439
R^2 Score: 0.9092906137372576
MAE: 217.89368540651833
MSE: 68683.74644450874
RMSE: 262.07584101650565


# Task 2

In [9]:
# the dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

n = 600  

df = pd.DataFrame({
    "square_feet": np.random.randint(500, 5000, n),
    "lot_size_acres": np.random.uniform(0.1, 2.5, n),
    "num_bedrooms": np.random.randint(1, 7, n),
    "distance_to_downtown_km": np.random.uniform(0.5, 45, n)
})

df["house_price"] = (
    df["square_feet"] * 250 +
    df["lot_size_acres"] * 50000 +
    df["num_bedrooms"] * 8000 -
    df["distance_to_downtown_km"] * 1500 +
    np.random.normal(0, 20000, n)   # noise
)

df.head()

# without scaling

X = df[["square_feet", "lot_size_acres", "num_bedrooms", "distance_to_downtown_km"]]
y = df["house_price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

model_no_scaling = LinearRegression()
model_no_scaling.fit(X_train, y_train)

print('without scaling coefficients:\n')   
for feature, coef in zip(X.columns, model_no_scaling.coef_):
    print(f"{feature}: {coef}")


y_pred_no_scaling = model_no_scaling.predict(X_test)
r2_no_scaling = r2_score(y_test, y_pred_no_scaling)
print("\nR^2 without scaling:", r2_no_scaling)

# with scaling

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled, y_train)

print("\nwith scaling: ")
for feature, coef in zip(X.columns, model_scaled.coef_):
    print(f"{feature}: {coef}")

y_pred_scaled = model_scaled.predict(X_test_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)
print("\nR^2 with scaling:", r2_scaled)




without scaling coefficients:

square_feet: 249.13068983435454
lot_size_acres: 51852.517172439286
num_bedrooms: 8064.835587162992
distance_to_downtown_km: -1483.561999853724

R^2 without scaling: 0.995991981908693

with scaling: 
square_feet: 328866.97302860353
lot_size_acres: 36216.226780862955
num_bedrooms: 14215.34513346119
distance_to_downtown_km: -18558.927368861692

R^2 with scaling: 0.995991981908693


# Task 3:

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



n = 800

np.random.seed(42)

df = pd.DataFrame({
    "square_footage": np.random.randint(600, 5000, n),
    "num_bedrooms": np.random.randint(1, 7, n),
    "house_age": np.random.randint(0, 50, n),
    "distance_to_city_center": np.random.uniform(0.5, 40, n),
    "neighborhood_type": np.random.choice(["A", "B", "C"], n)
})

df["price"] = (
    df["square_footage"] * 220 +
    df["num_bedrooms"] * 15000 -
    df["house_age"] * 1200 -
    df["distance_to_city_center"] * 900 +
    df["neighborhood_type"].map({"A": 60000, "B": 30000, "C": 0}) +
    np.random.normal(0, 30000, n)   # noise
)

df.head()


X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

categorical = ["neighborhood_type"]
numeric = ["square_footage", "num_bedrooms", "house_age", "distance_to_city_center"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numeric)
])

A_train = X_train[["square_footage"]]
A_test = X_test[["square_footage"]]

model_A = LinearRegression()
model_A.fit(A_train, y_train)

y_pred_train_A = model_A.predict(A_train)
y_pred_test_A = model_A.predict(A_test)

print("\nMODEL A — Simple Linear Regression")
print("Train R^2:", r2_score(y_train, y_pred_train_A))
print("Test R^2:", r2_score(y_test, y_pred_test_A))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test_A)))

model_B = Pipeline([
    ("prep", preprocessor),
    ("reg", LinearRegression())
])

model_B.fit(X_train, y_train)

y_pred_train_B = model_B.predict(X_train)
y_pred_test_B = model_B.predict(X_test)

print("\nMODEL B — Multiple Linear Regression")
print("Train R^2:", r2_score(y_train, y_pred_train_B))
print("Test R^2:", r2_score(y_test, y_pred_test_B))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test_B)))

poly = PolynomialFeatures(degree=3, include_bias=False)

model_C = Pipeline([
    ("prep", preprocessor),
    ("poly", poly),
    ("reg", LinearRegression())
])

model_C.fit(X_train, y_train)

y_pred_train_C = model_C.predict(X_train)
y_pred_test_C = model_C.predict(X_test)

print("\nMODEL C — Polynomial Regression (degree 3)")
print("Train R^2:", r2_score(y_train, y_pred_train_C))
print("Test R^2:", r2_score(y_test, y_pred_test_C))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test_C)))



MODEL A — Simple Linear Regression
Train R^2: 0.963986358214895
Test R^2: 0.9676520401525914
RMSE: 50602.74881512229

MODEL B — Multiple Linear Regression
Train R^2: 0.9891104587805102
Test R^2: 0.986843988165227
RMSE: 32271.02717468601

MODEL C — Polynomial Regression (degree 3)
Train R^2: 0.989924636236232
Test R^2: 0.9871045272712045
RMSE: 31949.884682407137


## Why is cross-validation more reliable than a single train-test split?
### A single split depends heavily on how the data was divided

### It may accidentally create:

### An easy test set => overestimation of accuracy

### A hard test set => underestimation

### Cross-validation trains on multiple different splits and averages performance

## Model C has highest mean score but also highest std (0.085). What does that mean? Which model is better for production?

### Model C performs extremely well on some folds but terribly on others

### High standard deviation = unstable, inconsistent performance

### This means Model C overfits

### we should use Lasso, because:

### It has good accuracy (0.81 mean)

### It has the smallest std (0.011) → most stable

### It generalizes better

### It is more robust to noise

## CV mean = 0.81 but test R^2 = 0.77. Should you be concerned? Why do they differ?

### No small differences are normal.

### The test set may contain patterns not seen in training folds

### andom variation in the split

### The cross-validation average may be optimistic

## Code to perform 5-fold CV and print scores

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

categorical = ["neighborhood_type"]
numeric = [col for col in X.columns if col not in categorical]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numeric)
])

model = Pipeline([
    ("prep", preprocessor),
    ("reg", LinearRegression())
])

scores = cross_val_score(model, X, y, cv=5, scoring="r2")

print("Fold scores:", scores)
print("Mean:", scores.mean())
print("Std:", scores.std())


Fold scores: [0.98673037 0.98975909 0.98834981 0.98793627 0.98865001]
Mean: 0.9882851099630819
Std: 0.0009847703249073398


#  Task 3.3 – Final Model Interpretation

Given:
- Final model R² = 0.80  
- RMSE = $45,000  
- Average house price = $350,000  

---

## **1. Explain what R² = 0.80 means in practical terms.  
Is the model accurate enough for business decisions?  
What additional information is needed?**

An R² value of **0.80** means the model explains **80% of the variation in house prices**.  
In practical (non-technical) terms:

> “The model captures most of the important factors that influence house prices.  
> It correctly identifies general pricing patterns but does not explain everything.”

**Is this accurate enough?**  
It depends on the application:

- For **quick estimates** or **trend analysis**, R² = 0.80 is quite good.  
- For **high-stakes decisions** (appraisals, investments), it may not be accurate enough.

**Additional information needed:**
- What error level is acceptable to the business?  
- Does the model systematically overprice or underprice certain neighborhoods?  
- How large is the worst-case prediction error?  
- Are certain types of houses predicted poorly?

Without this context, R² alone cannot determine business usefulness.

---

## 2. RMSE = $45,000. Explain what this means and whether it is acceptable.**

RMSE represents the **average size of the errors** the model makes when predicting house prices.

> RMSE = $45,000 means the model’s predictions are typically off by about **$45K per house**.

Compared to the average home price ($350,000):

\[
\frac{45,000}{350,000} \approx 0.128 = 12.8\%
\]

So the model has roughly **13% error**.

**Is this acceptable?**
- For rough price estimation → probably yes  
- For mortgage decisions or exact pricing → probably not  
- Depends on the risk tolerance of the company  

A real estate company may require lower error for accurate appraisals.

---

## 3. RMSE increases from $45,000 to $65,000 after 6 months.  
Give three reasons why performance may degrade over time.**

### **1. Data Drift / Market Changes**
Housing markets change due to interest rates, inflation, economic shifts, etc.  
The relationships the model learned no longer hold.

### **2. Concept Drift**
The importance of features changes.  
Examples:
- Buyers suddenly value larger homes more (post-COVID effect)  
- Distance to downtown becomes less relevant  
- Neighborhood popularity changes  

### **3. Model Not Updated**
The model was trained once and never retrained.  
Real-world data evolves, and without retraining, the model becomes outdated.



