In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the California housing dataset from the notebook's working directory.
Houses = pd.read_csv(filepath_or_buffer="California_Houses.csv")

# Bare last expression: the notebook renders a preview of the frame.
Houses

Unnamed: 0,Median_House_Value,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco
0,452600.0,8.3252,41,880,129,322,126,37.88,-122.23,9263.040773,556529.158342,735501.806984,67432.517001,21250.213767
1,358500.0,8.3014,21,7099,1106,2401,1138,37.86,-122.22,10225.733072,554279.850069,733236.884360,65049.908574,20880.600400
2,352100.0,7.2574,52,1467,190,496,177,37.85,-122.24,8259.085109,554610.717069,733525.682937,64867.289833,18811.487450
3,341300.0,5.6431,52,1274,235,558,219,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
4,342200.0,3.8462,52,1627,280,565,259,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,78100.0,1.5603,25,1665,374,845,330,39.48,-121.09,162031.481121,654530.186299,830631.543047,248510.058162,222619.890417
20636,77100.0,2.5568,18,697,150,356,114,39.49,-121.21,160445.433537,659747.068444,836245.915229,246849.888948,218314.424634
20637,92300.0,1.7000,17,2254,485,1007,433,39.43,-121.22,153754.341182,654042.214020,830699.573163,240172.220489,212097.936232
20638,84700.0,1.8672,18,1860,409,741,349,39.43,-121.32,152005.022239,657698.007703,834672.461887,238193.865909,207923.199166


In [27]:
# Separate the regression target from the predictors.
# Y is the column to predict; X is every remaining column of `Houses`.
Y = Houses["Median_House_Value"]
X = Houses.drop("Median_House_Value", axis=1)

In [29]:
# Keep only the rows that contain at least one missing value.
# The recorded output is an empty frame, i.e. there are no missing values.
rows_with_missing = Houses.isna().any(axis="columns")
Houses[rows_with_missing]

Unnamed: 0,Median_House_Value,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco


In [31]:
# Pairwise Pearson correlation (pandas' default method) between all columns,
# used to see how each column relates to Median_House_Value and to the others.
Houses.corr(method="pearson")

Unnamed: 0,Median_House_Value,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco
Median_House_Value,1.0,0.688075,0.105623,0.134153,0.050594,-0.02465,0.065843,-0.14416,-0.045967,-0.46935,-0.130678,-0.09251,-0.04159,-0.030559
Median_Income,0.688075,1.0,-0.119034,0.19805,-0.008093,0.004834,0.013033,-0.079809,-0.015176,-0.243443,-0.065421,-0.055253,-0.036796,-0.022424
Median_Age,0.105623,-0.119034,1.0,-0.361262,-0.320485,-0.296244,-0.302916,0.011173,-0.108197,-0.226621,-0.031435,0.036113,-0.089753,-0.101447
Tot_Rooms,0.134153,0.19805,-0.361262,1.0,0.929893,0.857126,0.918484,-0.0361,0.044568,-0.00154,-0.019777,-0.038935,0.031862,0.03292
Tot_Bedrooms,0.050594,-0.008093,-0.320485,0.929893,1.0,0.878026,0.979829,-0.066318,0.068378,-0.02229,-0.055809,-0.067627,0.059663,0.06025
Population,-0.02465,0.004834,-0.296244,0.857126,0.878026,1.0,0.907222,-0.108785,0.099773,-0.040273,-0.110407,-0.109691,0.079117,0.088618
Households,0.065843,0.013033,-0.302916,0.918484,0.979829,0.907222,1.0,-0.071035,0.05531,-0.062035,-0.062118,-0.069091,0.047861,0.050157
Latitude,-0.14416,-0.079809,0.011173,-0.0361,-0.066318,-0.108785,-0.071035,1.0,-0.924664,0.303645,0.941857,0.991571,-0.855075,-0.897397
Longitude,-0.045967,-0.015176,-0.108197,0.044568,0.068378,0.099773,0.05531,-0.924664,1.0,0.007865,-0.892026,-0.958315,0.923991,0.954864
Distance_to_coast,-0.46935,-0.243443,-0.226621,-0.00154,-0.02229,-0.040273,-0.062035,0.303645,0.007865,1.0,0.197672,0.214531,-0.077524,-0.068248


In [41]:
# Two-stage split: hold out 30% of the rows first, then cut that hold-out in
# half, giving 70% train / 15% validation / 15% test.
# random_state is fixed so the partition is the same on every run.
x_train, x_, y_train, y_ = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)
x_cv, x_test, y_cv, y_test = train_test_split(
    x_,
    y_,
    test_size=0.5,
    random_state=1,
)

# Report the size of each partition, one line per split.
print(
    f"Train set: {len(x_train)} rows",
    f"Validation set: {len(x_cv)} rows",
    f"Test set: {len(x_test)} rows",
    sep="\n",
)

Train set: 14448 rows
Validation set: 3096 rows
Test set: 3096 rows


### 🔸 **L1 Regularization (Lasso)**

**Penalty term**:  
$\lambda \sum |\beta_j| \quad \text{(absolute values)}$

**Effects**:
- Can **shrink some coefficients to exactly zero**
- Performs **feature selection** (i.e., selects a simpler model)

---

### 🔹 **L2 Regularization (Ridge)**

**Penalty term**:  
$\lambda \sum \beta_j^2 \quad \text{(squares)}$

**Effects**:
- Shrinks all coefficients **closer to zero**, but **none go exactly to zero**
- Helps prevent overfitting, but **keeps all features**
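- Leads to **smooth, stable solutions**: the weight is spread across correlated features instead of being concentrated on one of them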


In [22]:
# Baseline: ordinary least squares, no regularisation.
linear_model = LinearRegression()

# Regularised variants; `alpha` is the strength of the penalty term.
# max_iter=5000 caps the number of iterations of the Lasso solver.
lasso_model = Lasso(max_iter=5000, alpha=1.0)
ridge_model = Ridge(alpha=1.0)

In [53]:
# Candidate models keyed by the display name used in the result tables.
# The estimators are the instances configured in the previous cell
# (linear_model, ridge_model, lasso_model), so each hyperparameter is set in
# exactly one place instead of being repeated here.
models = {
    'Linear Regression': linear_model,
    'Ridge Regression': ridge_model,
    'Lasso Regression': lasso_model,
}

In [63]:
# Fit every candidate on the training split and score it on the validation
# split. `results` stores the fitted estimator next to its metrics so later
# cells can reuse it without refitting.
results = {}
for name in models:
    model = models[name]
    model.fit(x_train, y_train)

    # Predictions are rounded to whole numbers before scoring.
    y_cv_pred = model.predict(x_cv).round()

    results[name] = {
        'model': model,
        'MSE': mean_squared_error(y_cv, y_cv_pred),
        'MAE': mean_absolute_error(y_cv, y_cv_pred),
    }

In [71]:
# Fixed-width text table of the validation metrics collected in `results`.
# Columns are left-aligned: 20 characters for the name, 15 for each metric.
header_fmt = "{:<20} {:<15} {:<15}"
row_fmt = "{:<20} {:<15.4f} {:<15.4f}"

print("Model Performance on validation Set:")
print(header_fmt.format('Model', 'MSE', 'MAE'))
print("-" * 50)
for name in results:
    metrics = results[name]
    print(row_fmt.format(name, metrics['MSE'], metrics['MAE']))

Model Performance on validation Set:
Model                MSE             MAE            
--------------------------------------------------
Linear Regression    4825652900.2070 50353.1857     
Ridge Regression     4825631471.4309 50353.4612     
Lasso Regression     4825647045.3847 50353.2988     


In [None]:
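# Expected structure of `final_results` (built in the next cell), e.g.:
# final_results = {
#     name: {
#         'Test MSE': <MSE of the model on the test set>,
#         'Test MAE': <MAE of the model on the test set>,
#     }
# }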

In [65]:
# Score the already-fitted models stored in `results` on the held-out test
# split; nothing is refitted here.
final_results = {}
for name, entry in results.items():
    model = entry['model']

    # Predictions are rounded to whole numbers before scoring.
    y_test_pred = model.predict(x_test).round()

    final_results[name] = {
        'Test MSE': mean_squared_error(y_test, y_test_pred),
        'Test MAE': mean_absolute_error(y_test, y_test_pred),
    }

In [67]:
# Fixed-width text table of the test metrics collected in `final_results`.
# Columns are left-aligned: 20 characters for the name, 15 for each metric.
header_fmt = "{:<20} {:<15} {:<15}"
row_fmt = "{:<20} {:<15.4f} {:<15.4f}"

print("Model Performance on Test Set:")
print(header_fmt.format('Model', 'MSE', 'MAE'))
print("-" * 50)
for name in final_results:
    metrics = final_results[name]
    print(row_fmt.format(name, metrics['Test MSE'], metrics['Test MAE']))

Model Performance on Test Set:
Model                MSE             MAE            
--------------------------------------------------
Linear Regression    4668754168.8530 49746.7290     
Ridge Regression     4668667683.1554 49746.7561     
Lasso Regression     4668721485.5898 49746.7313     


In [75]:
# Summary statistics of the target column, for judging the size of the errors.
Houses["Median_House_Value"].describe()

# In the recorded run the test MAE (~49.7k) and the test RMSE (~68k, the
# square root of the test MSE) are both below the target's standard
# deviation (~115k).

count     20640.000000
mean     206855.816909
std      115395.615874
min       14999.000000
25%      119600.000000
50%      179700.000000
75%      264725.000000
max      500001.000000
Name: Median_House_Value, dtype: float64