# Importing Data

In [1]:
import numpy as np
import pandas as pd

In [12]:
df = pd.read_csv('/content/housing.csv')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


# Finding Null Values in Dataset

In [13]:
df.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,207
population,0
households,0
median_income,0
median_house_value,0
ocean_proximity,0


In [14]:
df['total_bedrooms'].describe() # cheking the range of values

Unnamed: 0,total_bedrooms
count,20433.0
mean,537.870553
std,421.38507
min,1.0
25%,296.0
50%,435.0
75%,647.0
max,6445.0


fixing null values by taking a mean

In [15]:
mean = df['total_bedrooms'].mean()
df['total_bedrooms'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(mean, inplace=True)


In [17]:
df['total_bedrooms'].isnull().sum() #Verifying if null is removed

np.int64(0)

# Encoding Categorical Variables

In [18]:
df['ocean_proximity'].value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,9136
INLAND,6551
NEAR OCEAN,2658
NEAR BAY,2290
ISLAND,5


Here we shall use an ordinal encoding because the proximity to the ocean is directly proportial to cost, hence the category is progressive

In [23]:
from sklearn.preprocessing import OrdinalEncoder

ordered_categories = ['<1H OCEAN',
'INLAND',
'NEAR OCEAN',
'NEAR BAY',
'ISLAND']

encoder = OrdinalEncoder(categories=[ordered_categories])

df['ocean_proximity'] = encoder.fit_transform(df[['ocean_proximity']])

In [24]:
df['ocean_proximity'].value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
0.0,9136
1.0,6551
2.0,2658
3.0,2290
4.0,5


# Splitting the Test and Train Datasets

In [25]:
from sklearn.model_selection import train_test_split

X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building Baseline linear regression model

In [26]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Testing Accuracy with test

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

lr = model
y_train_pred = model.predict(X_train)

# Calculate RMSE for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Calculate R2 score for the training set
r2_train = r2_score(y_train, y_train_pred)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Calculate RMSE for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Calculate MAE for the test set
mae_test = mean_absolute_error(y_test, y_test_pred)

# Calculate R2 score for the test set i.e accuracy
r2_test = r2_score(y_test, y_test_pred)

print(f"RMSE (Train): {rmse_train:.2f}")
print(f"RMSE (Test): {rmse_test:.2f}")
print(f"MAE (Test): {mae_test:.2f}")
print(f"R2 Score (Train): {r2_train:.2f}")
print(f"R2 Score (Test): {r2_test:.2f}")

RMSE (Train): 69339.27
RMSE (Test): 71129.36
MAE (Test): 51858.55
R2 Score (Train): 0.64
R2 Score (Test): 0.61


# Alright we are getting an ok score of 64% accuary with the train data so now lets train and test the other models

# Ridge Regression Model and testing

In [34]:
from sklearn.linear_model import Lasso

# Initialize the Lasso Regression model with a higher alpha value
model = Lasso(alpha=10)

# Train the model
model.fit(X_train, y_train)

In [36]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

rr = model
y_train_pred = model.predict(X_train)

# Calculate RMSE for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Calculate R2 score for the training set
r2_train = r2_score(y_train, y_train_pred)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Calculate RMSE for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Calculate MAE for the test set
mae_test = mean_absolute_error(y_test, y_test_pred)

# Calculate R2 score for the test set i.e accuracy
r2_test = r2_score(y_test, y_test_pred)

print(f"RMSE (Train): {rmse_train:.2f}")
print(f"RMSE (Test): {rmse_test:.2f}")
print(f"MAE (Test): {mae_test:.2f}")
print(f"R2 Score (Train): {r2_train:.2f}")
print(f"R2 Score (Test): {r2_test:.2f}")

RMSE (Train): 69339.28
RMSE (Test): 71128.04
MAE (Test): 51857.39
R2 Score (Train): 0.64
R2 Score (Test): 0.61


# Decision  Tree

In [37]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the Decision Tree Regressor model
model = DecisionTreeRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

In [38]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

dt = model
y_train_pred = model.predict(X_train)

# Calculate RMSE for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Calculate R2 score for the training set
r2_train = r2_score(y_train, y_train_pred)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Calculate RMSE for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Calculate MAE for the test set
mae_test = mean_absolute_error(y_test, y_test_pred)

# Calculate R2 score for the test set i.e accuracy
r2_test = r2_score(y_test, y_test_pred)

print(f"RMSE (Train): {rmse_train:.2f}")
print(f"RMSE (Test): {rmse_test:.2f}")
print(f"MAE (Test): {mae_test:.2f}")
print(f"R2 Score (Train): {r2_train:.2f}")
print(f"R2 Score (Test): {r2_test:.2f}")

RMSE (Train): 0.00
RMSE (Test): 68872.48
MAE (Test): 43657.26
R2 Score (Train): 1.00
R2 Score (Test): 0.64


# Outputs

## Compare Model Performance



In [40]:
model_performance = {
    'Linear Regression': {
        'RMSE (Train)': 69339.27,
        'RMSE (Test)': 71129.36,
        'MAE (Test)': 51858.55,
        'R2 Score (Train)': 0.64,
        'R2 Score (Test)': 0.61
    },
    'Lasso Regression': {
        'RMSE (Train)': 69339.28,
        'RMSE (Test)': 71128.04,
        'MAE (Test)': 51857.39,
        'R2 Score (Train)': 0.64,
        'R2 Score (Test)': 0.61
    },
    'Decision Tree': {
        'RMSE (Train)': 0.00,
        'RMSE (Test)': 68872.48,
        'MAE (Test)': 43657.26,
        'R2 Score (Train)': 1.00,
        'R2 Score (Test)': 0.64
    }
}

performance_df = pd.DataFrame(model_performance).T
print(performance_df)

                   RMSE (Train)  RMSE (Test)  MAE (Test)  R2 Score (Train)  \
Linear Regression      69339.27     71129.36    51858.55              0.64   
Lasso Regression       69339.28     71128.04    51857.39              0.64   
Decision Tree              0.00     68872.48    43657.26              1.00   

                   R2 Score (Test)  
Linear Regression             0.61  
Lasso Regression              0.61  
Decision Tree                 0.64  


## Model Performance Analysis

### Linear Regression and Lasso Regression:
Both Linear Regression and Lasso Regression show similar performance metrics. Their R2 scores for both training (0.64) and testing (0.61) are relatively close, indicating a consistent, but moderate, fit to the data. The RMSE values are also quite similar between train and test sets, suggesting neither model is severely overfit or underfit. However, the overall R2 score of 0.61 suggests there's still room for improvement in capturing the variance in the target variable, indicating a slight underfitting tendency, as more complex relationships might be missed.

### Decision Tree:
The Decision Tree model exhibits a perfect R2 Score (Train) of 1.00 and an RMSE (Train) of 0.00. This indicates significant overfitting, as the model has learned the training data too well, including its noise. Consequently, its performance on the unseen test data, while slightly better than the linear models (R2 Score (Test): 0.64, RMSE (Test): 68872.48), is substantially worse than its training performance. This large gap between training and testing metrics is a classic sign of overfitting, where the model struggles to generalize to new data.

## Comment on at least one real-world ML issue (e.g., noisy features, outliers, non-linearity, dataset bias)

The dataset exhibits dataset bias due to severe under-representation of island regions: almost all samples come from mainland census tracts, while islands appear only rarely or not at all. As a result, models trained on this data learn relationships driven by mainland housing markets—such as income, proximity to the coast, and urban density—and implicitly assume these patterns apply everywhere. When asked to predict prices for island locations, the model produces systematically biased estimates because island housing follows a different data-generating process (scarce land, isolation, tourism-driven demand) that the training data does not capture. This is a representation bias, where good overall accuracy masks consistently poor performance on a small but distinct subgroup.

# A brief note (5–8 lines) explaining:


1.   where underfitting occurred (high bias)
2.   where overfitting occurred (high variance)




**Underfitting (High Bias):** The Linear Regression and Lasso Regression models exhibited signs of underfitting. Their R2 scores were moderate (0.64 train, 0.61 test) and similar for both training and test sets. This suggests they failed to capture the complexity of the data, leading to a high bias and not learning the underlying patterns sufficiently.

**Overfitting (High Variance):** The Decision Tree model severely overfit the training data, achieving a perfect R2 score of 1.00 on the training set but a significantly lower 0.64 on the test set. This large discrepancy indicates high variance, where the model learned the training data too well, including its noise, and thus struggled to generalize to unseen data.