In [27]:
import pandas as pd
import numpy as np

# Read the housing table from the CSV next to the notebook; the bare
# expression on the last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer="housing.csv")
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


Prepare the dataset

In [28]:
# Columns kept for the rest of the notebook (eight features plus the target).
columns_to_use = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value'
]

# Row filter: keep only records whose ocean_proximity is one of the two listed
# categories, then select the columns above in a single .loc lookup.
is_selected_proximity = df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])
df = df.loc[is_selected_proximity, columns_to_use]

Question 1
There's one feature with missing values. What is it?

total_rooms
total_bedrooms
population
households

In [29]:
# Count NaN entries per column, then keep only the columns that have any.
missing_values = df.isnull().sum()

has_missing = missing_values > 0
columns_with_missing_values = missing_values[has_missing]
print(columns_with_missing_values)

total_bedrooms    157
dtype: int64


Question 2
What's the median (50th percentile) of the variable 'population'?

995
1095
1195
1295

In [30]:
# Median of the 'population' column of the filtered frame.
population_column = df['population']
population_median = population_column.median()
print(population_median)

1195.0


Prepare and split the dataset
Shuffle the dataset (the filtered one you created above), use seed 42.
Split your data in train/val/test sets, with 60%/20%/20% distribution.
Apply the log transformation to the median_house_value variable using the np.log1p() function.

In [33]:
# Shuffle all rows reproducibly. sample() already returns a new frame, so no
# extra .copy() is needed before slicing.
df_shuffled = df.sample(frac=1, random_state=42)
n = len(df_shuffled)

# 20% validation, 20% test; training takes whatever remains so the three
# sizes always add up to n despite the int() truncation.
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Consecutive slices of the shuffled frame, each re-indexed from 0.
df_train = df_shuffled.iloc[:n_train].reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:n_train+n_val].reset_index(drop=True)
df_test = df_shuffled.iloc[n_train+n_val:].reset_index(drop=True)

# Replace the target with log1p(target) in every split.
for split_frame in (df_train, df_val, df_test):
    split_frame['median_house_value'] = np.log1p(split_frame['median_house_value'])

df_train.head()


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,36.23,-119.14,22.0,2935.0,523.0,1927.0,530.0,2.5875,11.161963
1,34.12,-117.79,16.0,2426.0,426.0,1319.0,446.0,4.8125,12.321635
2,33.68,-117.97,26.0,3653.0,568.0,1930.0,585.0,5.7301,12.471896
3,34.1,-118.03,32.0,2668.0,609.0,1512.0,541.0,2.9422,12.359227
4,37.34,-121.87,39.0,2479.0,541.0,1990.0,506.0,2.4306,12.574531


In [40]:
def train_linear_regression(X, y):
    """Fit a linear model by solving the normal equation.

    A column of ones is prepended to ``X`` so the first fitted weight acts as
    the bias term; the weights are computed as ``inv(X^T X) X^T y``.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(bias, weights)``: the first fitted coefficient and the remaining
        coefficients (one per column of ``X``).
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = design.T.dot(design)
    coefficients = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coefficients[0], coefficients[1:]
    return bias, weights

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    residuals = y - y_pred
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear model via the normal equation with a regularised Gram matrix.

    A column of ones is prepended to ``X`` for the bias term, and ``r`` times
    the identity matrix is added to ``X^T X`` before it is inverted. Because
    the identity covers the whole matrix, the bias entry is regularised too.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.001
        Amount added to every diagonal entry of ``X^T X``.

    Returns
    -------
    tuple
        ``(bias, weights)``: the first fitted coefficient and the remaining
        coefficients (one per column of ``X``).
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    coefficients = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coefficients[0], coefficients[1:]
    return bias, weights

Question 3
We need to deal with missing values for the column from Q1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lessons.
For computing the mean, use the training set only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?
Options:

With 0
With mean
Both are equally good

1st option - fill with 0

In [36]:
# Targets for the training and validation splits.
y_train = df_train['median_house_value'].values
y_val = df_val['median_house_value'].values

# Training features with missing total_bedrooms replaced by 0; assign()
# returns a new frame, so df_train itself is left untouched.
train_zero_filled = df_train.assign(total_bedrooms=df_train["total_bedrooms"].fillna(0))
X_train_zero = train_zero_filled.drop(columns='median_house_value').values
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train)

# Same imputation for the validation features, then score the fitted model.
val_zero_filled = df_val.assign(total_bedrooms=df_val["total_bedrooms"].fillna(0))
X_val_zero = val_zero_filled.drop(columns='median_house_value').values
y_pred_zero = X_val_zero.dot(w_zero) + w0_zero
rmse_zero = rmse(y_val, y_pred_zero)
rmse_zero

0.3408479034165669

2nd option - fill with mean

In [37]:
# The fill value is the mean of the training split only; the same value is
# reused for the validation split so no validation data leaks into the fit.
mean_value = df_train["total_bedrooms"].mean()
train_mean_filled = df_train.copy()
train_mean_filled["total_bedrooms"] = train_mean_filled["total_bedrooms"].fillna(mean_value)
X_train_mean = train_mean_filled.drop(columns='median_house_value').values
# y_train and y_val are not defined in this cell; they must already exist in
# the kernel when it runs.
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

val_mean_filled = df_val.copy()
val_mean_filled["total_bedrooms"] = val_mean_filled["total_bedrooms"].fillna(mean_value)
X_val_mean = val_mean_filled.drop(columns='median_house_value').values
y_pred_mean = w0_mean + X_val_mean.dot(w_mean)
# Store the full-precision score, exactly as rmse_zero is stored; rounding to
# two decimals belongs to the reporting step, not to the stored value.
rmse_mean = rmse(y_val, y_pred_mean)
rmse_mean

0.3405699801474657

In [39]:
# Report both validation scores rounded to two decimals.
for label, value in (
    ("RMSE with zero filling:", rmse_zero),
    ("RMSE with mean filling:", rmse_mean),
):
    print(label, round(value, 2))


RMSE with zero filling: 0.34
RMSE with mean filling: 0.34


Question 4
Now let's train a regularized linear regression.
For this question, fill the NAs with 0.
Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
Use RMSE to evaluate the model on the validation dataset.
Round the RMSE scores to 2 decimal digits.
Which r gives the best RMSE?
If there are multiple options, select the smallest r.

Options:

0
0.000001
0.001
0.0001

In [44]:
# Regularisation strengths to compare, in ascending order.
r_values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# Zero-impute total_bedrooms in both splits; assign() returns new frames.
train_zero_filled = df_train.assign(total_bedrooms=df_train["total_bedrooms"].fillna(0))
val_zero_filled = df_val.assign(total_bedrooms=df_val["total_bedrooms"].fillna(0))

X_train_zero = train_zero_filled.drop(columns='median_house_value').values
X_val_zero = val_zero_filled.drop(columns='median_house_value').values

# One rounded validation RMSE per r, stored in the same order as r_values.
rmse_scores = []
for r in r_values:
    w0, w = train_linear_regression_reg(X_train_zero, y_train, r=r)
    y_pred = X_val_zero.dot(w) + w0
    score = rmse(y_val, y_pred)
    rmse_scores.append(round(score, 2))

# list.index() returns the first position of the minimum, so ties resolve to
# the earliest (and therefore smallest) r in r_values.
first_best_position = rmse_scores.index(min(rmse_scores))
best_r = r_values[first_best_position]

print("RMSE scores for different r values:", rmse_scores)
print(f"The best r value is: {best_r} with RMSE: {min(rmse_scores)}")

RMSE scores for different r values: [0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.35, 0.35]
The best r value is: 0 with RMSE: 0.34


Question 5
We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.
For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))
What's the value of std?

0.5
0.05
0.005
0.0005

In [51]:
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores_seeds = []

# Every per-seed object gets a seed_ prefix so this loop does not overwrite
# the notebook-wide df_train / df_val / y_train / y_val used by other cells.
for seed in seed_values:
    seed_shuffled = df.sample(frac=1, random_state=seed)
    seed_n = len(seed_shuffled)

    # Same 60/20/20 arithmetic as the main split. The test share is still
    # subtracted so the training size matches, but no test frame is built
    # because this experiment only scores on validation.
    seed_n_val = int(seed_n * 0.2)
    seed_n_test = int(seed_n * 0.2)
    seed_n_train = seed_n - seed_n_val - seed_n_test

    seed_train = seed_shuffled.iloc[:seed_n_train].reset_index(drop=True)
    seed_val = seed_shuffled.iloc[seed_n_train:seed_n_train + seed_n_val].reset_index(drop=True)

    # Missing total_bedrooms values are filled with 0 in both splits.
    seed_train["total_bedrooms"] = seed_train["total_bedrooms"].fillna(0)
    seed_val["total_bedrooms"] = seed_val["total_bedrooms"].fillna(0)

    # The target is log1p-transformed; the features are every other column.
    seed_y_train = np.log1p(seed_train['median_house_value'].values)
    seed_X_train = seed_train.drop(columns='median_house_value').values

    seed_y_val = np.log1p(seed_val['median_house_value'].values)
    seed_X_val = seed_val.drop(columns='median_house_value').values

    seed_w0, seed_w = train_linear_regression(seed_X_train, seed_y_train)

    seed_pred = seed_w0 + seed_X_val.dot(seed_w)
    rmse_scores_seeds.append(rmse(seed_y_val, seed_pred))

std_value = round(np.std(rmse_scores_seeds), 3)
print(f"Standard deviation of RMSE scores across different seeds: {std_value}")


Standard deviation of RMSE scores across different seeds: 0.005


Question 6
Split the dataset like previously, use seed 9.
Combine train and validation datasets.
Fill the missing values with 0 and train a model with r=0.001.
What's the RMSE on the test dataset?
Options:

0.13
0.23
0.33
0.43


In [53]:
# Re-create the split with seed 9.
df_shuffled = df.sample(frac=1, random_state=9)
n = len(df_shuffled)

# 20% validation, 20% test, remainder for training.
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

df_train = df_shuffled.iloc[:n_train].reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:n_train+n_val].reset_index(drop=True)
df_test = df_shuffled.iloc[n_train+n_val:].reset_index(drop=True)

# Replace the target with log1p(target) in every split.
df_train['median_house_value'] = np.log1p(df_train['median_house_value'])
df_val['median_house_value'] = np.log1p(df_val['median_house_value'])
df_test['median_house_value'] = np.log1p(df_test['median_house_value'])

# Train on train + validation together. Both frames were re-indexed from 0,
# so ignore_index=True is needed to give the combined frame unique row labels.
train_combined = pd.concat([df_train, df_val], ignore_index=True)

train_combined_filled = train_combined.copy()
train_combined_filled["total_bedrooms"] = train_combined_filled["total_bedrooms"].fillna(0)
X_train_combined = train_combined_filled.drop(columns='median_house_value').values
y_train_combined = train_combined_filled['median_house_value'].values

# The test split gets the same zero imputation.
test_filled = df_test.copy()
test_filled["total_bedrooms"] = test_filled["total_bedrooms"].fillna(0)
X_test = test_filled.drop(columns='median_house_value').values
y_test = test_filled['median_house_value'].values

w0, w = train_linear_regression_reg(X_train_combined, y_train_combined, r=0.001)

y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
print(f"RMSE on the test dataset: {round(score, 2)}")


RMSE on the test dataset: 0.33
