# ML Zoomcamp 2023, Homework 2 (linear regression)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
data = data[(data['ocean_proximity'] == 'INLAND') | (data['ocean_proximity'] == '<1H OCEAN')]
data = data[['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income',
'median_house_value']]

In [3]:
data.count() 

latitude              15687
longitude             15687
housing_median_age    15687
total_rooms           15687
total_bedrooms        15530
population            15687
households            15687
median_income         15687
median_house_value    15687
dtype: int64

Hence, total_bedrooms has missing values. 

In [4]:
data['population'].median()

1195.0

The median of the population column is 1195. 

In [5]:
np.random.seed(42)

n = len(data)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = data.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [6]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [7]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [8]:
bdrmean = df_train.total_bedrooms.mean()
train_bdr_0 = df_train.copy()
train_bdr_0.total_bedrooms = df_train['total_bedrooms'].fillna(0)
test_bdr_0 = df_test.copy()
test_bdr_0.total_bedrooms = df_test['total_bedrooms'].fillna(0)
train_bdr_mean = df_train.copy()
train_bdr_mean.total_bedrooms = df_train['total_bedrooms'].fillna(bdrmean)
test_bdr_mean = df_test.copy()
test_bdr_mean.total_bedrooms = df_test['total_bedrooms'].fillna(bdrmean)


In [9]:
w_0, w = train_linear_regression_reg(train_bdr_0, y_train)
y_pred_0 = w_0 + train_bdr_0.dot(w)
v_0, v = train_linear_regression_reg(train_bdr_mean, y_train)
y_pred_mean = v_0 + train_bdr_mean.dot(v)
rmse(y_train, y_pred_0),rmse(y_train, y_pred_mean)

(0.3402719715829537, 0.3400300822264263)

In [10]:
y_pred0 = w_0 + df_val.dot(w)
y_predmean = v_0 + df_val.dot(v)
rmse(y_val, y_pred0), rmse(y_val, y_predmean)

(0.3400286008487107, 0.3399257629290944)

Both methods of filling nulls give approximately the same result. 

In [11]:
df_train.total_bedrooms = df_train.total_bedrooms.fillna(0)
df_val.total_bedrooms = df_val.total_bedrooms.fillna(0)
df_test.total_bedrooms = df_test.total_bedrooms.fillna(0)

In [12]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(df_train, y_train, r=r)
    y_pred = w_0 + df_val.dot(w)
    print(np.round(rmse(y_val, y_pred),2))

0.34
0.34
0.34
0.34
0.34
0.34
0.34
0.35
0.35


The smallest r which give the lowest value of RMSE (0.34) is 0. 

In [13]:
data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
data = data[(data['ocean_proximity'] == 'INLAND') | (data['ocean_proximity'] == '<1H OCEAN')]
data = data[['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income',
'median_house_value']]
data.total_bedrooms = data.total_bedrooms.fillna(0)

a = []
for i in range(10):
    np.random.seed(i)
    n = len(data)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)
    idx = np.arange(n)

    np.random.shuffle(idx)

    df_shuffled = data.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    w_0, w = train_linear_regression_reg(df_train, y_train, r=0)
    y_pred = w_0 + df_val.dot(w)
    a.append(np.round(rmse(y_val, y_pred),3))

In [14]:
round(np.std(a),3)

0.005

The standard deviation of all RMSE's obtained with different random seeds is 0.005. 

In [15]:
np.random.seed(9)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = data.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

X = pd.concat([df_train,df_val])
y = np.concatenate([y_train,y_val])
    
w_0, w = train_linear_regression_reg(X, y, r=0.001)
y_pred = w_0 + df_test.dot(w)
np.round(rmse(y_test, y_pred),2)

0.33

RMSE on the test dataset with random seed = 9 and r = 0.001 is 0.33. 