In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Download housing.csv from the alexeygrigorev/datasets GitHub repository into the working directory.
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [None]:
# Load the full CSV (all columns) and display the resulting frame.
df = pd.read_csv('housing.csv')
df

In [None]:
# Inspect the median_house_value column (used further down as the regression target).
df['median_house_value']

In [None]:
# Column dtypes of the full frame.
df.dtypes

In [None]:
# Display the full frame again.
df

In [None]:
# Numeric feature columns plus the target; only these are read from the CSV.
select_cols = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]
data = pd.read_csv('housing.csv', usecols=select_cols)

In [None]:
# Display the column-restricted frame.
data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Per-column non-null counts and dtypes for the selected columns.
data.info()

Solution for Question 1: the column with missing values is "total_bedrooms".

In [None]:
# Count missing values per column.
# NOTE(review): this inspects the full frame `df`, not the column-restricted `data` used by the rest of the analysis.
df.isnull().sum()

In [None]:
# Tally duplicated vs. non-duplicated rows (True = the row repeats an earlier one).
data.duplicated().value_counts()

In [None]:
# Summary statistics with extra percentiles; transposed so each column of `data` becomes a row.
data.describe(percentiles=[.0, .25, .5, .75, .9, .95, .99, .1]).T

In [None]:
# Median of the population column.
np.median(data['population'])

In [None]:
# Distribution of the raw target, 50 bins.
sns.histplot(data.median_house_value, bins=50)

In [None]:
# Same distribution after log1p, i.e. log(1 + x); bin count is left to seaborn's default.
sns.histplot(np.log1p(data.median_house_value))

In [None]:
# Number of rows in the working frame.
n = len(data)
n

In [None]:
# 60/20/20 split sizes: validation and test each take int(20%) of the rows,
# and the training split absorbs whatever remains.
n = len(data)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

In [None]:
# Positional row indices 0..n-1, used as the base ordering for shuffling.
idx = np.arange(n)
idx

In [None]:

# np.random.shuffle permutes its argument in place, so start from a fresh
# 0..n-1 index array here; otherwise re-running this cell would shuffle an
# already-shuffled array and silently produce a different split.
idx = np.arange(n)
np.random.seed(42)  # fixed seed -> reproducible permutation
np.random.shuffle(idx)
df_shuffled = data.iloc[idx]
df_shuffled

In [None]:
# Slice the shuffled frame into train / validation / test by position, copy each
# slice so it is independent of df_shuffled, and renumber rows from 0.
df_train = df_shuffled.iloc[:n_train].copy().reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy().reset_index(drop=True)
df_test = df_shuffled.iloc[n_train + n_val:].copy().reset_index(drop=True)

In [None]:
# Raw target arrays for each split ...
y_train_org, y_val_org, y_test_org = (
    frame.median_house_value.values for frame in (df_train, df_val, df_test)
)

# ... and their log1p-transformed counterparts, which the models are fit on.
y_train, y_val, y_test = map(np.log1p, (y_train_org, y_val_org, y_test_org))

In [None]:
# Remove the target column from every split so it cannot be used as a feature.
for _frame in (df_train, df_val, df_test):
    del _frame['median_house_value']

In [None]:
# (sic) Feature columns fed to the model, in the order they appear in the matrix.
colums = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]


def prepare_X(df, fill_value):
    """Build the feature matrix for a split.

    Selects the columns listed in ``colums`` from ``df``, replaces every
    missing value with ``fill_value`` and returns the result as a NumPy
    array. ``df`` itself is not modified.
    """
    return df[colums].fillna(fill_value).values

In [None]:
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    A column of ones is prepended to ``X`` so the first learned weight acts
    as the intercept. Returns ``(bias, weights)`` where ``weights`` holds one
    coefficient per column of ``X``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T @ design
    # w = (X^T X)^-1 X^T y
    solution = np.linalg.inv(gram) @ design.T @ y

    bias, coefficients = solution[0], solution[1:]
    return bias, coefficients

In [None]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets ``y`` and predictions ``y_pred``."""
    residual = y_pred - y
    return np.sqrt((residual * residual).mean())

In [None]:
# Mean-imputation variant: the fill value is computed from the training split only.
mean = df_train.total_bedrooms.mean()

X_mean_train = prepare_X(df_train, fill_value=mean)
w_0_mean, w_mean = train_linear_regression(X_mean_train, y_train)

In [None]:
# Fill the validation split with the training-set mean and predict (predictions are on the log1p scale).
X_mean_val = prepare_X(df_val, fill_value=mean)
y_mean_pred_val = w_0_mean + X_mean_val.dot(w_mean)

In [None]:
# Validation RMSE for mean imputation, rounded to 2 decimals.
np.round(rmse(y_val, y_mean_pred_val),2)

In [None]:
# Zero-imputation variant: fill missing values with 0 and refit on the training split.
X_null_train = prepare_X(df_train, fill_value=0)
w_0_null, w_null = train_linear_regression(X_null_train, y_train)

In [None]:
# Zero-fill the validation split the same way and predict (predictions are on the log1p scale).
X_null_val = prepare_X(df_val, fill_value=0)
y_null_pred_val = w_0_null + X_null_val.dot(w_null)

In [None]:
# Validation RMSE for zero imputation, rounded to 2 decimals.
np.round(rmse(y_val, y_null_pred_val),2)

Solution for Question 3: Both options (filling with 0 and filling with the mean) are equally good.

In [None]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit linear regression via the normal equation with ridge-style regularization.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of ``X^T X`` (the intercept's entry
    included) before inversion. With ``r=0.0`` no regularization is applied.
    Returns ``(bias, weights)``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T @ design
    gram = gram + r * np.eye(gram.shape[0])

    # w = (X^T X + r I)^-1 X^T y
    solution = np.linalg.inv(gram) @ design.T @ y

    bias, coefficients = solution[0], solution[1:]
    return bias, coefficients

In [None]:
# Sweep the regularization strength r on the zero-filled features and print, for each
# value, r, the fitted bias term and the validation RMSE (rounded to 2 decimals).
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_null_train, y_train, r=r)
    y_null_reg_val = w_0 + X_null_val.dot(w)
    rmse_val = np.round(rmse(y_val, y_null_reg_val),2)
    print(r, w_0, rmse_val)
     

In [None]:
# Repeat the shuffle / split / fit / validate cycle for ten different seeds to
# see how much the validation RMSE depends on the particular split.
rmse_list = []

for r in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    # Fresh index array each time: np.random.shuffle permutes in place.
    idx = np.arange(n)
    np.random.seed(r)
    np.random.shuffle(idx)

    df_shuffled = data.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    # Unregularized model on zero-filled features.
    X_null_train = prepare_X(df_train, fill_value=0)
    w_0, w = train_linear_regression(X_null_train, y_train)

    X_null_val = prepare_X(df_val, fill_value=0)
    y_null_reg_val = w_0 + X_null_val.dot(w)

    # Store the score at full precision: rounding to 2 decimals before taking
    # the standard deviation would quantize the values and distort the spread.
    rmse_val = rmse(y_val, y_null_reg_val)
    rmse_list.append(rmse_val)

    # Round only for display.
    print(r, w_0, np.round(rmse_val, 2))

In [None]:
# Validation RMSE collected for each seed.
rmse_list

In [None]:
# Spread of the per-seed validation RMSEs (np.std with its default settings), rounded to 3 decimals.
np.round(np.std(rmse_list),3)

In [None]:
# Final evaluation split: shuffle with seed 9, then merge the training and
# validation parts into a single training set and keep the test part held out.
r = 9

np.random.seed(r)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = data.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy().reset_index(drop=True)

frames = [df_train, df_val]
df_train_val = pd.concat(frames).reset_index(drop=True)

# pop() returns the target column and removes it from the frame in one step.
y_train_val_orig = df_train_val.pop('median_house_value').values
y_test_orig = df_test.pop('median_house_value').values

# Models are trained and scored on the log1p-transformed target.
y_train_val = np.log1p(y_train_val_orig)
y_test = np.log1p(y_test_orig)

In [None]:
# Fit the regularized model (r=0.001) on the combined train+validation data with
# missing values filled with 0.
X_null_train_val = prepare_X(df_train_val, fill_value=0)
w_0_train_val, w_train_val = train_linear_regression_reg(X_null_train_val, y_train_val, r=0.001)

# Prepare the test split the same way and predict on it.
X_null_test = prepare_X(df_test, fill_value=0)
y_null_pred_test = w_0_train_val + X_null_test.dot(w_train_val)

# Test RMSE (on the log1p scale), rounded to 2 decimals.
np.round(rmse(y_test, y_null_pred_test),2)