In [11]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("housing.csv")

In [4]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


#### Question 1

In [7]:
len(df) - df.total_bedrooms.count()

207

#### Question 2

In [9]:
df.population.median()

1166.0

In [12]:
n = len(df)

# Validation dataset
n_val = int(n * 0.2)

# Test dataset 
n_test = int(n * 0.2)

# Train dataset
n_train = n - n_val - n_test

df_train = df.iloc[n_train:]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

idx = np.arange(n)

np.random.seed(42)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Transformation of the y values
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train["median_house_value"]
del df_val["median_house_value"]
del df_test["median_house_value"]

In [13]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [17]:
print(df_train.shape, y_train.shape)
print(df_val.shape, y_val.shape)
print(df_test.shape, y_test.shape)

(12384, 9) (12384,)
(4128, 9) (4128,)
(4128, 9) (4128,)


In [19]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [22]:
X_train.shape

(12384,)

In [25]:
X_train = df_train["total_bedrooms"].fillna(0).values

w0, w = train_linear_regression(X_train, y_train)

y_pred = w0 + X_train.dot(w[0])

12.033415752214037 [0.00010218]


In [31]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [30]:
# This is the same code as the steps taken in 2.8, but applied to X_train and X_val. 
def prepare_X(df):
    df_num = df["total_bedrooms"]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [33]:
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w[0])
rmse(y_val, y_pred)

0.5713144443358035

#### Question 3

#### Question 4

#### Question 5

In [49]:
rmse_array = []

for seed in range(10):
    n = len(df)

    # Validation dataset
    n_val = int(n * 0.2)

    # Test dataset 
    n_test = int(n * 0.2)

    # Train dataset
    n_train = n - n_val - n_test

    df_train = df.iloc[n_train:]
    df_val = df.iloc[n_train:n_train+n_val]
    df_test = df.iloc[n_train+n_val:]

    idx = np.arange(n)

    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # Transformation of the y values
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train["median_house_value"]
    del df_val["median_house_value"]
    del df_test["median_house_value"]
    
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w[0])
    rmse_array.append(rmse(y_val, y_pred))

In [50]:
round(np.std(np.array(rmse_array)), 3)

0.002

#### Question 6

In [36]:
n = len(df)

# Test dataset 
n_test = int(n * 0.2)

# Train dataset
n_train = n - n_test

idx = np.arange(n)

np.random.seed(9)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_test = df.iloc[idx[n_train:]]

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Transformation of the y values
y_train = np.log1p(df_train.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train["median_house_value"]
del df_test["median_house_value"]

X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w[0])
rmse(y_val, y_pred)

0.5756370934169434