In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
housing = pd.read_csv('housing.txt')

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Exploratary Data Analysis

In [4]:
housing = housing.loc[(housing['ocean_proximity'] == '<1H OCEAN') | (housing['ocean_proximity'] == 'INLAND')].copy()

In [5]:
df_housing = housing.loc[:,['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']].reset_index(drop=True)
df_housing

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
1,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
2,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
3,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
4,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0
...,...,...,...,...,...,...,...,...,...
15682,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
15683,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
15684,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
15685,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [6]:
df_housing.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [7]:
df_housing.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,15687.0,15687.0,15687.0,15687.0,15530.0,15687.0,15687.0,15687.0,15687.0
mean,35.467307,-119.217442,27.188118,2665.677312,541.244688,1466.317205,500.916491,3.804019,191943.020017
std,2.066387,1.78038,12.057128,2257.672743,435.650018,1180.389908,392.759399,1.857158,108801.07762
min,32.61,-124.14,1.0,2.0,2.0,3.0,2.0,0.4999,14999.0
25%,33.94,-121.07,17.0,1441.0,295.0,802.0,278.0,2.5208,111300.0
50%,34.16,-118.37,27.0,2118.0,432.0,1195.0,406.0,3.4688,166900.0
75%,37.34,-117.99,36.0,3172.0,645.0,1777.0,602.0,4.6862,241100.0
max,41.95,-114.31,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [None]:
n = len(df_housing)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

idx = np.arange(n)

np.random.seed(42)
np.random.shuffle(idx)

df_housing_shuffled = df_housing.iloc[idx]
df_housing_shuffled

In [None]:
df_housing_train = df_housing_shuffled.iloc[:n_train].copy()
df_housing_val = df_housing_shuffled.iloc[n_train:n_val+n_train].copy()
df_housing_test = df_housing_shuffled.iloc[n_val+n_train:].copy()

In [None]:
y_train = np.log1p(df_housing_train.median_house_value.values)
y_val = np.log1p(df_housing_val.median_house_value.values)
print(y_val)
y_test = np.log1p(df_housing_test.median_house_value.values)

del df_housing_train['median_house_value']
del df_housing_val['median_house_value']
del df_housing_test['median_house_value']

In [10]:
def prepare(df):
    df = df.fillna(0)
    X = df.values
    return X

In [None]:
def lin_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX += reg
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [11]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [None]:
X_train = prepare(df_housing_train)
w0, w = lin_reg(X_train, y_train)
w0

In [None]:
y_pred = w0 + X_train.dot(w)

error = rmse(y_train, y_pred)
round(error,2)

In [None]:
plt.figure(figsize=(6, 4))

sns.histplot(y_train, label='target', color='#222222', alpha=0.6, bins=40)
sns.histplot(y_pred, label='prediction', color='#aaaaaa', alpha=0.8, bins=40)

plt.legend()

plt.ylabel('Frequency')
plt.xlabel('Log(Price + 1)')
plt.title('Predictions vs actual distribution')

plt.show()

In [None]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = lin_reg(X_train, y_train, r=r)
    y_pred = w0 + X_train.dot(w)
    error = rmse(y_train, y_pred)
    print('%5s, %.2f, %.5f' % (r, w0, error))

In [8]:
def lin_reg_shuffle(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [20]:
n = len(df_housing)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

idx = np.arange(n)
seed = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
error = []
for i in seed:
    np.random.seed(i)
    np.random.shuffle(idx)

    df_housing_shuffled = df_housing.iloc[idx]
    df_housing_shuffled
    
    df_housing_train = df_housing_shuffled.iloc[:n_train].copy()
    df_housing_val = df_housing_shuffled.iloc[n_train:n_val+n_train].copy()
    df_housing_test = df_housing_shuffled.iloc[n_val+n_train:].copy()
    y_train = np.log1p(df_housing_train.median_house_value.values)
    y_val = np.log1p(df_housing_val.median_house_value.values)
    y_test = np.log1p(df_housing_test.median_house_value.values)

    del df_housing_train['median_house_value']
    del df_housing_val['median_house_value']
    del df_housing_test['median_house_value']
    
    
    X_train = prepare(df_housing_train)
    X_val = prepare(df_housing_val)
    w0, w = lin_reg_shuffle(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    error.append(rmse(y_val, y_pred))
    print('%5s, %.2f, %.5f' % (i, w0, rmse(y_val, y_pred)))

std_error = np.std(error)
std_error = round(std_error,3)
print(std_error)

    0, -9.87, 0.33778
    1, -9.62, 0.34549
    2, -10.37, 0.33209
    3, -9.57, 0.33116
    4, -9.34, 0.33481
    5, -9.54, 0.33334
    6, -10.45, 0.33277
    7, -10.49, 0.33953
    8, -9.97, 0.33067
    9, -9.85, 0.34808
0.006


Since the standard deviation of the RMSE scores is low, it shows that our model is stable

In [26]:
n = len(df_housing)
n_train = int(0.8 * n)
n_test = n - n_train

idx = np.arange(n)

np.random.seed(9)
np.random.shuffle(idx)

df_housing_shuffled = df_housing.iloc[idx]
df_housing_shuffled

df_housing_train = df_housing_shuffled.iloc[:n_train].copy()
df_housing_test = df_housing_shuffled.iloc[n_train:].copy()


y_train = np.log1p(df_housing_train.median_house_value.values)
y_test = np.log1p(df_housing_test.median_house_value.values)

del df_housing_train['median_house_value']
del df_housing_test['median_house_value']

def prepare(df):
    df = df.fillna(0)
    X = df.values
    return X

def lin_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX += reg
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)


X_train = prepare(df_housing_train)
X_test = prepare(df_housing_test)
w0, w = lin_reg(X_train, y_train)

y_pred = w0 + X_test.dot(w)
error = rmse(y_test, y_pred)
round(error,2)

0.33