In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the housing CSV; the absolute path points at a Kaggle input directory,
# so it has to be adjusted when running anywhere else.
df = pd.read_csv("/kaggle/input/housingdata/housing.csv")
# Preview the first rows.
df.head()

In [None]:
# Column names of the raw frame.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Keep only the rows whose ocean_proximity is 'INLAND' or '<1H OCEAN'.
# isin() performs the membership test in one call, so no parenthesised
# == / | chain is needed.
df = df[df['ocean_proximity'].isin(['INLAND', '<1H OCEAN'])]
df

In [None]:
# Working frame: everything from df except the ocean_proximity column.
# drop() returns a new frame, so df itself is left untouched.
df_use = df.drop(columns=['ocean_proximity'])

In [None]:
# Count the missing values in each column.
df_use.isnull().sum()
# total_bedrooms has 157 null values

In [None]:
# Median of the population column.
np.median(df_use['population'])
# Recorded result: 1195

In [None]:
# Plot median_house_value
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Histogram of the raw target (median_house_value) with 50 bins.
sns.histplot(df_use.median_house_value, bins=50)

In [None]:
# log1p computes log(1 + x); apply it to the target before plotting it again.
median_price_logs = np.log1p(df_use.median_house_value)
median_price_logs

In [None]:
# Histogram of the log-transformed target.
sns.histplot(median_price_logs)
# Author's reading of the plot: the long tail is gone and the distribution
# looks roughly normal.

In [None]:
# Row counts for the train / validation / test split of df_use:
# validation and test each take int(20%) of the rows, train takes the rest.
n = len(df_use)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)
print(n, n_train, n_val, n_test)

In [None]:
# Row positions 0..n-1 of df_use; this array is shuffled later to randomise the split.
idx = np.arange(n)
idx

In [None]:
# Now we shuffle
# Seed NumPy's global random generator so the shuffle in the next cell is repeatable.
np.random.seed(42)

In [None]:
# Shuffle the positions in place. Re-running this cell on its own shuffles the
# already-shuffled array again, so re-run the arange and seed cells first to
# reproduce the same order.
np.random.shuffle(idx)
idx

In [None]:
# Cut the shuffled positions into three consecutive chunks
# (train | validation | test) and pick the matching rows for each.
df_train, df_val, df_test = (
    df_use.iloc[chunk] for chunk in np.split(idx, [n_train, n_train + n_val])
)
len(df_train), len(df_val), len(df_test)

In [None]:
# Mean of total_bedrooms computed on the training rows only; used later as the
# fill value for missing entries.
total_bedrooms_mean = df_train['total_bedrooms'].mean()
total_bedrooms_mean
# Recorded result: 542.553

In [None]:
# Selecting rows by shuffled position leaves the row labels in shuffled order;
# replace them with a fresh 0..len-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Targets: log1p of median_house_value for each split.
y_train = np.log1p(df_train['median_house_value'].to_numpy())
y_val = np.log1p(df_val['median_house_value'].to_numpy())
y_test = np.log1p(df_test['median_house_value'].to_numpy())

# Remove the target from the feature frames so it cannot be used as an input.
df_train = df_train.drop(columns='median_house_value')
df_val = df_val.drop(columns='median_house_value')
df_test = df_test.drop(columns='median_house_value')

In [None]:
# # Now we define our Y - median_home_value
# y_train = np.log1p(df_train.median_house_value.values)
# y_val = np.log1p(df_val.median_house_value.values)
# y_test = np.log1p(df_test.median_house_value.values)


In [None]:
# del df_train['median_house_value']
# del df_val['median_house_value']
# del df_test['median_house_value']

In [None]:
# Inspect one training row (position 2).
df_train.iloc[2]

In [None]:
# Shape of the training frame and its per-column missing-value counts.
df_train.shape, df_train.isnull().sum()

In [None]:
# Feature list: every column left in df_train. prepare_X reads this global.
base = list(df_train.columns.values)
base

In [None]:
# Training feature matrix with missing values replaced by the training-set mean
# of total_bedrooms. The fill value comes from total_bedrooms_mean rather than a
# pasted literal, so it stays correct if the data, seed or split changes.
# Note: fillna applies the value to NaNs in every column of `base`.
df_train_mean = df_train[base].fillna(total_bedrooms_mean)
X_train_mean = df_train_mean.values
X_train_mean.shape, y_train.shape

In [None]:
# Training feature matrix with every missing value replaced by 0.
df_train_zero = df_train[base].fillna(0)
X_train_zero = df_train_zero.to_numpy()

In [None]:
def train_linear_regression(X, y):
    """Fit ordinary least squares with the normal equation.

    A column of ones is prepended to X so the first fitted weight acts as the
    bias term.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets, one entry per row of X.

    Returns
    -------
    (w0, w) : the bias weight and the array of per-feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # w = (D^T D)^-1 D^T y
    gram = np.dot(design.T, design)
    gram_inv = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    bias, coefficients = weights[0], weights[1:]
    return bias, coefficients

In [None]:
# Fit on the mean-imputed training matrix and show the bias and weights.
mean_w0, mean_w = train_linear_regression(X_train_mean, y_train)
mean_w0, mean_w

In [None]:
# Fit on the zero-imputed training matrix and show the bias and weights.
zero_w0, zero_w = train_linear_regression(X_train_zero, y_train)
zero_w0, zero_w

In [None]:
# Predictions on the training rows from the mean-imputed model: bias + X·w.
y_pred_mean = mean_w0 + X_train_mean.dot(mean_w)
y_pred_mean

In [None]:
# Predictions on the training rows from the zero-imputed model: bias + X·w.
y_pred_zero = zero_w0 + X_train_zero.dot(zero_w)
y_pred_zero

In [None]:
# Plot the mean-imputed model's training predictions (red) together with the
# training targets (green).
sns.histplot(y_pred_mean, color="red", alpha=0.5, bins=50)
sns.histplot(y_train, color="green", alpha=0.5, bins=50)

In [None]:
# Plot the zero-imputed model's training predictions (red) together with the
# training targets (green).
sns.histplot(y_pred_zero, color="red", alpha=0.5, bins=50)
sns.histplot(y_train, color="green", alpha=0.5, bins=50)

In [None]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets `y` and predictions `y_pred`."""
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [None]:
# Training RMSE of the model fitted with mean-imputed total_bedrooms.
rmse(y_train, y_pred_mean)

In [None]:
# Training RMSE of the model fitted with zero-imputed total_bedrooms.
rmse(y_train, y_pred_zero)

In [None]:
def prepare_X(df, fill):
    """Build a feature matrix from `df`.

    Selects the columns listed in the module-level `base`, replaces every
    missing value with `fill`, and returns the result as a NumPy array.
    The input frame is not modified.
    """
    filled = df[base].fillna(fill)
    return filled.values
    

In [None]:
# Rebuild the mean-imputed training matrix through prepare_X and display it.
X_train_mean = prepare_X(df_train, total_bedrooms_mean)
X_train_mean

In [None]:
# Mean imputation: build train and validation matrices with the same fill value
# (the training mean of total_bedrooms), fit on train, score on validation.
X_train_mean = prepare_X(df_train, total_bedrooms_mean)
X_val_mean = prepare_X(df_val, total_bedrooms_mean)
mean_w0, mean_w = train_linear_regression(X_train_mean, y_train)
y_pred = X_val_mean.dot(mean_w) + mean_w0
# Validation RMSE rounded to two decimals.
mean_score = round(rmse(y_val, y_pred), 2)
mean_score

In [None]:
# Zero imputation: build train and validation matrices with missing values set
# to 0, fit on train, score on validation.
X_train_zero = prepare_X(df_train, 0)
X_val_zero = prepare_X(df_val, 0)
zero_w0, zero_w = train_linear_regression(X_train_zero, y_train)
y_pred = X_val_zero.dot(zero_w) + zero_w0
# Validation RMSE rounded to two decimals.
zero_score = round(rmse(y_val, y_pred), 2)
zero_score
# Both are equally good.

In [None]:
def train_linear_regression_reg(X, y, r):
    """Fit linear regression with a ridge-style penalty via the normal equation.

    A column of ones is prepended to X for the bias term, and `r` is added to
    every diagonal entry of X^T X (including the bias entry) before inversion.
    The value of `r` is printed on each call.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets, one entry per row of X.
    r : regularization strength added to the diagonal.

    Returns
    -------
    (w0, w) : the bias weight and the array of per-feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])
    print("r values = ", r)

    # w = (D^T D + r I)^-1 D^T y
    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])
    gram_inv = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    bias, coefficients = weights[0], weights[1:]
    return bias, coefficients

In [None]:
# Train regularized regression with NA values = 0
# The feature matrices do not depend on r, so they are built once, outside the loop.
X_train = prepare_X(df_train, 0)
X_val = prepare_X(df_val, 0)

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r)
    # Score each r on the validation split.
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    print('R value = ', r, 'W0 =', w0, 'score = ', round(score, 2))

# smallest RMSE comes from r = 0

In [None]:
# Reset idx to the unshuffled row positions 0..n-1.
idx = np.arange(n)
# Define a function that takes the DF and a seed and returns the train, val and
# test feature frames together with their y arrays.

def prepare_df(df, seed):
    """Shuffle `df` with `seed` and split it into train / validation / test parts.

    The row positions and split sizes are rebuilt from `df` on every call, so
    the result depends only on `df` and `seed` — not on how many times the
    function (or any other shuffle) has run before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `median_house_value` column. It is not modified.
    seed : int
        Passed to np.random.seed, which reseeds NumPy's global random generator.

    Returns
    -------
    tuple
        (df_train, df_val, df_test, y_train, y_val, y_test). The frames have a
        fresh 0..len-1 index and no `median_house_value` column; the y arrays
        hold log1p(median_house_value) for the matching rows. Validation and
        test each get int(20%) of the rows, train gets the remainder.
    """
    n_rows = len(df)
    n_val = int(n_rows * .2)
    n_test = int(n_rows * .2)
    n_train = n_rows - n_val - n_test

    # Start from unshuffled positions each time; shuffling a shared array in
    # place would make the split depend on previous calls.
    idx = np.arange(n_rows)
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

    # Targets are the log1p-transformed house values.
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    # Drop the target so it cannot be used as a feature.
    df_train = df_train.drop(columns='median_house_value')
    df_val = df_val.drop(columns='median_house_value')
    df_test = df_test.drop(columns='median_house_value')

    return df_train, df_val, df_test, y_train, y_val, y_test

In [None]:
# Re-split df_use with seed 0; this overwrites the earlier split variables.
df_train, df_val, df_test, y_train, y_val, y_test = prepare_df(df_use, 0)

In [None]:
# For each of ten seeds: re-split the data, fit the unregularized model on the
# zero-imputed training matrix, and record the validation RMSE.
seed_array = list(range(10))
scores = []

for seed in seed_array:
    df_train, df_val, df_test, y_train, y_val, y_test = prepare_df(df_use, seed)
    X_train = prepare_X(df_train, 0)
    X_val = prepare_X(df_val, 0)
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)
    scores.append(score)
    print('seed value = ', seed, 'W0 =', w0, 'score = ', score)
    

In [None]:
# Spread of the validation RMSE across the seeds.
np.std(scores)
# Recorded result: standard deviation of all scores = 0.005, which the author
# reads as the model being stable across splits.

In [None]:
# Split once more with seed 9.
df_train, df_val, df_test, y_train, y_val, y_test = prepare_df(df_use, 9)

# Merge train and validation into a single training set; ignore_index gives the
# stacked frame a fresh 0..len-1 index.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = np.concatenate([y_train, y_val])


In [None]:
# Final model: zero-impute, fit on train+validation with r = 0.001, and report
# the RMSE on the held-out test split.
X_full_train = prepare_X(df_full_train, 0)
X_test = prepare_X(df_test, 0)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, 0.001)
y_pred = X_test.dot(w) + w0
score = rmse(y_test, y_pred)

print('score = ', score)