Load the data

wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

pandas_version = pd.__version__

print( 'Pandas version = ' + pandas_version + '\n' )

df = pd.read_csv('housing.csv')

df.head

Pandas version = 2.1.0



<bound method NDFrame.head of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  ho

 Keep only the records where ocean_proximity is either '<1H OCEAN' or 'INLAND'

In [2]:
filtered_df = df[ ( df['ocean_proximity'] == '<1H OCEAN' ) | ( df['ocean_proximity'] == 'INLAND' ) ]

Next, use only the following columns:

    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value'


In [3]:
features = [ 
            'latitude',
            'longitude',
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income',
            'median_house_value']
filtered_df = filtered_df[ features ]
filtered_df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
701,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
830,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
859,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
860,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
861,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0
...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


Question 1
There's one feature with missing values. What is it?

Answer:  total_bedrooms


In [4]:
filtered_df.isnull().sum()


latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

Question 2
What's the median (50% percentile) for variable 'population'?


Answer: 1195

In [5]:
filtered_df[ 'population' ].median()

1195.0

Helper function for preparing data


In [6]:
def prepare_X(df):
    df = df.copy()
    features = base.copy()

    filtered_df = df[ ( df['ocean_proximity'] == '<1H OCEAN' ) | ( df['ocean_proximity'] == 'INLAND' ) ]

    features = [ 
            'latitude',
            'longitude',
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income',
            'median_house_value'] 
    filtered_df = filtered_df[ features ]
    
    #df_num = df_num.fillna(0)
    X = filtered_df.values
    return X



Prepare and split the dataset

    Shuffle the dataset (the filtered one you created above), use seed 42.
    Split your data in train/val/test sets, with 60%/20%/20% distribution.
    Apply the log transformation to the median_house_value variable using the np.log1p() function.


In [7]:
np.random.seed(42)

n = len(filtered_df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = filtered_df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

# Split done 

y_train_orig = df_train.median_house_value.values
y_val_orig = df_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']




Question 3

    We need to deal with missing values for the column from Q1.
    We have two options: fill it with 0 or with the mean of this variable.
    Try both options. For each, train a linear regression model without regularization using the code from the lessons.
    For computing the mean, use the training only!


In [8]:
mean_total_bedrooms_train = df_train[ 'total_bedrooms' ].mean(skipna=True)
print( "mean_total_bedrooms_train = ",  mean_total_bedrooms_train)

filled_with_zero_df_train = df_train.copy()
filled_with_zero_df_train.fillna(0, inplace=True)
filled_with_zero_X_train = filled_with_zero_df_train.values

filled_with_mean_df_train = df_train.copy()
filled_with_mean_df_train.fillna( mean_total_bedrooms_train, inplace=True)
filled_with_mean_X_train = filled_with_mean_df_train.values

mean_total_bedrooms_train =  542.552956325786


Question 3 - continuation
Use the validation dataset to evaluate the models and compare the RMSE of each option.
    Round the RMSE scores to 2 decimal digits using round(score, 2)
    Which option gives better RMSE?

Answer: Both are equally good

In [9]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)


filled_with_zero_w_0, filled_with_zero_w = train_linear_regression(filled_with_zero_X_train, y_train)
filled_with_zero_y_pred = filled_with_zero_w_0 + filled_with_zero_X_train.dot(filled_with_zero_w)
filled_with_zero_rmse_val = round( rmse(y_train, filled_with_zero_y_pred), 2 )

print("Filled with zero rmse = ", filled_with_zero_rmse_val)

filled_with_mean_w_0, filled_with_mean_w = train_linear_regression(filled_with_mean_X_train, y_train)
filled_with_mean_y_pred = filled_with_mean_w_0 + filled_with_mean_X_train.dot(filled_with_mean_w)
rmse(y_train, filled_with_mean_y_pred)
filled_with_mean_rmse_val = round( rmse(y_train, filled_with_mean_y_pred), 2 )

print("Filled withwmean rmse = ", filled_with_mean_rmse_val)


Filled with zero rmse =  0.34
Filled withwmean rmse =  0.34


Question 4
    Now let's train a regularized linear regression.
    For this question, fill the NAs with 0.
    Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
    Use RMSE to evaluate the model on the validation dataset.
    Round the RMSE scores to 2 decimal digits.
    Which r gives the best RMSE?

Answer: 0. There are multiple options, smallest is zero.


In [10]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

X_train = filled_with_zero_X_train
filled_with_zero_df_val = df_val.copy()
filled_with_zero_df_val.fillna(0, inplace=True)
X_val = filled_with_zero_df_val.values 
#X_val = df_val.values

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%s => ' %r, round( rmse(y_val, y_pred), 2 ) )


0 =>  0.34
1e-06 =>  0.34
0.0001 =>  0.34
0.001 =>  0.34
0.01 =>  0.34
0.1 =>  0.34
1 =>  0.34
5 =>  0.35
10 =>  0.35


Question 5

    We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
    Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
    For each seed, do the train/validation/test split with 60%/20%/20% distribution.
    Fill the missing values with 0 and train a model without regularization.
    For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
    What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
    Round the result to 3 decimal digits (round(std, 3))

What's the value of std?

Answer: Standar Devviation is 0.005
    0 =>  0.3377387160043232
    1 =>  0.3377999353665378
    2 =>  0.3384287006753424
    3 =>  0.3320049468303904
    4 =>  0.3394451862555887
    5 =>  0.34338197052659875
    6 =>  0.3385330211769949
    7 =>  0.34687476972995646
    8 =>  0.35127368659627967
    9 =>  0.33415582665206545


In [11]:


def prepare_train_val_test_data( df, seed ):
    np.random.seed(seed)
    n = len( df )

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    # Split done 

    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    df_train.fillna(0, inplace=True)
    df_val.fillna(0, inplace=True)
    df_test.fillna(0, inplace=True)

    return df_train.values , df_val.values, df_test.values, y_train, y_val, y_test

rmse_score_list = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    X_train, X_val, X_test, y_train, y_val, y_test = prepare_train_val_test_data( filtered_df, seed )

    w_0, w = train_linear_regression(X_train, y_train,)
    y_pred = w_0 + X_val.dot(w)
    crt_rmse = rmse(y_val, y_pred)
    print('%s => ' %seed, crt_rmse )
    rmse_score_list.append( crt_rmse )

scores_array = np.array( rmse_score_list )
standard_deviation = np.std(  scores_array )
print( "Standard deviation of all rmse = ", round( standard_deviation, 3) )


0 =>  0.3377387160043232
1 =>  0.3377999353665378
2 =>  0.3384287006753424
3 =>  0.3320049468303904
4 =>  0.3394451862555887
5 =>  0.34338197052659875
6 =>  0.3385330211769949
7 =>  0.34687476972995646
8 =>  0.35127368659627967
9 =>  0.33415582665206545
Standard deviation of all rmse =  0.005


Question 6

    Split the dataset like previously, use seed 9.
    Combine train and validation datasets.
    Fill the missing values with 0 and train a model with r=0.001.
    What's the RMSE on the test dataset?

Answer: RMSE on test data %s =>  0.33498993366147445

In [12]:
def prepare_train_test_data( df, seed ):
    np.random.seed(seed)
    n = len( df )

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()

    frames = [ df_train, df_val ]
    df_combined = pd.concat(frames)
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    # Split done 

    y_combined_orig = df_combined.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_combined = np.log1p(df_combined.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_combined['median_house_value']
    del df_test['median_house_value']

    df_combined.fillna(0, inplace=True)
    df_test.fillna(0, inplace=True)

    return df_combined.values, df_test.values, y_combined, y_test

seed = 9
r = 0.001

X_combined, X_test, y_combined, y_test = prepare_train_test_data( filtered_df, seed )
#X_combined = np.concatenate( X_train, X_val )
#y_combined = np.concatenate( y_train, y_val )

w_0, w = train_linear_regression_reg(X_combined, y_combined, r=r)
y_pred = w_0 + X_test.dot(w)
print("RMSE on test data %s => ", rmse(y_test, y_pred) )


RMSE on test data %s =>  0.33498993366147445
