### Import dependencies and check the versions

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Report the installed pandas and numpy versions, one per line, so the
# environment that produced the outputs below is on record.
for library in (pd, np):
    print(library.__version__)

2.1.0
1.25.2


### Getting the data

In [3]:
!mkdir data
!wget -P data/ https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

mkdir: data: File exists
--2023-09-14 14:57:35--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘data/housing.csv.1’


2023-09-14 14:57:36 (5.17 MB/s) - ‘data/housing.csv.1’ saved [1423529/1423529]



### Exploring the dataset

In [4]:
# Read the downloaded CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/housing.csv")

df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Show every row whose total_bedrooms value is missing (NaN).
df.loc[df["total_bedrooms"].isna()]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
290,-122.16,37.77,47.0,1256.0,,570.0,218.0,4.3750,161900.0,NEAR BAY
341,-122.17,37.75,38.0,992.0,,732.0,259.0,1.6196,85100.0,NEAR BAY
538,-122.28,37.78,29.0,5154.0,,3741.0,1273.0,2.5762,173400.0,NEAR BAY
563,-122.24,37.75,45.0,891.0,,384.0,146.0,4.9489,247100.0,NEAR BAY
696,-122.10,37.69,41.0,746.0,,387.0,161.0,3.9063,178400.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20267,-119.19,34.20,18.0,3620.0,,3171.0,779.0,3.3409,220500.0,NEAR OCEAN
20268,-119.18,34.19,19.0,2393.0,,1938.0,762.0,1.6953,167400.0,NEAR OCEAN
20372,-118.88,34.17,15.0,4260.0,,1701.0,669.0,5.1033,410700.0,<1H OCEAN
20460,-118.75,34.29,17.0,5512.0,,2734.0,814.0,6.6073,258100.0,<1H OCEAN


In [6]:
# List the distinct ocean_proximity categories in order of first appearance.
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [7]:
# Average median_house_value over the districts labelled 'NEAR BAY'.
# A single .loc[row_mask, column] lookup replaces the chained df[...][...] form.
df.loc[df["ocean_proximity"] == "NEAR BAY", "median_house_value"].mean()

259212.31179039303

In [8]:
# Mean of total_bedrooms over the non-missing entries (skipna=True is the
# pandas default, spelled out here); the next cell uses it as the fill value.
avg_total_bedrooms = df.total_bedrooms.mean(skipna=True)

avg_total_bedrooms

537.8705525375618

In [9]:
# Mean imputation: overwrite only the missing total_bedrooms entries with the
# column mean computed in the previous cell.
df.loc[df["total_bedrooms"].isna(), "total_bedrooms"] = avg_total_bedrooms

In [10]:
# Sanity check: filling the gaps with the mean should leave the column mean
# unchanged up to floating-point rounding.
df.total_bedrooms.mean()

537.8705525375617

In [11]:
# Fit a linear model on the ISLAND districts with the normal equation
#     w = pinv(X^T X) . X^T . y
# and display the weight of the last feature column (total_bedrooms).

# Feature matrix: three numeric columns of the ISLAND rows.
island_options = df.loc[df['ocean_proximity'] == 'ISLAND']
selected_columns = island_options.loc[:, ['housing_median_age', 'total_rooms', 'total_bedrooms']]
X = selected_columns.to_numpy()

# Hard-coded target vector; its length (5) must equal the number of rows in X.
y = np.array([950, 1300, 800, 1000, 1300])

# Gram matrix and its pseudo-inverse (pinv also copes with a singular X^T X).
XTX = X.T.dot(X)
XTX_inv = np.linalg.pinv(XTX)

# Same association order as (XTX_inv . X^T) . y.
w = XTX_inv.dot(X.T).dot(y)

last_element_of_w = w[-1]

last_element_of_w

5.699229455065618