In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Load the housing dataset from the working directory and preview it.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column, which
# suggests the CSV was written with its index; consider index_col=0 — confirm
# against the file before changing.
housing = pd.read_csv(filepath_or_buffer='housing.csv')
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Split the data into training and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=213,
)

In [4]:
# Separate the training predictors from the target column.
train_X = train_set.drop(columns="median_house_value")
# Copy the target so it is independent of train_set.
train_y = train_set.loc[:, "median_house_value"].copy()

In [13]:
# Separate the test predictors from the target column.
test_X = test_set.drop(columns="median_house_value")
# Copy the target so it is independent of test_set.
test_y = test_set.loc[:, "median_house_value"].copy()

## Preprocessing

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Columns are processed in two groups: the numeric features and the single
# categorical feature. Columns not listed here are not passed to either
# pipeline.
num_attributes = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
cat_attributes = ['ocean_proximity']

# Numeric columns: fill missing values with the median, then standardize.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Categorical column: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' so unseen categories do not
# raise at transform time).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attributes),
        ('cat', cat_pipeline, cat_attributes),
    ]
)

# Fit the transformers on the training predictors only, then reuse the
# fitted transformers on the test predictors.
X_prepared = preprocessing.fit_transform(train_X)
test_X_prepared = preprocessing.transform(test_X)


## Train a linear regression model

In [7]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression with default settings, fitted on the
# preprocessed training features.
lin_reg = LinearRegression()
lin_reg.fit(X=X_prepared, y=train_y)

In [8]:
# Display the coefficients learned by the fitted model.
# NOTE(review): in the recorded output the last five coefficients are all
# about -1.39e16 and the intercept is about +1.39e16. This looks like perfect
# collinearity between the one-hot columns and the intercept — TODO confirm,
# and consider dropping one category or fitting without an intercept.
lin_reg.coef_

array([-5.11812346e+04, -5.19085439e+04,  1.31075545e+04, -9.97600870e+03,
        2.80824106e+04, -4.35432893e+04,  3.04533301e+04,  7.35752596e+04,
       -1.39018437e+16, -1.39018437e+16, -1.39018437e+16, -1.39018437e+16,
       -1.39018437e+16])

In [9]:
# Display the intercept learned by the fitted model.
# NOTE(review): the recorded value (~1.39e16) is far outside the scale of the
# target values shown in the data preview; presumably it offsets equally large
# coefficients on the one-hot columns — TODO confirm.
lin_reg.intercept_

1.3901843744914576e+16

## Training RMSE

In [12]:
from sklearn.metrics import mean_squared_error

# Predict on the same preprocessed features the model was trained on.
train_fitted = lin_reg.predict(X_prepared)

# Mean squared error between the true training targets and the predictions.
train_mse = mean_squared_error(train_y, train_fitted)

# Take the square root so the error is in the same units as the target.
train_rmse = np.sqrt(train_mse)
# Fix: removed the stray trailing character after this call, which made the
# cell a SyntaxError when re-run.
print(train_rmse)


68747.93815294572


## Test RMSE

In [16]:
# Score the fitted model on the held-out test features.
test_fitted = lin_reg.predict(test_X_prepared)

# Mean squared error against the true test targets, then its square root so
# the result is in the same units as the target.
test_mse = mean_squared_error(y_true=test_y, y_pred=test_fitted)
test_rmse = np.sqrt(test_mse)

test_rmse

68592.35568459977