In [1]:
import numpy as np
import pandas as pd

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them predict the expected sale value of properties in their state, and you will get a slice of their additional sales commission 💸

# Data Understanding

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset and show the description shipped with it.
data = fetch_california_housing()
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
# Features go into a DataFrame labelled with the dataset's feature names;
# the target is taken as-is from the loaded dataset.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target
X.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Data preparation

### Split your X data into train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Show the shape of every split, one per line, in train-then-test order.
print(
    *(split.shape for split in (X_train, y_train, X_test, y_test)),
    sep="\n",
)

(16512, 8)
(16512,)
(4128, 8)
(4128,)


### Split your train data into train and validation datasets

In [6]:
from sklearn.model_selection import train_test_split

# Carve a validation set (20%) out of the current training data.
# NOTE: X_train / y_train are rebound to the smaller remainder, so re-running
# this cell on its own splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=55,
)

### Scale the 3 datasets using StandardScaler

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply that
# same fitted transform to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)

X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# 

# Modelling and Model Evaluation

### Train a linear regression model

In [10]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit returns the estimator).
lin_reg = LinearRegression().fit(X_train, y_train)

# Score on the same training data; shown as the cell's output.
lin_reg.score(X_train, y_train)

0.6069191925875926

### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [11]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Predictions of the linear regression model on the validation split.
y_hat = lin_reg.predict(X_val)

# Report each metric against the validation targets, one line per metric.
for label, metric in (
    ("R squared score =", r2_score),
    ("Mean Squared Error =", mean_squared_error),
    ("Mean Absolute Error =", mean_absolute_error),
):
    print(label, metric(y_val, y_hat))

R squared score = 0.6335358524333233
Mean Squared Error = 0.4978293787647968
Mean Absolute Error = 0.5211629400965607


### Train a LASSO model

In [12]:
from sklearn.linear_model import Lasso

# Lasso with alpha = 0.1, fitted on the training split.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Score on the same training data; shown as the cell's output.
lasso.score(X_train, y_train)

0.5193591917487457

### Measure the R-squared, MSE and MAE of your model

In [13]:
# Predictions of the Lasso model on the validation split.
lasso_y_hat = lasso.predict(X_val)

# Report each metric against the validation targets, one line per metric.
for label, metric in (
    ("R squared score =", r2_score),
    ("Mean Squared Error =", mean_squared_error),
    ("Mean Absolute Error =", mean_absolute_error),
):
    print(label, metric(y_val, lasso_y_hat))

R squared score = 0.5193591917487457
Mean Squared Error = 0.6529345819216771
Mean Absolute Error = 0.6107760766392206


# Interpret your winning model

### What can you tell your business partner by looking at the coefficients?

In [14]:
# lin_reg was fitted on StandardScaler-transformed features, so each
# coefficient is expressed per one standard deviation of its feature.
print("Intercept =", lin_reg.intercept_)

# Pair every fitted coefficient with the feature column it belongs to.
coef_names = list(X.columns)
coef_list = list(lin_reg.coef_)
coefs = [(weight, feature) for weight, feature in zip(coef_list, coef_names)]
coefs

Intercept = 2.0785260224089677


[(0.8537382309501829, 'MedInc'),
 (0.12314016437064798, 'HouseAge'),
 (-0.2786629912078322, 'AveRooms'),
 (0.3100873204811737, 'AveBedrms'),
 (-0.0024643688494347064, 'Population'),
 (-0.04206023286147729, 'AveOccup'),
 (-0.8770630198227276, 'Latitude'),
 (-0.8554802475268458, 'Longitude')]

In [None]:
# Conclusion: the linear regression model has a better validation R2 than the Lasso model, so it is the winning model.