In [23]:
import numpy as np
import pandas as pd

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them predict the expected sale value of properties in their state, and you will get a slice of their additional sales commission 💸

# Data Understanding

In [59]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset bundle and show the description that
# ships with it (feature list, number of instances, provenance).
data = fetch_california_housing()
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [70]:
# Features go into a DataFrame labelled with the dataset's own feature names;
# the target is kept exactly as the dataset bundle provides it.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target

# Preview the first rows of the feature table.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Data preparation

### Split your X data into train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Split your train data into train and validation datasets

In [62]:
from sklearn.model_selection import train_test_split

# Carve a validation set (20% of the training rows) out of the training data.
# Note that this split uses its own fixed seed (55), distinct from the
# train/test split.
X_train_v, X_val, y_train_v, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=55,
)

### Scale the 3 datasets using StandardScaler

In [63]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the inner training split only, so that validation and test
# rows do not influence the scaling parameters. fit() returns the scaler itself.
ss = StandardScaler().fit(X_train_v)

# Apply that one fitted scaler to every split. X_train (the union of
# X_train_v and X_val) is scaled with the same parameters as the others.
# The right-hand side reads the unscaled inputs before any name is rebound.
X_train_v, X_val, X_train, X_test = (
    ss.transform(split) for split in (X_train_v, X_val, X_train, X_test)
)

# Modelling and Model Evaluation

### Train a linear regression model

In [64]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled inner training split; fit() returns
# the estimator, so construction and training are chained.
lin_reg = LinearRegression().fit(X_train_v, y_train_v)

# Score on the same rows the model was trained on (in-sample fit).
lin_reg.score(X_train_v, y_train_v)

0.6069191925875926

### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [65]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Predict on the validation split, which the model did not see during fitting.
y_pred = lin_reg.predict(X_val)

# Print one line per metric, each comparing validation targets to predictions.
for label, metric in (
    ("R squared score =", r2_score),
    ("Mean Squared Error =", mean_squared_error),
    ("Mean Absolute Error =", mean_absolute_error),
):
    print(label, metric(y_val, y_pred))

R squared score = 0.6335358524333233
Mean Squared Error = 0.4978293787647968
Mean Absolute Error = 0.521162940096561


### Train a LASSO model

In [66]:
from sklearn.linear_model import Lasso

# LASSO regression with regularisation strength alpha = 0.01, trained on the
# same scaled inner training split as the linear regression.
lasso = Lasso(alpha=0.01).fit(X_train_v, y_train_v)

# Score on the same rows the model was trained on (in-sample fit).
lasso.score(X_train_v, y_train_v)

0.6031801653777368

### Measure the R-squared, MSE and MAE of your model

In [67]:
# Validation-set predictions from the LASSO model.
lasso_y_pred = lasso.predict(X_val)

# Print the same three metrics used for the linear regression, so the two
# models can be compared on identical validation rows.
for label, metric in (
    ("R squared score =", r2_score),
    ("Mean Squared Error =", mean_squared_error),
    ("Mean Absolute Error =", mean_absolute_error),
):
    print(label, metric(y_val, lasso_y_pred))

R squared score = 0.6285274479682577
Mean Squared Error = 0.5046331299639312
Mean Absolute Error = 0.525037895361113


# Interpret your winning model

### What can you tell your business partner by looking at the coefficients?

In [74]:
# The plain linear regression (not the LASSO regression) is kept as the
# winning model: its R-squared on the validation set was slightly higher.

print("Intercept =", lin_reg.intercept_)

# Feature names are read from the original DataFrame X; the model was fit on
# data derived from X, so the coefficients follow the same column order.
coef_names = X.columns.tolist()
coef_list = [*lin_reg.coef_]

# Pair every coefficient with its feature name as (coefficient, name) tuples.
coefs = [(weight, feature) for weight, feature in zip(coef_list, coef_names)]
coefs

Intercept = 2.0785260224089677


[(0.8537382309501826, 'MedInc'),
 (0.1231401643706483, 'HouseAge'),
 (-0.27866299120783306, 'AveRooms'),
 (0.31008732048117466, 'AveBedrms'),
 (-0.0024643688494343273, 'Population'),
 (-0.04206023286147723, 'AveOccup'),
 (-0.8770630198227283, 'Latitude'),
 (-0.8554802475268465, 'Longitude')]

In [71]:
# The model was fit on standardized features, so each coefficient is the expected change in the
# target variable (median house value, which scikit-learn reports in units of $100,000) for a
# one standard deviation increase in that feature, holding the other features constant:
# A one standard deviation increase in MedInc is associated with a 0.85 increase in the target variable
# A one standard deviation increase in HouseAge is associated with a 0.12 increase in the target variable
# A one standard deviation increase in AveRooms is associated with a 0.28 decrease in the target variable
# A one standard deviation increase in AveBedrms is associated with a 0.31 increase in the target variable
# A one standard deviation increase in Population is associated with a 0.002 decrease in the target variable
# A one standard deviation increase in AveOccup is associated with a 0.04 decrease in the target variable
# A one standard deviation increase in Latitude is associated with a 0.88 decrease in the target variable
# A one standard deviation increase in Longitude is associated with a 0.86 decrease in the target variable

In [None]:
# Using these coefficients, an equation can be constructed that will assist my business partner in valuing properties.
# This equation takes the form y = b1*x1 + b2*x2 + … + bn*xn + c,
# where bi are the regression coefficients, xi are the feature values and c is the intercept.
# Because the model was trained on standardized data, each xi must first be scaled with the fitted StandardScaler (ss).