In [None]:
import numpy as np
import pandas as pd

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them predict the expected sale value of properties in their state, and you will get a slice of their additional sales commission 💸

# Data Understanding

In [27]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset bundle and print the description it ships with.
data = fetch_california_housing()
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [None]:
# Features as a labelled DataFrame (columns named after the dataset's feature names);
# the target is kept exactly as the dataset provides it.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target

# Preview the first rows of the feature table.
X.head()

# Data preparation

### Split your X data into train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Row counts, in order: X_train, X_test, y_train, y_test.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

### Split your train data into train and validation datasets

In [None]:
from sklearn.model_selection import train_test_split

# Carve a validation set (20%) out of the training data, leaving the test set untouched.
X_train_sub, X_validation, y_train_sub, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Row counts, in order: X_train_sub, X_validation, y_train_sub, y_validation.
print(*(len(part) for part in (X_train_sub, X_validation, y_train_sub, y_validation)))

### Scale the 3 datasets using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fit on the training subset only; validation and test data are
# transformed with those same fitted statistics.
scaler = StandardScaler().fit(X_train_sub)

# transform() returns plain arrays, so each result is wrapped back into a
# DataFrame carrying the dataset's feature names as column labels.
X_train_sub_scaled = pd.DataFrame(
    scaler.transform(X_train_sub),
    columns=data['feature_names'],
)
X_validation_scaled = pd.DataFrame(
    scaler.transform(X_validation),
    columns=data['feature_names'],
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=data['feature_names'],
)



# Modelling and Model Evaluation

### Train a linear regression model

In [None]:
# Put the target next to the features in a single frame for inspection.
y_series = pd.Series(y, name='House_median')
whole = pd.concat([X, y_series], axis='columns')

# Display the combined table.
whole


In [35]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training subset.
lin_reg = LinearRegression()
lin_reg.fit(X_train_sub_scaled, y_train_sub)

# R-squared measured on the same data the model was fit on (a training score).
initial_score = lin_reg.score(X_train_sub_scaled, y_train_sub)

# `inter` was printed below without ever being assigned, which raises NameError
# on a fresh kernel; take the intercept from the fitted model.
inter = lin_reg.intercept_
parameters = lin_reg.coef_

# Built-in round() is used for the scalars so the cell works whether the value
# is a plain Python float or a NumPy scalar; the coefficient array keeps .round().
print('The Initial R-Squared value for the linear mode is ', round(initial_score, 2))
print('The intercept of the regression is', round(inter, 2))
print('The coefficients are ', parameters.round(2))

The Initial R-Squared value for the linear mode is  0.61
The intercept of the regression is 2.07
The coefficients are  [ 0.85  0.12 -0.27  0.3  -0.01 -0.03 -0.89 -0.86]


### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# TODO: this cell is still a placeholder; nothing is computed yet.
# TODO: generate predictions with the fitted linear regression model.
# TODO: compute R-squared with r2_score.
# TODO: compute MSE with mean_squared_error.
# TODO: compute MAE with mean_absolute_error.

### Train a LASSO model

In [None]:
from sklearn.linear_model import Lasso
# TODO: this cell is still a placeholder; only the import is in place.
# TODO: create a Lasso estimator.
# TODO: fit it (presumably on the same scaled training subset as the linear model; confirm).

### Measure the R-squared, MSE and MAE of your model

In [None]:
# TODO: this cell is still a placeholder; nothing is computed yet.
# TODO: generate predictions with the LASSO model.
# TODO: compute R-squared for the LASSO model.
# TODO: compute MSE for the LASSO model.
# TODO: compute MAE for the LASSO model.

# Interpret your winning model

### What can you tell your business partner by looking at the coefficients?

In [None]:
# TODO: this cell is still a placeholder; nothing is computed yet.
# TODO: inspect the coefficients of the winning model.
# TODO: summarise what they mean for the business partner.