## This exercise and code have been taken and adapted from:
### https://medium.com/@kumar.bits009/make-your-own-model-to-predict-house-prices-in-python-ad843aee1e2

### Please visit the above link to learn more

In [1]:
import numpy as np
import matplotlib.pyplot as plt 

import pandas as pd  
import seaborn as sns 

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing data. The returned object is used below for its
# feature matrix (`data`), regression target (`target`) and `feature_names`.
dataset = fetch_california_housing(return_X_y=False)

In [3]:
# Names of the columns of the feature matrix.
feature_names = dataset.feature_names
names_message = "Feature names: {}\n".format(feature_names)
print(names_message)

Feature names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']



In [4]:
# Show the raw feature matrix (NumPy elides the middle rows/columns when printing).
print(dataset['data'])

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]


In [5]:
from sklearn import preprocessing

In [6]:
# Raw feature matrix only -- the target values are held separately in
# dataset.target.
data_original = dataset.data
# Standardize the features with preprocessing.scale.
# NOTE(review): the scaling statistics come from all rows, i.e. they are
# computed before the train/test split further down, so the held-out rows
# influence the preprocessing -- confirm this is acceptable for the exercise.
X_scaled = preprocessing.scale(data_original)
# Display the scaled matrix as the cell output.
X_scaled

array([[ 2.34476576,  0.98214266,  0.62855945, ..., -0.04959654,
         1.05254828, -1.32783522],
       [ 2.33223796, -0.60701891,  0.32704136, ..., -0.09251223,
         1.04318455, -1.32284391],
       [ 1.7826994 ,  1.85618152,  1.15562047, ..., -0.02584253,
         1.03850269, -1.33282653],
       ...,
       [-1.14259331, -0.92485123, -0.09031802, ..., -0.0717345 ,
         1.77823747, -0.8237132 ],
       [-1.05458292, -0.84539315, -0.04021111, ..., -0.09122515,
         1.77823747, -0.87362627],
       [-0.78012947, -1.00430931, -0.07044252, ..., -0.04368215,
         1.75014627, -0.83369581]])

In [7]:
# Expand the scaled features with polynomial terms up to degree 2 (squares and
# pairwise products) so the linear model can capture curvature and interactions.
pft = preprocessing.PolynomialFeatures(degree=2)
X_poly = pft.fit(X_scaled).transform(X_scaled)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    dataset.target,
    test_size=0.40,
    random_state=42,
)

In [9]:
from sklearn import linear_model

# Two versions of linear regression can be tried here:
#   (i)  multiple linear regression (active below)
#   (ii) Ridge regression; alpha is the regularization parameter
#        (don't get confused by the symbol):
#        model = linear_model.Ridge(alpha = 300)
model = linear_model.LinearRegression()

# Fit on the training split. Kept as the last expression so the fitted
# estimator is shown as the cell output.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [10]:
# Predictions for the held-out test rows.
# NOTE(review): the evaluation cell recomputes the same predictions as
# y_test_predict, so this variable appears unused in the visible cells.
predictionTestSet = model.predict(X=X_test)

In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


def report_performance(split_name, y_true, y_pred):
    """Print RMSE and R2 for one data split and return them.

    Parameters
    ----------
    split_name : str
        Label inserted into the heading, e.g. "training" or "testing".
    y_true : array-like
        Observed target values for the split.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed for the split.
    """
    # RMSE is the square root of the mean squared error.
    split_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # r square
    split_r2 = r2_score(y_true, y_pred)

    print("The model performance for {} set".format(split_name))
    print("--------------------------------------")
    print('RMSE is {}'.format(split_rmse))
    print('R2 score is {}'.format(split_r2))
    return split_rmse, split_r2


# model evaluation for training set
y_train_predict = model.predict(X_train)
rmse_train, r2_train = report_performance("training", y_train, y_train_predict)
# blank lines separating the two reports
print("\n")

# model evaluation for testing set
y_test_predict = model.predict(X_test)
rmse_test, r2_test = report_performance("testing", y_test, y_test_predict)

# Keep the names this cell has always left bound: after it runs, `rmse` and
# `r2` hold the test-set values.
rmse, r2 = rmse_test, r2_test


The model performance for training set
--------------------------------------
RMSE is 0.6450401066282406
R2 score is 0.6863854687130181


The model performance for testing set
--------------------------------------
RMSE is 0.8457257113696922
R2 score is 0.4657135571416554


You can also visit the following link to see another example:
https://towardsdatascience.com/linear-regression-on-boston-housing-dataset-f409b7e4a155
