# Boston Housing ~ Price Predictions

In [141]:
# This is for dataframe manipulation.
import pandas

# This is for special high-efficiency arrays.
import numpy

# This is the boston housing dataset.
from sklearn.datasets import load_boston

# These various imports will be explained as they are used later.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Import & Read Boston Data

In [142]:
# Load the boston housing dataset.
boston = load_boston()

# Print data description.
print(boston.DESCR[21:])

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by 

In [143]:
# Create the dataframe.
data = pandas.DataFrame(boston.data)
data.columns = boston.feature_names
data['PRICE'] = boston.target

In [144]:
# This outputs the first five columns of the dataframe.
display(data.head(5))

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [145]:
# This describes properties of each dataframe column.
display(data.describe())

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


## Prepare Data
There are so many variables within the boston housing dataset.
Some of them are less relevant than others.
Reading over the data, there is only a few variables that seem to more directly affect price&hellip;
- ~~**CRIM:** per capita crime rate by town~~
- ~~**ZN:** proportion of residential land zoned for lots over 25,000 sq.ft.~~
- ~~**INDUS:** proportion of non-retail business acres per town~~
- ~~**CHAS:** Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)~~
- ~~**NOX:** nitric oxides concentration (parts per 10 million)~~
- **RM:** average number of rooms per dwelling
- ~~**AGE:** proportion of owner-occupied units built prior to 1940~~
- ~~**DIS:** weighted distances to five Boston employment centres~~
- **RAD:** index of accessibility to radial highways
- ~~**TAX:** full-value property-tax rate per &dollar;10,000~~
- **PTRATIO:** pupil-teacher ratio by town
- ~~**B:** 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town~~
- **LSTAT:** % lower status of the population

Once we remove those less-relevent variables, we are left with:
- **RM:** average number of rooms per dwelling
- **RAD:** index of accessibility to radial highways
- **PTRATIO:** pupil-teacher ratio by town
- **LSTAT:** % lower status of the population

In [146]:
# Consider only [RM, RAD, PTRATIO, LSTAT]
items = ['RM', 'RAD', 'PTRATIO', 'LSTAT']

# Create an array of those data-items; X.
X = []
for x in items:
	X.append(data[x])
# The python splat-operator (*X) removes the array.
# This makes its items into parameters for the zip function.
X = numpy.array(list(zip(*X)))

# Do the same for the target data; Y.
Y = data['PRICE']

display(X.shape)
display(Y.shape)

(506, 4)

(506,)

### Train/Test Split
*Why do we need train/test split?*
*What does it do?*

In [147]:
# Use the built-in "train test split" function
# to generate the four desireable segments of data.
X_train, X_test, Y_train, Y_test = train_test_split(
	X, Y, test_size=0.25, random_state=0)

display(X_train.shape)
display(X_test.shape)

(379, 4)

(127, 4)

### Min/Max Scalar
*Why do we need min/max scalar?*
*What does it do?*

In [148]:
# First, scale X.
X_scaled = MinMaxScaler()
# No reshaping necessary for X.
# Reshaping is needed for Y, though.
X_scaled.fit(X_train)
X_scaled.transform(X_test)

array([[0.53113623, 0.13043478, 0.45744681, 0.15914894],
       [0.50871815, 1.        , 0.80851064, 0.2212766 ],
       [0.53880054, 0.17391304, 0.40425532, 0.24879433],
       [0.34259437, 1.        , 0.80851064, 0.51177305],
       [0.51178387, 0.17391304, 0.64893617, 0.30099291],
       [0.43552405, 0.13043478, 0.89361702, 0.19120567],
       [0.44912819, 1.        , 0.80851064, 0.27574468],
       [0.46119946, 0.17391304, 0.80851064, 0.21446809],
       [0.27054991, 0.13043478, 0.61702128, 0.30950355],
       [0.54493198, 0.17391304, 0.88297872, 0.25248227],
       [0.45909178, 1.        , 0.80851064, 0.53588652],
       [0.59321709, 1.        , 0.80851064, 0.5529078 ],
       [0.53266909, 1.        , 0.80851064, 0.45560284],
       [0.20904388, 1.        , 0.80851064, 0.75319149],
       [0.81260778, 0.17391304, 0.22340426, 0.00539007],
       [0.66679441, 0.17391304, 0.23404255, 0.08539007],
       [0.51369994, 1.        , 0.80851064, 0.25560284],
       [0.71392987, 0.04347826,

In [149]:
# Next, scale Y.
Y_scaled =  MinMaxScaler()
Y_train = Y_train.values.reshape(-1,1)
Y_test = Y_test.values.reshape(-1,1)
Y_scaled.fit(Y_train)
Y_scaled.transform(Y_test)

array([[0.39111111],
       [1.        ],
       [0.4       ],
       [0.07333333],
       [0.36      ],
       [0.33111111],
       [0.34666667],
       [0.30444444],
       [0.24666667],
       [0.30222222],
       [0.08444444],
       [0.27111111],
       [0.22      ],
       [0.12222222],
       [1.        ],
       [0.53333333],
       [0.4       ],
       [0.62888889],
       [0.54222222],
       [0.35555556],
       [0.41777778],
       [0.31333333],
       [0.34222222],
       [0.53555556],
       [0.31777778],
       [0.40222222],
       [0.32444444],
       [0.32      ],
       [0.74888889],
       [0.30444444],
       [0.21333333],
       [0.33333333],
       [0.34444444],
       [0.33555556],
       [0.41333333],
       [0.26222222],
       [0.01333333],
       [1.        ],
       [0.21111111],
       [0.18444444],
       [0.42      ],
       [0.33333333],
       [0.32888889],
       [0.19555556],
       [0.25555556],
       [0.36888889],
       [0.34      ],
       [0.266