# Dataset Exploration: Boston House Pricing
## Bohumír Zámečník
http://www.neural.cz/dataset-exploration-boston-house-pricing.html


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# We will load the Boston dataset directly from a local CSV file instead of
# getting it through sklearn. The path is relative to the working directory.
df = pd.read_csv('data/Boston.csv')

In [3]:
# Count data points (rows) and attributes (columns). df.shape counts every
# column, so attr_count includes the target column as well as the features.
instance_count, attr_count = df.shape

In [4]:
# number of rows, i.e. data points
instance_count

506

In [5]:
# number of columns (features plus the target column)
attr_count

13

In [6]:
# peek at the first few rows to check the columns and value ranges
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [7]:
# per-column summary statistics: count, mean, std, min, quartiles, max
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97,50.0


In [8]:
# CRIM = per capita crime rate by town
# ZN = proportion of residential land zoned for lots over 25,000 sq. ft.
# INDUS = proportion of non-retail business acres per town
# CHAS = Charles River dummy variable
# NOX = nitrogen oxides concentration
# RM = avg. rooms per dwelling
# AGE = proportion of owner-occupied units built prior to 1940
# DIS = weighted mean of distances to five Boston employment centers
# RAD = index of accessibility to radial highways
# TAX = full-value property-tax rate per $10,000
# PTRATIO = pupil-teacher ratio by town
# LSTAT = lower status of the population (percent)
# MEDV = median value of owner-occupied homes in $1000s (the target)

In [9]:
# pandas offers three correlation coefficients via the corr() function:
# Pearson, Spearman rank correlation, and Kendall Tau rank correlation.
# We'll use Pearson here (the call below passes method='pearson').

pearson = df.corr(method='pearson')
pearson

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
CRIM,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,0.455621,-0.388305
ZN,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,-0.412995,0.360445
INDUS,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,0.6038,-0.483725
CHAS,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,-0.053929,0.17526
NOX,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,0.590879,-0.427321
RM,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,-0.613808,0.69536
AGE,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,0.602339,-0.376955
DIS,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,-0.496996,0.249929
RAD,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,0.488676,-0.381626
TAX,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,0.543993,-0.468536


In [None]:
# Correlation of every attribute with the target/answer: take the last row
# of the correlation matrix and leave out its final entry, which is the
# target's correlation with itself.
corr_with_target = pearson.iloc[-1, :-1]
corr_with_target

In [None]:
# Rank the attributes from the most positive to the most negative correlation
# with the target; sort_values returns a new Series and leaves
# corr_with_target untouched.
predictivity = corr_with_target.sort_values(ascending=False)

In [None]:
# correlations with the target, in descending order
predictivity

In [None]:
# Strong negative correlations are important too, so rank the attributes by
# the absolute value of their correlation with the target (strongest first).
# argsort yields integer positions, so select with .iloc: indexing a
# label-indexed Series with integer keys through plain [] relies on a
# deprecated positional fallback.
corr_with_target.iloc[np.argsort(corr_with_target.abs().values)[::-1]]

In [None]:
# It might be interesting to select some strong correlations between
# attribute pairs.
attrs = pearson.iloc[:-1, :-1]  # all except target (last row and column)
threshold = 0.5
# Keep only the strong correlations and drop each attribute's correlation
# with itself. The diagonal is excluded by position rather than by comparing
# values against 1.0, so the filter does not depend on exact float equality
# and a perfect correlation between two different attributes is kept.
off_diagonal = ~np.eye(len(attrs), dtype=bool)
strong = (attrs.abs() > threshold) & off_diagonal
# {('LSTAT', 'TAX'): 0.543993, ('INDUS', 'RAD'): 0.595129, ...
important_corrs = attrs.where(strong).unstack().dropna().to_dict()
# The matrix is symmetric, so both (a, b) and (b, a) are present. Key each
# pair by its alphabetically sorted names to keep a single entry per pair;
# a dict keeps insertion order, so the table is reproducible across runs.
pair_corrs = {tuple(sorted(pair)): corr
              for pair, corr in important_corrs.items()}
#     attribute pair  correlation
# 0     (AGE, INDUS)     0.644779
# 1     (INDUS, RAD)     0.595129
# ...
unique_important_corrs = pd.DataFrame(
    list(pair_corrs.items()), columns=['attribute pair', 'correlation'])
# sorted by absolute value, strongest first
strongest_first = np.argsort(
    unique_important_corrs['correlation'].abs().values)[::-1]
unique_important_corrs = unique_important_corrs.iloc[strongest_first]

In [None]:
# attribute pairs whose |correlation| exceeds the threshold, strongest first
unique_important_corrs

## Let's Visualize

In [None]:
# Render matplotlib figures inline in the notebook (IPython magic, not Python).
%matplotlib inline
import seaborn as sns  # heatmap replaces corrplot

# Using all correlations; the trailing semicolon hides the cell's text output
sns.heatmap(pearson); 

In [None]:
# Display the correlation value inside each cell (annot=True) and switch to
# the 'coolwarm' colormap.
sns.heatmap(pearson, cmap='coolwarm', annot=True); 

In [None]:
# Generate a mask for the upper triangle (the diagonal and everything above
# it), so each attribute pair is drawn only once.
# Remove use of the mask below to see the "whole" heatmap
# The builtin bool is used as dtype: the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(pearson, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Adjust the size of the image output
fig, ax = plt.subplots(figsize=(8, 8))

# Add square parameter to make cells square, use the mask, remove annot
sns.heatmap(pearson, cmap='coolwarm', mask=mask, square=True, ax=ax);

In [None]:
# Distribution of the target column (MEDV) as a histogram with default bins.
attr = df['MEDV']
plt.hist(attr);

In [None]:
# Same histogram with 50 bins to show more detail.
plt.hist(attr, bins=50);

In [None]:
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot /
# displot are the suggested replacements) — confirm the installed seaborn
# version before relying on this call.
sns.distplot(attr);

### For integer-valued data (e.g., categories) automatic quantization into a pre-defined number of bins might not be the best option.
### We'd like to quantize according to the original distinct values. For that we can just compute this kind of histogram ourselves and use the bar plot.
* Example for __RAD__ int (category) - index of accessibility to radial highways:

In [None]:
# Build the histogram by hand: one bar per distinct RAD value, with the bar
# height equal to the number of rows having that value.
cat_attr = df['RAD']
h = cat_attr.value_counts()
values = h.index
counts = h
plt.bar(values, counts);

In [None]:
# Scatter plot of the target (MEDV) against DIS.
plt.scatter(df['DIS'], df['MEDV']);

In [None]:
# Semi-transparent markers make overlapping points visible as darker areas.
x, y = df['DIS'], df['MEDV']
plt.scatter(x, y, alpha=0.5)

# or via jointplot (with histograms aside); x and y are passed by keyword
# because newer seaborn releases no longer accept them positionally:
sns.jointplot(x=x, y=y, kind='scatter', joint_kws={'alpha': 0.5});

In [None]:
# Hexagonal binning of the same pair of columns; x and y are passed by
# keyword because newer seaborn releases no longer accept them positionally.
sns.jointplot(x=df['DIS'], y=df['MEDV'], kind='hex');

In [None]:
# Kernel density estimate of the joint distribution; x and y are passed by
# keyword because newer seaborn releases no longer accept them positionally.
#sns.kdeplot(x=df['DIS'], y=df['MEDV'], shade=True)
# or
sns.jointplot(x=df['DIS'], y=df['MEDV'], kind='kde');