# III. Explore the data

This notebook assumes that notebook `2_get_California_housing_data.ipynb` has been executed successfully.

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

In [3]:
# Ignore all warnings for the purpose of this notebook
import warnings 
warnings.filterwarnings('ignore')

## III.2 Study each attribute

In [5]:
# Load the training data; the first CSV column becomes the DataFrame index.
# NOTE(review): the saved output of this cell shows a FileNotFoundError for
# 'train_set.csv', not 'X_train.csv' -- either the output is stale or the file
# name here is wrong. Confirm which file the previous notebook writes; later
# cells require a 'median_house_value' column in this frame.
housing = pd.read_csv('X_train.csv', index_col=0)
# Print a summary of the frame (columns, dtypes, non-null counts).
housing.info()

FileNotFoundError: File b'train_set.csv' does not exist

In [None]:
# Bar chart of how many rows fall into each ocean_proximity value
housing.ocean_proximity.value_counts().plot.bar()

In [None]:
def explore(feature):
    """Plot a histogram, a box plot and summary statistics for one column.

    Creates a figure with two vertically stacked axes that share the x axis
    (height ratio 5:1): a histogram of ``feature`` on top and a horizontal
    box plot below. The text of ``feature.describe()`` is written to the
    right of the histogram and ``feature.name`` is used as the figure title.

    No missing-value handling is done here.

    Parameters
    ----------
    feature : pandas.Series
        The values to plot; must provide ``describe()`` and ``name``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    # Two-row layout with unequal heights, see
    # http://stackoverflow.com/questions/40070093/gridspec-on-seaborn-subplots
    gridkw = dict(height_ratios=[5, 1])
    fig, (ax1, ax2) = plt.subplots(2, 1, gridspec_kw=gridkw, sharex=True)

    # Top subplot: plain histogram (no KDE curve).
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept
    # here so the binning stays unchanged -- consider histplot when upgrading.
    sns.distplot(feature, ax=ax1, kde=False)

    # Bottom subplot. The data must be passed as `x=`: newer seaborn versions
    # interpret the first positional argument as `data`, which would no longer
    # give a horizontal box aligned with the shared x axis.
    sns.boxplot(x=feature, ax=ax2, width=.4)

    ax1.set_xlabel('')

    # Summary statistics, placed just outside the right edge of the histogram
    # (coordinates are in axes fractions because of transAxes).
    ax1.text(1.05, 0.95,
             str(feature.describe()),
             transform=ax1.transAxes, fontsize=14,
             verticalalignment='top')

    # Leave room for the title, see
    # http://stackoverflow.com/questions/29813694/how-to-add-a-title-to-seaborn-facet-plot
    fig.subplots_adjust(top=0.9)
    fig.suptitle(feature.name, fontsize=16)

explore(housing.longitude)

In [None]:
# Histogram, box plot and summary statistics for latitude
explore(housing.latitude)

In [None]:
# Histogram, box plot and summary statistics for housing_median_age
explore(housing.housing_median_age)

In [None]:
# Histogram, box plot and summary statistics for total_rooms
explore(housing.total_rooms)

In [None]:
# Histogram, box plot and summary statistics for total_bedrooms;
# rows with a missing value are removed before plotting
explore(housing.total_bedrooms.dropna())

In [None]:
# Histogram, box plot and summary statistics for population
explore(housing.population)

In [None]:
# Histogram, box plot and summary statistics for households
explore(housing.households)

In [None]:
# Histogram, box plot and summary statistics for median_income
explore(housing.median_income)

In [None]:
# Histogram, box plot and summary statistics for median_house_value
explore(housing.median_house_value)

## Visualize the data

In [None]:
# Longitude/latitude scatter; the low alpha makes dense regions stand out
housing.plot.scatter(x="longitude", y="latitude", alpha=0.1)

In [None]:
# Same scatter, with marker size proportional to population (divided by 100)
# and marker colour taken from median_house_value via the "jet" colormap
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"] / 100,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="population",
)
plt.legend()

## Study the correlations between attributes.

In [None]:
# Compute the standard correlation coefficient (also called Pearson's r).
# The numeric/boolean columns are selected explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently in corr() and raises on them instead
# (ocean_proximity is presumably such a text column -- it is used as a
# categorical hue elsewhere in this notebook).
corr_matrix = housing.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Correlation of every attribute with median_house_value, strongest positive first
corr_matrix.median_house_value.sort_values(ascending=False)

In [None]:
# Pairwise plots of four numeric attributes, coloured by ocean_proximity
sns.pairplot(
    housing[[
        "median_house_value",
        "median_income",
        "total_rooms",
        "housing_median_age",
        "ocean_proximity",
    ]],
    hue="ocean_proximity",
    plot_kws=dict(alpha=0.4),
)

In [None]:
# Flag the rows whose median_house_value equals the column maximum and use
# that flag as the hue of the pair plot
housing["max_value"] = housing["median_house_value"].eq(
    housing["median_house_value"].max()
)
sns.pairplot(
    housing,
    hue="max_value",
    x_vars=["median_house_value", "median_income", "total_rooms", "housing_median_age"],
    y_vars=["median_house_value", "households", "total_bedrooms", "population"],
    plot_kws=dict(alpha=0.4),
)

In [None]:
# Remove the helper flag column again so it does not show up in later cells.
# Not idempotent: running this cell twice raises KeyError.
del housing["max_value"]

## Feature Engineering

In [None]:
# Derive per-household / per-room ratio columns from the raw count columns
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

In [None]:
# Recompute the correlations now that the ratio columns exist.
# Numeric/boolean columns are selected explicitly because pandas >= 2.0 no
# longer drops non-numeric columns silently in corr() and raises instead.
corr_matrix = housing.select_dtypes(include=["number", "bool"]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Histogram, box plot and summary statistics for population_per_household
explore(housing.population_per_household)

In [None]:
# Natural logarithm of population_per_household, stored as a new column
housing["log_population_per_household"] = np.log(housing.population_per_household)

In [None]:
# Histogram, box plot and summary statistics for log_population_per_household
explore(housing.log_population_per_household)

In [None]:
# Recompute the correlations including the log-transformed column.
# Numeric/boolean columns are selected explicitly because pandas >= 2.0 no
# longer drops non-numeric columns silently in corr() and raises instead.
corr_matrix = housing.select_dtypes(include=["number", "bool"]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)