In [None]:
import sys
from packaging import version
import sklearn
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np

# Fail early when the interpreter or scikit-learn is older than this notebook supports.
assert sys.version_info >= (3, 7)
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

# Default font sizes shared by every figure in this notebook.
plt.rcParams.update({
    "font.size": 14,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "legend.fontsize": 14,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Get the Data

*Your task is to predict median house values in Californian districts, given a number of features from these districts.*

## Download the Data

In [None]:
def load_housing_data():
    """Return the California housing dataset as a DataFrame.

    The CSV is expected at ``datasets/housing/housing.csv``. When it is
    missing, it is extracted from ``datasets/housing.tgz``; when the tarball
    is missing too, the tarball is downloaded first.

    Returns
    -------
    pandas.DataFrame
        The contents of ``datasets/housing/housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    # Key the work on the CSV, not on the tarball: a tarball that is present
    # but was never (or only partially) extracted must still be unpacked.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        # The archive comes from the network, so use the "data" extraction
        # filter where the interpreter supports it (it rejects members that
        # would be written outside the destination directory).
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets", **extract_kwargs)
    return pd.read_csv(csv_path)


# Load the housing CSV into a DataFrame (the tarball is downloaded if it is not on disk yet).
housing = load_housing_data()

## Take a Quick Look at the Data Structure

In [None]:
# Preview the first rows to see which columns exist and what the values look like.
housing.head()  # top 5 rows

In [None]:
# Print each column's dtype and non-null count.
housing.info()

### Question
*Is there anything odd in the `info()` output above?*

In [None]:
# Count how many rows fall into each ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
housing.describe()

### Question?

Do you have takeaways from the data?


In [None]:
# One histogram with 50 bins per numeric column.
housing.hist(bins=50, figsize=(12, 8))
plt.show()

### Question?

Do you see any problems in the histograms?

## Create a Test Set

### Question?
What do we have to do for a test set?

In [None]:
def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly split `data` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; must support ``len()`` and ``.iloc``.
    test_ratio : float
        Fraction of rows for the test part, between 0 and 1 inclusive. The
        test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int, optional
        Seed for a private random generator, which makes the split
        repeatable. With the default ``None`` the global NumPy random state
        is used, so the result depends on ``np.random.seed``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Disjoint row subsets that together contain every row of `data`.

    Raises
    ------
    ValueError
        If `test_ratio` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        # A negative ratio would produce a negative slice bound and silently
        # put most rows into the test part.
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A dedicated generator leaves the global NumPy random state untouched.
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows with the hand-written splitter defined above.
train_set, test_set = shuffle_and_split_data(housing, 0.2)
len(train_set)  # number of training rows

In [None]:
len(test_set)  # number of test rows

### Question?
Are there any problems with this process?

To ensure that this notebook's outputs remain the same every time we run it, we need to set the random seed:

In [None]:
# Seed NumPy's global random generator so later np.random calls are repeatable.
# NOTE: on a top-to-bottom run the shuffle_and_split_data call above has
# already executed before this seed is set.
np.random.seed(42)

## API

Alternative: scikit-learn's `train_test_split()` performs the same kind of split.

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split using scikit-learn; random_state=42 fixes the shuffle.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

### Question?
Do we still have problems?

In [None]:
# Number of missing total_bedrooms values in the test set.
test_set["total_bedrooms"].isnull().sum()

In [None]:
# Number of missing total_bedrooms values in the training set.
train_set["total_bedrooms"].isnull().sum()

## Representative Test/Train Dataset

In [None]:
# Bucket median_income into five labelled categories (1 = lowest, 5 = highest).
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_cat_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"], bins=income_bin_edges, labels=income_cat_labels)

In [None]:
# Bar chart of how many rows fall into each income category, in category order.
income_cat_counts = housing["income_cat"].value_counts().sort_index()
ax = income_cat_counts.plot.bar(rot=0, grid=True)
ax.set_xlabel("Income category")
ax.set_ylabel("Number of districts")
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Build 10 different 80/20 splits, each stratified on income_cat.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
stratified_indices = splitter.split(housing, housing["income_cat"])

strat_splits = []
for train_index, test_index in stratified_indices:
    # Turn the positional indices of this split into the matching row subsets.
    strat_train_set_n, strat_test_set_n = (housing.iloc[train_index],
                                           housing.iloc[test_index])
    strat_splits.append([strat_train_set_n, strat_test_set_n])

In [None]:
# Use the first of the stratified splits.
strat_train_set, strat_test_set = strat_splits[0]

It's much shorter to get a single stratified split using `train_test_split()` with the `stratify` option:

In [None]:
# Single 80/20 split that is stratified on income_cat.
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

In [None]:
# Share of each income category in the stratified test set.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
def income_cat_proportions(data):
    """Return, per income category, the fraction of rows of `data` in it.

    The counts are taken from the "income_cat" column and each count is
    divided by the total number of rows in `data`.
    """
    category_counts = data["income_cat"].value_counts()
    n_rows = len(data)
    return category_counts / n_rows


# Purely random split, used as the baseline for the comparison below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income category proportions in the full data and in both test sets.
proportions_by_sample = {
    "Overall %": income_cat_proportions(housing),
    "Stratified %": income_cat_proportions(strat_test_set),
    "Random %": income_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions_by_sample).sort_index()
compare_props.index.name = "Income Category"

# Relative deviation of each test set's proportions from the overall ones.
for error_label, sample_label in (("Strat. Error %", "Stratified %"),
                                  ("Rand. Error %", "Random %")):
    compare_props[error_label] = (compare_props[sample_label] /
                                  compare_props["Overall %"] - 1)

(compare_props * 100).round(2)

### Clean up

In [None]:
# income_cat is no longer needed on the stratified sets, so remove it.
# Rebinding to the result of drop() instead of using inplace=True avoids
# mutating frames that were derived from `housing`, and errors="ignore" lets
# this cell be re-run after the column is already gone (the in-place version
# raised KeyError on a second run).
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

# Discover and Visualize the Data to Gain Insights

In [None]:
# Explore a copy so that changes made from here on do not modify strat_train_set.
housing = strat_train_set.copy()

## Visualizing Geographical Data

In [None]:
# Plot every row at its longitude/latitude position.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True)
plt.show()

In [None]:
# Same plot with semi-transparent markers (alpha=0.2) so dense areas stand out.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
plt.show()

In [None]:
# Marker size (s) scales with population; marker color (c) encodes
# median_house_value using the "jet" colormap, with a colorbar as the key.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
             s=housing["population"] / 100, label="population",
             c="median_house_value", cmap="jet", colorbar=True,
             legend=True, sharex=False, figsize=(10, 7))
plt.show()

The argument `sharex=False` fixes a display bug: without it, the x-axis values and label are not displayed (see: https://github.com/pandas-dev/pandas/issues/10611).

### Question?
- What can we take away from the visualization?
- Which other attributes could be useful?

## Looking for Correlations

The goal is still to predict housing prices.

 0   longitude  
 1   latitude  
 2   housing_median_age  
 3   total_rooms  
 4   total_bedrooms  
 5   population  
 6   households  
 7   median_income  
 8   median_house_value  
 9   ocean_proximity   

In [None]:
# Pairwise correlations between the numeric columns (non-numeric columns are skipped).
corr_matrix = housing.corr(numeric_only=True)

In [None]:
# Correlation of every numeric column with median_house_value, largest first.
corr_matrix["median_house_value"].sort_values(ascending=False)

Standard correlation coefficient  
+1 .. strong positive correlation  
-1 .. strong negative correlation  

In [None]:
from pandas.plotting import scatter_matrix

# Plot every selected attribute against every other one.
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

### Question?
What do we see?

In [None]:
# Closer look at median_income against median_house_value; the low alpha
# keeps overlapping points readable.
housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1, grid=True)
plt.show()

## Experimenting with Attribute Combinations

In [None]:
# Derive ratio features from the raw totals: rooms and people per household,
# and the share of rooms that are bedrooms.
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]

In [None]:
# Recompute the correlations so the derived ratio columns are included.
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Scatter matrix of median_house_value against the derived ratio attributes.
attributes = ["median_house_value", "rooms_per_house", "bedrooms_ratio",
              "people_per_house"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

# Next Steps

- Data Cleaning
- Data Scaling
- Pick an ML algorithm
- iterate