In [None]:
# Smoke-test cell: prints a fixed greeting.
print("Hello World")

In [None]:
import tarfile
import urllib
import urllib.request
from pathlib import Path as path

In [None]:
# Base URL under which the housing archive is published.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory (relative to the working directory) used for the dataset.
HOUSING_PATH = path("datasets") / "housing"
# Full URL of the compressed housing archive.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents. It is created (including parents) when missing. Both
            ``Path`` objects and plain strings are accepted.

    The archive is downloaded on every call and overwrites any existing
    ``housing.tgz`` in ``housing_path``.
    """
    housing_dir = path(housing_path)
    housing_dir.mkdir(parents=True, exist_ok=True)
    tgz_path = housing_dir / "housing.tgz"
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, reject unsafe members (absolute
            # paths, links pointing outside the target directory, ...).
            housing_tgz.extractall(path=housing_dir, filter="data")
        else:
            housing_tgz.extractall(path=housing_dir)

# Download and extract the dataset using the default URL and target directory.
fetch_housing_data()

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(path(housing_path) / "housing.csv")

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Print the DataFrame summary (column names, non-null counts, dtypes).
housing.info()

### 10 columns
### 9 numerical features, 1 categorical feature

In [None]:
# Count how many rows carry each distinct ocean_proximity value.
housing.ocean_proximity.value_counts()

In [None]:
# Summary statistics of the numerical columns.
housing.describe()

In [None]:
# One histogram per numerical column; the trailing semicolon keeps the cell
# output to the figure only instead of also printing the array of Axes objects.
housing.hist(bins=50, figsize=(20, 15));

## Creating an income category attribute

In [None]:
# Show the pd.cut documentation (interactive reference only; produces a long text output).
help(pd.cut)

In [None]:
import numpy as np

# Bucket median_income into five ordered categories labelled 1-5, using the
# bin edges 0, 1.5, 3, 4.5, 6 and +inf (right-closed intervals, the pd.cut default).
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [None]:
# Distribution of the income categories; the trailing semicolon suppresses the
# Axes repr so only the figure is shown.
housing["income_cat"].hist();

In [None]:
# List the column labels; income_cat should now appear among them.
housing.columns

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split on income_cat with a fixed seed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row indices, so select by position (.iloc)
    # instead of by label (.loc); the two only coincide for a default RangeIndex.
    # .copy() makes each subset an independent frame that can be modified
    # in place later on.
    strat_train_set = housing.iloc[train_index].copy()
    strat_test_set = housing.iloc[test_index].copy()

In [None]:
# Share of each income category within the stratified test set.
stratified = strat_test_set["income_cat"].value_counts().div(len(strat_test_set))
stratified

### Compare to the original housing data

In [None]:
# Share of each income category in the full dataset (the reference distribution).
original = housing["income_cat"].value_counts().div(len(housing))
original

### Let's see how a random split would work

In [None]:
from sklearn.model_selection import train_test_split

# train_test_split returns the training subset first and the test subset second,
# so the 20% hold-out is the second element.
random_train_set, random_test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Share of each income category within the randomly sampled set: divide by the
# size of that set (not of the full dataset) so the proportions sum to 1 and are
# comparable with the other distributions.
random = random_test_set["income_cat"].value_counts() / len(random_test_set)
random

In [None]:
# NOTE(review): `columns` is not used by any cell visible here — confirm whether
# it was meant to set the column order of `summary`.
columns=["Random","Original","Stratified"]
# Side-by-side income_cat proportions; the three Series are aligned on their category index.
summary = pd.DataFrame({"Original":original,"Random":random,"Stratified":stratified})

In [None]:
# Display the comparison table.
summary

In [None]:
# Relative deviation (in percent) of the random split from the full-dataset proportions.
rand_gap = summary["Original"] - summary["Random"]
summary["Rand. %error"] = rand_gap.div(summary["Original"]).mul(100)
summary

In [None]:
# Relative deviation (in percent) of the stratified split from the full-dataset proportions.
strat_gap = summary["Original"] - summary["Stratified"]
summary["Strat. %error"] = strat_gap.div(summary["Original"]).mul(100)
# Show the table ordered by income category.
summary.sort_index(axis=0)

### Delete dataframes and series created to compare test/train splits

In [None]:
# Drop the temporary comparison objects from the namespace.
# NOTE(review): re-running this cell without re-running the cells that define
# these names raises NameError.
del random, original, stratified, summary

### Remove the income_cat attribute so that the data goes back to its original format

In [None]:
# Before: preview the training set while it still carries the helper income_cat column.
strat_train_set.head()

In [None]:
# remove
for set_ in (strat_train_set,strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [None]:
# After: preview the test set to confirm income_cat is gone.
strat_test_set.head()

In [None]:
# Preview the training set after the income_cat removal.
strat_train_set.head()

# Data Visualisation