# ENGSCI762 - Data Science for Engineering Applications

# California House Pricing

Adapted from Géron (2017):

https://github.com/ageron/handson-ml/blob/master/02_end_to_end_machine_learning_project.ipynb

## II.2 Find and document where you can get that data 

In [None]:
# Notebook-wide settings: the repository hosting the housing archive, the
# archive's path inside it, and the full download URL built from both.
config = dict(
    data_repository="https://raw.githubusercontent.com/ageron/handson-ml/master",
    data_path="/datasets/housing/housing.tgz",
)
config.update(download_url="".join((config['data_repository'], config['data_path'])))
config

## II.7 Get the data

In [None]:
import os
from six.moves import urllib

# Relative path of the local directory that receives the downloaded archive
# and, by default, the files extracted from it.
config['reference_path'] = "../../reference/datasets/housing"

def fetch_housing_data(housing_url=config['download_url'],
                       housing_path=config['reference_path']):
    """Download the housing archive into a local directory.

    The defaults are read from ``config`` once, when this function is
    defined; later changes to ``config`` do not affect them.

    Parameters
    ----------
    housing_url : str
        URL the archive is downloaded from.
    housing_path : str
        Directory that receives the archive; created if it does not exist.

    Returns
    -------
    str
        Path of the downloaded file, ``<housing_path>/housing.tgz``.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    return tgz_path

In [None]:
# Download the archive with the default settings and record where it was saved.
config['local_reference'] = fetch_housing_data()
config['local_reference']

## II.8 Convert the data to a format you can easily manipulate

In [None]:
import tarfile
def convert_housing_data(tgz_path=config['local_reference'],
                         housing_path=config['reference_path']):
    """Extract every member of the housing archive into a directory.

    The defaults are read from ``config`` once, when this function is
    defined; later changes to ``config`` do not affect them.

    Parameters
    ----------
    tgz_path : str
        Path of the tar archive to unpack.
    housing_path : str
        Directory the archive members are extracted into.
    """
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains, so only use this on archives from a trusted source.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [None]:
# Unpack the archive, then list everything now present in the reference
# directory (the listing is displayed as the cell output).
convert_housing_data()
reference_files = os.listdir(config['reference_path'])
reference_files

In [None]:
import fnmatch
# os.listdir() returns entries in arbitrary order, so sort the matches to make
# any positional pick from this list (e.g. the first CSV) reproducible.
csv_files = sorted(fnmatch.filter(reference_files, '*.csv'))
csv_files

In [None]:
# Use the first CSV file found in the reference directory as the data file.
# Raises IndexError if no CSV file was found.
config['data_filename'] = csv_files[0]
config['data_filename']

## II.9 Check the size and type of data (time series, sample, geographical, etc.).

In [None]:
import pandas as pd
def load_housing_data(housing_path=config['reference_path'],
                      filename=config['data_filename']):
    """Read one CSV file from the housing directory into a DataFrame.

    The defaults are read from ``config`` once, when this function is
    defined.

    Parameters
    ----------
    housing_path : str
        Directory containing the CSV file.
    filename : str
        Name of the CSV file inside ``housing_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(housing_path, filename))

In [None]:
# Load the data set and print its column dtypes and non-null counts.
housing = load_housing_data()
housing.info()

In [None]:
import numpy as np
# Fraction of rows whose median_house_value is at least 500000.
(housing['median_house_value'] >= 500000).mean()

In [None]:
# Number of rows in each ocean_proximity category.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns.
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# One 50-bin histogram per numeric column, laid out on a 20x15 inch figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

## II.11 Sample a test set, put it aside, and never look at it

In [None]:
import numpy as np
# Bucket the median house value into bands of 100,000 (1.0, 2.0, ...) so the
# train/test split can be stratified on it.
housing["value_cat"] = np.ceil(housing["median_house_value"] / 100000)
# Merge every band from 5 upwards into a single top band (5.0). The result is
# assigned back explicitly: an in-place update on the selected column is a
# chained assignment and does not reach `housing` under pandas Copy-on-Write.
housing["value_cat"] = housing["value_cat"].where(housing["value_cat"] < 5, 5.0)

In [None]:
# Histogram of the value_cat bands, without the kernel density curve.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 (histplot is the
# replacement) — confirm the installed seaborn version before switching.
sns.distplot(housing["value_cat"],kde=False)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# One stratified 80/20 split on value_cat; the fixed seed keeps it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["value_cat"]))
# split() yields positional row numbers, so select with .iloc; label-based
# .loc only agrees while the index is the default 0..n-1 range.
# .copy() makes both sets independent frames, so later in-place edits on them
# neither warn nor depend on `housing`.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
def check_sample(df):
    """Return the share of rows of *df* in each ``value_cat`` category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``value_cat`` column.

    Returns
    -------
    pandas.Series
        Category counts divided by the total number of rows in *df*,
        indexed by category value.
    """
    return df["value_cat"].value_counts() / len(df)

In [None]:
# Shape (rows, columns) of the full data set and of each split, side by side.
# The frames are referenced directly rather than looked up by name in globals().
pd.DataFrame({
    "housing": housing.shape,
    "strat_train_set": strat_train_set.shape,
    "strat_test_set": strat_test_set.shape,
}).sort_index()

In [None]:
# Share of each value_cat band in the full data set and in each split; similar
# columns indicate that the stratification preserved the band proportions.
# The frames are referenced directly rather than looked up by name in globals().
pd.DataFrame({
    "housing": check_sample(housing),
    "strat_train_set": check_sample(strat_train_set),
    "strat_test_set": check_sample(strat_test_set),
}).sort_index()

Remove the helper column `value_cat` from the training and test sets; it was only needed for the stratified split.

In [None]:
# Drop the helper column from both splits in place.
# The loop variable is deliberately not called `set`: at notebook top level
# that would rebind the builtin for the rest of the session.
for split_frame in (strat_train_set, strat_test_set):
    split_frame.drop(["value_cat"], axis=1, inplace=True)

In [None]:
def save_housing_data(project_path="..",
                      train_set=strat_train_set,
                      test_set=strat_test_set):
    """Write the training and test sets as CSV files under ``<project_path>/data``.

    The default frames are bound once, when this function is defined.

    Parameters
    ----------
    project_path : str
        Project root; the files go into its ``data`` sub-directory, which is
        created if it does not exist.
    train_set : pandas.DataFrame
        Frame written to ``train_set.csv``.
    test_set : pandas.DataFrame
        Frame written to ``test_set.csv``.

    Returns
    -------
    list of str
        Paths of the written files, training set first.
    """
    housing_path = os.path.join(project_path, 'data')
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)

    # Explicit (file stem, frame) pairs instead of fetching the arguments by
    # name through locals().
    data_sets = (("train_set", train_set), ("test_set", test_set))
    filepaths = []
    for stem, frame in data_sets:
        csv_path = os.path.join(housing_path, "{}.csv".format(stem))
        filepaths.append(csv_path)
        frame.to_csv(csv_path)
    return filepaths

In [None]:
# Write both splits with the default settings; the cell output is the list of
# file paths returned by the function.
save_housing_data()

In [None]:
# Preview the first rows of the full data set.
housing.head()