### Prepare Data for ML Algorithms

#### 1. Load Data and Split (as section 1)

In [1]:
#load data
import os
import pandas as pd

# Default folder that holds the housing CSV, relative to the notebook.
HOUSING_PATH = os.path.join("datasets", "housing")

def load_housing_data(housing_path=HOUSING_PATH):
    """Load the housing dataset.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``housing.csv``.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Read the raw housing table from the default location (datasets/housing/housing.csv).
housing = load_housing_data()

In [2]:
#split train and test (with stratified sampling)
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Bucket the median income into 5 categories so the split can be stratified on it.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# A single stratified 80/20 split; random_state pins the shuffle for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row numbers, so select with .iloc. Label-based
    # .loc gives the same rows only while the index is the default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [3]:
#clean unnecessary data: "income_cat" was only needed to stratify the split
# (reassign instead of drop(..., inplace=True) so the split subsets are not mutated in place)
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

#separate predictors from the labels ("median_house_value")
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# same separation for the test set (the original `strat_test_set.[...]` was a syntax error)
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

#### 2. Data Cleaning

For example, the `total_bedrooms` attribute has some missing values (see section 1). We have three options:
- get rid of the corresponding districts (rows)
- get rid of the whole attribute (column)
- set the missing values to some value (zero, the mean, the median, etc.)

In [4]:
# pandas methods: each call below returns a NEW object and leaves `housing` untouched.
# Options 1 and 2 are shown for illustration only; their results are discarded.
housing.dropna(subset=["total_bedrooms"])   #option 1: drop the districts (rows) with a missing value
housing.drop("total_bedrooms", axis=1)      #option 2: drop the whole attribute (column)
median = housing["total_bedrooms"].median() #option 3 --> calc the training median; keep it to apply the same fill to the test set and new data
# Assign the filled column back. Calling fillna(inplace=True) on the selected
# column is a chained in-place operation: it is deprecated and does not update
# `housing` when pandas Copy-on-Write is enabled.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [5]:
# sklearn method:
from sklearn.impute import SimpleImputer

# copy all numerical data to a new variable (everything except the text column)
housing_num = housing.drop(columns="ocean_proximity")

# Imputer is an Estimator: fit() learns the median of every numerical attribute
# and returns the fitted imputer itself
imputer = SimpleImputer(strategy="median").fit(housing_num)

# the learned medians are stored in statistics_ (= housing_num.median().values)
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [6]:
# Imputer is also a Transformer: transform() replaces the missing values with the
# "learned" medians and returns a NumPy array holding the transformed features
X = imputer.transform(housing_num)

# put the array back into a pandas DataFrame, restoring column labels and row index
housing_tr = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)
X.shape

(16512, 8)

#### 3. Handling Text and Categorical Attributes

In [7]:
# take a look at the only text/non-numerical attribute: ocean_proximity
# (the double brackets keep the selection a one-column DataFrame rather than a Series)
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
17606,<1H OCEAN
18632,<1H OCEAN
14650,NEAR OCEAN
3230,INLAND
3555,<1H OCEAN
19480,INLAND
8879,<1H OCEAN
13685,INLAND
4937,<1H OCEAN
4861,<1H OCEAN


Observe that the "ocean_proximity" attribute doesn't contain arbitrary text, but a limited number of possible values.\
--> this is a categorical attribute\
--> its values can be converted to numbers for ML algorithms

In [8]:
from sklearn.preprocessing import OrdinalEncoder
# replace every category with a numeric code; fit_transform learns the
# categories and encodes the column in one step
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]   # first 10 encoded rows

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [9]:
ordinal_encoder.categories_   # fitted attribute: list holding one array of categories per encoded column

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

HOWEVER, representing the categories with the numbers 0 to 4 will cause ML algorithms to assume that two nearby values are more similar than two distant values, which is not the case here.\
--> use one-hot vectors instead

In [10]:
from sklearn.preprocessing import OneHotEncoder
# one binary column per category; exactly one of them is 1 ("hot") in each row
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot     # SciPy sparse matrix: stores only the locations of the non-zero elements

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [11]:
housing_cat_1hot.toarray()   # convert the sparse matrix to a dense NumPy array

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [12]:
cat_encoder.categories_    # fitted attribute: list holding one array of categories per encoded column

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

There are more considerations in converting categorical attributes; see the chapters about ***representation learning*** for more details.

#### 4. Feature Scaling

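two common ways:
1) _min-max scaling_ (_normalization_): subtract the min and divide by (max - min), so values end up in the range 0 to 1 --> sensitive to outliers\
2) _standardization_: subtract the mean and divide by the standard deviation, giving zero mean and unit variance --> no fixed range, but much less affected by outliers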

#### 5. Data Transformation Pipeline

In [13]:
# custom transformation (for completion of following transformation pipeline)
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions used by CombinedAttributesAdder.transform().
# NOTE(review): presumably the positions of total_rooms, total_bedrooms,
# population and households in the numerical columns -- confirm against the column order.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio attributes as extra columns.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third column (bedrooms divided by rooms) is appended
        after rooms-per-household and population-per-household.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must be a 2-D array
        (not a DataFrame).
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        # Always added: rooms per household, then population per household.
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        # np.c_ stacks the original array and the new columns side by side.
        return np.c_[(X, *extra_columns)]

# Try the transformer on its own, without the optional bedrooms_per_room column.
# transform() indexes with X[:, i], so pass the underlying array (.values) rather than the DataFrame.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [14]:
# Transformation Pipeline for numerical attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Steps run in the listed order: fill missing values with the median,
# append the combined ratio attributes, then standardize every column.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# fit every step on the numerical training data and return the transformed result
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [15]:
# Transformation Pipeline for all attributes
from sklearn.compose import ColumnTransformer  # allow great flexibility dealing with different attributes

# Column names routed to each sub-transformer.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

# Numerical columns go through num_pipeline, the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit on the training set only; the test set is transformed with what was learned there.
housing_prepared = full_pipeline.fit_transform(housing)
X_test_prepared = full_pipeline.transform(X_test)

In [16]:
#save for further use
import pickle

# Each file stores a (labels, prepared_features) tuple, in that order.
for pickle_path, payload in (
    ('datasets/train.pickle', (housing_labels, housing_prepared)),
    ('datasets/test.pickle', (y_test, X_test_prepared)),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)