# <center>Machine Learning Project - Valuation of Houses in California</center>

---

## Step 1 - Get the data

In [None]:
import pandas as pd
import numpy as np
import hashlib

### Loading data

In [None]:
# Location of the housing CSV, given relative to the current working directory.
HOUSING_DATA_PATH = "./data/housing.csv"
# Read the CSV into the `housing` DataFrame that the following cells work on.
housing = pd.read_csv(HOUSING_DATA_PATH)

### Data structure lookup

In [None]:
housing.head()  # show the first 5 rows (the default for head) for a quick look at the columns

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
housing.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing.describe()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw one 50-bin histogram per numeric column on a 20x15 inch figure.
housing.hist(bins=50, figsize=(20, 15))
plt.show()


### Test set creation

In [None]:
def split_train_test(data, test_ratio, random_state=None):
    """
    Randomly split a dataset into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to split; rows are selected positionally with ``.iloc``.
    test_ratio : float
        Fraction of rows (between 0 and 1 inclusive) placed in the test set.
        The test set size is ``int(len(data) * test_ratio)``.
    random_state : int or None, optional
        Seed for the shuffle. When given, the split is reproducible across
        calls and kernel restarts. When ``None`` (the default) the global
        NumPy random state is used, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        # Legacy path: honours a prior np.random.seed(...) call by the caller.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # Dedicated generator so the split does not depend on global state.
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """
    Split ``data`` into train and test parts based on a per-row identifier.

    Each value in ``data[id_column]`` is passed to ``test_set_check`` together
    with ``test_ratio`` and ``hash``; rows for which it returns True form the
    test set, the remaining rows form the train set.

    Returns a ``(train_set, test_set)`` tuple.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio, hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# Split the dataset into train and test sets using an id built from each row's coordinates.
location_id = housing["longitude"] * 1000 + housing["latitude"]  # new attribute based on longitude and latitude
housing_with_id = housing.reset_index().assign(id=location_id)
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")  # splitting the dataset

**STRATIFIED SAMPLING**\
The dataset is first divided into homogeneous groups (strata), and samples are then drawn separately from each group, so that the test set keeps the same group proportions as the full dataset. This way of sampling helps to avoid sampling bias.

In [None]:
# Create a new categorical attribute that represents the income group:
# median_income is divided by 1.5 and rounded up, then every category of 5 or
# more is merged into category 5.0.
income_cat = np.ceil(housing["median_income"] / 1.5)
# Assign the capped result back explicitly. Calling .where(..., inplace=True) on
# housing["income_cat"] modifies a temporary Series, which pandas warns about
# and which does not update `housing` when Copy-on-Write is enabled.
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified sampling: one 80/20 split that keeps the 'income_cat' proportions,
# with a fixed random_state so the split is the same on every run.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
# n_splits is 1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional row indices, so select with .iloc rather than the
# label-based .loc (the two only agree when the index is the default 0..n-1).
# .copy() makes both sets independent frames that can be edited freely later.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
# Check the share of each income category in the full dataset.
income_cat_counts = housing["income_cat"].value_counts()
income_cat_counts / len(housing)

In [None]:
# Drop the helper 'income_cat' attribute from the stratified train and test sets.
# errors="ignore" keeps this cell safe to re-run: a second run finds the column
# already removed and would otherwise raise KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

---

## Step 2 - Discover and visualize data