# Importing the main libraries
**Other imports are added as required**

In [None]:
import tarfile
import urllib
import urllib.request
from pathlib import Path as path

import matplotlib.pyplot as plt
import seaborn as sns

# Defining main environment variables and locations

In [None]:
# Identifiers and locations shared by the rest of the notebook.
PROJECT_ROOT_DIR = '.'
CHAPTER_ID = 'end_to_end_project'

# Remote repository that the dataset archive and the map image are fetched from.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

# Local directories for the dataset and for saved figures.
HOUSING_PATH = path("datasets") / "housing"
IMAGES_PATH = path(PROJECT_ROOT_DIR) / 'images' / CHAPTER_ID

# Preparing graph functions and settings

In [None]:
sns.set_style('darkgrid')  # available styles: darkgrid, whitegrid, dark, white and ticks

# Default font sizes applied to every figure drawn after this cell.
plt.rcParams.update({
    'axes.titlesize': 18,    # axes title
    'axes.labelsize': 14,    # x and y axis labels
    'xtick.labelsize': 13,   # x tick labels
    'ytick.labelsize': 13,   # y tick labels
    'legend.fontsize': 13,   # legend entries
    'font.size': 13,         # default text size
})

def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300, overwrite=False):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    The file is named ``<fig_id>.<fig_extension>``. If that file already
    exists it is left untouched unless ``overwrite`` is true. A message is
    printed in both cases.

    Args:
        fig_id: File name of the figure, without extension.
        tight_layout: Call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Passed to ``savefig`` as ``dpi``.
        overwrite: Save even if the target file already exists.
    """
    file_name = f'{fig_id}.{fig_extension}'
    target = path(IMAGES_PATH, file_name)

    # Guard clause: keep an existing figure unless explicitly told otherwise.
    if target.exists() and not overwrite:
        print(f'{file_name} already exists in {IMAGES_PATH}')
        return

    print(f'Saving figure {fig_id} in {IMAGES_PATH}')
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:
IMAGES_PATH.mkdir(parents=True,exist_ok=True)

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive.
        housing_path: Directory (``str`` or ``Path``) that receives the
            archive and its extracted contents. Created, including parent
            directories, when it does not exist.
    """
    # Normalise first so plain-string paths work too: the unbound
    # ``path.mkdir(housing_path, ...)`` form only worked for Path instances.
    housing_dir = path(housing_path)
    housing_dir.mkdir(parents=True, exist_ok=True)

    tgz_path = housing_dir / "housing.tgz"
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_dir)

In [None]:
# Only download the archive when the extracted CSV is not on disk yet
if not (path(HOUSING_PATH) / 'housing.csv').exists():
    fetch_housing_data()
else:
    print('Data is already downloaded')

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(path(housing_path) / "housing.csv")

In [None]:
housing = load_housing_data()
housing.head()

In [None]:
housing.info()

Data consists of 10 columns - 9 numerical and 1 categorical

In [None]:
housing.ocean_proximity.value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

# Creating an income category attribute

In [None]:
import numpy as np

housing["income_cat"] = pd.cut(housing['median_income'],
                                bins=[0.,1.5,3.,4.5,6.,np.inf],
                                labels=[1,2,3,4,5])

In [None]:
housing["income_cat"].hist()

In [None]:
housing.columns

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on income_cat so the split is balanced across the income categories.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row numbers, so select with .iloc. Label-based
# .loc only returned the same rows while the index was the default 0..n-1 range.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
stratified = strat_test_set["income_cat"].value_counts()/len(strat_test_set)
stratified

## Comparing to original housing data

In [None]:
original = housing["income_cat"].value_counts()/len(housing)
original

### Random vs Stratified vs Original

In [None]:
from sklearn.model_selection import train_test_split

random_test_set, random_train_set = train_test_split(housing,test_size=0.2, random_state=42)

In [None]:
random = random_test_set["income_cat"].value_counts()/len(housing)
random

In [None]:
# NOTE(review): `columns` is not passed to the DataFrame below, so the column
# order comes from the dict keys, not from this list.
columns=["Random","Original","Stratified"]
# One column of income-category proportions per series, aligned on the category index.
summary = pd.DataFrame({"Original":original,"Random":random,"Stratified":stratified})

In [None]:
summary

In [None]:
summary["Rand. % error"]=((summary.Original-summary.Random)/summary.Original)*100
summary

In [None]:
summary["Strat. % error"]=((summary.Original-summary.Stratified)/summary.Original)*100
summary.sort_index(axis=0)

### Delete dataframes and series created to compare test/train splits

In [None]:
del random, original, stratified, summary

### Remove the income_cat attribute so that the data goes back to its original format

In [None]:
# Restore the original column layout by dropping income_cat in place from both sets
for split_df in (strat_train_set, strat_test_set):
    split_df.drop(columns="income_cat", inplace=True)

# Data Visualisation

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind='scatter',x='longitude',y='latitude')
save_fig('bad_visualisation_plot')

In [None]:
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.3,figsize=(15,10))
save_fig('better_visualisation_plot')

In [None]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.3,
            s=housing['population']/100, label='population', figsize=(15,10),
            c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True, sharex=False)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('California', size=20)
plt.legend()
save_fig('housing_prices_scatterplot')

### Code from github notebook - Adding Californian image

In [None]:
# Download the california image
images_path = path(PROJECT_ROOT_DIR,'images',CHAPTER_ID)
# parents=True so the nested images/<chapter> directory is created even when
# 'images' itself does not exist yet.
images_path.mkdir(parents=True, exist_ok=True)
filename = 'california.png'
print(f'Downloading {filename}')
url = DOWNLOAD_ROOT + 'images/end_to_end_project/' + filename
urllib.request.urlretrieve(url,path(images_path,filename))

In [None]:
# Draw the housing scatter plot on top of the downloaded California image.
import matplotlib.image as mpigm
california_img = mpigm.imread(path(images_path,filename))
# Marker size scales with population; marker colour encodes median_house_value.
# The automatic colorbar is disabled here because one is built manually below.
ax = housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.3,
                    s=housing['population']/100, label='Population',
                    figsize=(15,10), c='median_house_value', cmap=plt.get_cmap('jet'),
                    colorbar=False)#, sharex=False)

# extent places the image at [left, right, bottom, top] in axis coordinates,
# i.e. longitude/latitude for this plot.
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

# 11 evenly spaced price values from the minimum to the maximum house value.
prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(),prices.max(),11)
# Tick positions are the price values divided by the maximum price.
# NOTE(review): presumably needed because this colorbar follows the imshow
# image rather than the scatter - confirm.
cbar = plt.colorbar(ticks=tick_values/prices.max())
# Label each tick with the price in thousands of dollars.
cbar.ax.set_yticklabels([f"${round(v/1000)}k" for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=14)

plt.legend(fontsize=16)
save_fig('california_housing_prices_plot')
plt.show()

# Looking for correlations

In [None]:
# Correlate the numeric columns only: ocean_proximity holds text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include='number').corr()

### Check the correlation between our target and the other features

In [None]:
corr_matrix['median_house_value'].sort_values(ascending=False)

### Use Pandas scatter matrix to visualise the most promising attributes

In [None]:
from pandas.plotting import scatter_matrix

attributes = ['median_house_value','median_income','total_rooms','housing_median_age']

scatter_matrix(housing[attributes], figsize=(12,8), diagonal='hist')
plt.suptitle('Scatter Matrix of Promising Features')

save_fig('scatter_matrix_promising_features')

In [None]:
housing.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.15,figsize=(12,10))
plt.axis([0,16,0,550000])
plt.title('Median Income vs Median House Value')
save_fig('income_vs_house_value_scatterplot')

### Experimenting with Attribute Combinations

In [None]:
housing.columns

In [None]:
housing['rooms_per_household'] = (housing.total_rooms)/(housing.households)
housing['bedrooms_per_room'] = (housing.total_bedrooms)/(housing.total_rooms)
housing['population_per_household'] = (housing.population)/(housing.households)

In [None]:
# Correlate the numeric columns only: ocean_proximity holds text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include='number').corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
housing.plot(kind='scatter', y='median_house_value', x='bedrooms_per_room',alpha=0.15,figsize=(12,8))
save_fig('scatter_bedrooms_per_room_vs_median_house_value')

In [None]:
housing.plot(kind='scatter', y='median_house_value', x='population_per_household',alpha=0.15,figsize=(12,8))
plt.xlim((0,10))
save_fig('scatter_population_per_household_vs_median_house_value')

In [None]:
housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value", alpha=0.15, figsize=(12,8))
plt.axis([0, 5, 0, 520000])
save_fig('scatter_rooms_per_household_vs_median_house_value')

In [None]:
housing.describe()

# Prepare the Data for Machine Learning Algorithms

In [None]:
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

### Data Cleaning

#### Dealing with Missing Values

Exploring the 3 options from the book

To demonstrate each of them, let's create a copy of the housing dataset that keeps only the rows containing at least one null. It will then be easier to see exactly what each option does.

In [None]:
sample_incoplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incoplete_rows

Method #1 Get rid of districts that have missing values

In [None]:
# Pass subset as a list: a bare string label is only accepted by more recent
# pandas releases, while a list works everywhere.
sample_incoplete_rows.dropna(subset=['total_bedrooms'])

Method #2 Drop the whole column/attribute with missing values

In [None]:
sample_incoplete_rows.drop('total_bedrooms', axis=1)

Method #3 Fill missing values with either the median or the mean of the districts that have data

In [None]:
median = housing['total_bedrooms'].median()
# Assign the filled frame back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x
# and leaves the frame unchanged under copy-on-write.
sample_incoplete_rows = sample_incoplete_rows.fillna({'total_bedrooms': median})
sample_incoplete_rows

The scikit-learn method - The SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

In [None]:
housing.head()

In [None]:
housing_num = housing.drop('ocean_proximity',axis=1)
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
X = imputer.transform(housing_num)

Transform the training set

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns,index=housing.index)

In [None]:
housing_tr.loc[sample_incoplete_rows.index.values]

In [None]:
imputer.strategy

In [None]:
housing_tr = pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)

In [None]:
housing_tr.head()

# Handling Text and Categorical Attributes

In [None]:
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

### Ordinal Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

### One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# dtype is a constructor argument; fit_transform() does not accept it and
# raised a TypeError. The sparse/dense keyword is left at its default (sparse
# output) because it was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2; call .toarray() on the result for a dense array.
cat_encoder = OneHotEncoder(dtype=int)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_