In [1]:
import os
import tarfile
import urllib

In [None]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [None]:
HOUSING_PATH

In [None]:
HOUSING_URL

In [None]:
def fetch_housing_data(housing_url, housing_path):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [None]:
fetch_housing_data(HOUSING_URL, HOUSING_PATH)

In [None]:
import pandas as pd

def load_housing_data(housing_path):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [None]:
housing = load_housing_data(HOUSING_PATH)

In [None]:
housing

In [None]:
housing.info()

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
housing.describe()

In [None]:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
import numpy as np

In [None]:
len(housing)

In [None]:
np.random.permutation(len(housing))

In [None]:
int(120.7)

In [None]:
a = [1, 2, 3, 5, 6, 7]
a[2:]

In [None]:
def split_train_test(data, test_size, random_state):
    np.random.seed(random_state)
    shuffled_indicies = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_size)
    test_indices = shuffled_indicies[:test_set_size]
    train_indices = shuffled_indicies[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
train_set, test_set = split_train_test(housing, 0.2, 10)

In [None]:
train_set, test_set = split_train_test(housing, 0.2, 4)

In [None]:
train_set

In [None]:
train_set

In [None]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=10)

In [None]:
train_set

In [None]:
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

In [None]:
housing["income_cat"].hist()

In [None]:
train_set, test_set = split_train_test(housing, 0.2, 10)

In [None]:
train_set["income_cat"].value_counts(normalize=True)

In [None]:
test_set["income_cat"].value_counts(normalize=True)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
split.split

In [None]:
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [None]:
strat_train_set["income_cat"].value_counts(normalize=True)

In [None]:
strat_test_set["income_cat"].value_counts(normalize=True)

In [None]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [None]:
strat_test_set

In [None]:
housing = strat_train_set.copy()

In [None]:
housing

# Visualizations

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"]/100,
    label="population",
    figsize=(10,7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

# plt.legend()

# Correlations

In [None]:
housing.head(1)

In [None]:
corr_matrix = housing.corr()

In [None]:
corr_matrix

In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age"
]

scatter_matrix(housing[attributes], figsize=(12, 8));

In [None]:
housing.plot(
    kind="scatter",
    x="median_income",
    y="median_house_value",
    alpha=0.1
)

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
housing

In [None]:
housing_labels

In [None]:
housing.info()

# Filling nans

In [None]:
median = housing["total_bedrooms"].median()
median

In [None]:
housing["total_bedrooms"].fillna(median, inplace=True)

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
housing_num = housing.drop("ocean_proximity", axis=1)

In [None]:
housing_num

In [None]:
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
X = imputer.transform(housing_num)

In [None]:
housing_tr = pd.DataFrame(
    X,
    columns=housing_num.columns,
    index=housing_num.index
)

# Handling text and categorical data

In [None]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

In [None]:
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [None]:
housing_cat_encoded

In [None]:
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()

In [None]:
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [None]:
housing_cat.value_counts()

In [None]:
housing

# Custom fit_transform()

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self # nothing else to do
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)

housing_extra_attribs = attr_adder.transform(housing.values)

# Pipelines

In [3]:
import pandas as pd
import numpy as np
import os

HOUSING_PATH = os.path.join("datasets", "housing")

def load_housing_data(housing_path):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [4]:
housing = load_housing_data(HOUSING_PATH)

In [51]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN


In [5]:
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [7]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [8]:
housing_num = housing.drop("ocean_proximity", axis=1)
housing_cat = housing[["ocean_proximity"]]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [None]:
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [10]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [11]:
housing_prepared.shape

(16512, 16)

In [12]:
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

# Choose models

In [13]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [14]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

In [15]:
some_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN


In [16]:
some_data_prepared = full_pipeline.transform(some_data)

In [17]:
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

Predictions: [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]
Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [18]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68627.87390018745

In [19]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

In [20]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [21]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

In [22]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18701.79527419818

# Cross-validation

https://www.kaggle.com/code/satishgunjal/tutorial-k-fold-cross-validation

In [23]:
from sklearn.model_selection import cross_val_score

In [33]:
def display_score(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [36]:
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10
)

In [30]:
lin_scores

array([-5.14989425e+09, -4.11073216e+09, -4.59293165e+09, -4.71078941e+09,
       -4.46840655e+09, -5.26031619e+09, -5.47556792e+09, -4.73376148e+09,
       -4.41471057e+09, -4.91959144e+09])

In [29]:
lin_rmse_score = np.sqrt(-lin_scores)

In [34]:
display_score(lin_rmse_score)

Scores: [71762.76364394 64114.99166359 67771.17124356 68635.19072082
 66846.14089488 72528.03725385 73997.08050233 68802.33629334
 66443.28836884 70139.79923956]
Mean: 69104.07998247063
Standard deviation: 2880.328209818062


In [37]:
tree_scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10
)

In [38]:
tree_rmse_score = np.sqrt(-tree_scores)

In [39]:
display_score(tree_rmse_score)

Scores: [73174.77296326 72188.03114167 70146.0889006  71707.12536948
 69866.88949369 74901.59886376 70061.3240506  73862.89330448
 69125.24059195 70829.45181301]
Mean: 71586.3416492498
Standard deviation: 1818.2346888906043


In [40]:
%%time
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10
)

Wall time: 1min 32s


In [42]:
forest_rmse_scores = np.sqrt(-forest_scores)
display_score(forest_rmse_scores)

Scores: [51339.96928458 49132.05112138 46770.87000138 52075.44399994
 47446.12824857 52029.19403307 52356.40061625 50046.31090417
 48754.08949545 53860.48940834]
Mean: 50381.09471131214
Standard deviation: 2201.8537386132675


In [43]:
import joblib
joblib.dump(forest_reg, "forest_reg.pkl")

['forest_reg.pkl']

# Fine-tuning

In [44]:
from sklearn.model_selection import GridSearchCV

In [46]:
param_grid = [
    {
        'n_estimators': [3, 10, 30, 50],
        'max_features': [2, 4, 6, 8, 10],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10, 30, 50],
        'max_features': [2, 4, 6, 8, 10],
    }
]

In [47]:
forest_reg = RandomForestRegressor()

In [49]:
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True
)

In [50]:
%%time
grid_search.fit(housing_prepared, housing_labels)

Wall time: 3min 57s


In [52]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 6, 'n_estimators': 50}

In [53]:
cvres = grid_search.cv_results_

In [54]:
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63237.746713051165 {'max_features': 2, 'n_estimators': 3}
55431.288284071896 {'max_features': 2, 'n_estimators': 10}
52628.416480959866 {'max_features': 2, 'n_estimators': 30}
52114.79371055696 {'max_features': 2, 'n_estimators': 50}
61204.25476752402 {'max_features': 4, 'n_estimators': 3}
52507.60969341161 {'max_features': 4, 'n_estimators': 10}
50313.158513194205 {'max_features': 4, 'n_estimators': 30}
49962.15952134665 {'max_features': 4, 'n_estimators': 50}
59473.19769210899 {'max_features': 6, 'n_estimators': 3}
52419.09618134294 {'max_features': 6, 'n_estimators': 10}
50065.37211429619 {'max_features': 6, 'n_estimators': 30}
49527.47701913644 {'max_features': 6, 'n_estimators': 50}
59475.50056454915 {'max_features': 8, 'n_estimators': 3}
52173.88850833664 {'max_features': 8, 'n_estimators': 10}
50214.42531720976 {'max_features': 8, 'n_estimators': 30}
49650.89058998007 {'max_features': 8, 'n_estimators': 50}
59188.934162394435 {'max_features': 10, 'n_estimators': 3}
52709.1053836

In [55]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.47632903e-02, 7.08251177e-02, 4.25352997e-02, 1.64131313e-02,
       1.60987723e-02, 1.74080051e-02, 1.58174591e-02, 3.27154367e-01,
       5.53897623e-02, 1.07903289e-01, 7.95243956e-02, 9.91004699e-03,
       1.59521351e-01, 6.16676055e-05, 3.01513728e-03, 3.65890747e-03])

In [59]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN


In [58]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3271543666565967, 'median_income'),
 (0.15952135110508273, 'INLAND'),
 (0.10790328947660735, 'pop_per_hhold'),
 (0.07952439560604309, 'bedrooms_per_room'),
 (0.07476329030596403, 'longitude'),
 (0.07082511767818388, 'latitude'),
 (0.05538976232647533, 'rooms_per_hhold'),
 (0.042535299652963994, 'housing_median_age'),
 (0.01740800512291266, 'population'),
 (0.016413131343829195, 'total_rooms'),
 (0.016098772308688454, 'total_bedrooms'),
 (0.015817459070485963, 'households'),
 (0.009910046989117469, '<1H OCEAN'),
 (0.003658907474312689, 'NEAR OCEAN'),
 (0.0030151372772048834, 'NEAR BAY'),
 (6.166760553165882e-05, 'ISLAND')]

In [60]:
final_model = grid_search.best_estimator_

In [62]:
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

In [63]:
strat_test_set.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5241,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,500001.0,<1H OCEAN


In [64]:
X_test_prepared = full_pipeline.transform(X_test)

In [65]:
final_predictions = final_model.predict(X_test_prepared)

In [66]:
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [67]:
final_rmse

46951.89796345451

In [68]:
from scipy import stats
confidence = 0.95

squared_errors = (final_predictions - y_test) ** 2

In [69]:
np.sqrt(
    stats.t.interval(
        confidence,
        len(squared_errors) - 1,
        loc=squared_errors.mean(),
        scale=stats.sem(squared_errors)
    )
)

array([44921.04365839, 48898.47933609])

In [70]:
joblib.dump(final_model, "final_model.pkl")

['final_model.pkl']