In [3]:
import os
import tarfile
import urllib.request
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin


# ---- Downloading & fetching data -------

# Remote repository root, the dataset's path relative to that root (the same
# relative path is used as the default local directory), and the full URL of
# the compressed archive.
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
HOUSING_PATH = 'datasets/housing'
HOUSING_URL = f'{DOWNLOAD_ROOT}{HOUSING_PATH}/housing.tgz'

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded archive and its
        extracted contents. Created (including parents) when missing.

    Raises
    ------
    Whatever ``os.makedirs``, ``urllib.request.urlretrieve`` or ``tarfile``
    raise on filesystem, network or archive errors; nothing is swallowed.
    """
    # exist_ok avoids the check-then-create race of isdir() followed by makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, 'data_filter'):
            # Extraction filters are available on this interpreter: use the
            # 'data' filter so archive members cannot escape housing_path, and
            # so newer Pythons do not warn about an unspecified filter.
            housing_tgz.extractall(path=housing_path, filter='data')
        else:
            # Older interpreter without filter support: plain extraction.
            housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path = HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return the DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``.
    """
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

# Download and unpack the archive, then read the CSV into a DataFrame.
fetch_housing_data()
housing = load_housing_data()

# ----- Separating into train / test sets -----

# Target column on one side, every other column on the other.
y = housing['median_house_value']
X = housing.drop(columns='median_house_value')

# 'ocean_proximity' is routed to the categorical branch of the preprocessing;
# all remaining feature columns go to the numeric branch.
X_cat_attribs = ['ocean_proximity']
X_num_attribs = list(X.drop(columns=X_cat_attribs).columns)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Defining custom transformers ---
# Positional column indices used by CombinedAttributesAdder.transform.
# NOTE(review): these assume the incoming array has the rooms, bedrooms,
# population and households columns at positions 3-6 -- confirm against the
# column order of the data fed into the numeric pipeline.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: rooms / households and
    population / households. A third, bedrooms / rooms, is appended as well
    when ``add_bedrooms_per_room`` is true.

    The input must support 2-D positional indexing of the form ``X[:, i]``.
    """

    def __init__(self, add_bedrooms_per_room = True):
        # Stored under the same name as the constructor argument.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y = None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` with the derived ratio columns stacked on its right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room
        # np.c_ stacks the original array and the new columns side by side.
        return np.c_[(X, *extra_columns)]

# --- Running pipeline ------

# Numeric branch: fill missing values with the column median, append the
# derived ratio columns, then standardise the result.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode. handle_unknown='ignore' makes a category
# that was absent when the encoder was fitted encode as all zeros at transform
# time instead of raising, so a rare category missing from the training split
# cannot crash preprocessing of other data.
cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its branch and concatenate the outputs.
full_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipeline, X_num_attribs),
    ('cat_pipeline', cat_pipeline, X_cat_attribs),
])

# Fit the preprocessing on the training split only; the test split is
# transformed with what was learned from the training data.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)


# ---Linear Regression ---
# Fit a linear model on the prepared training data and report the value of
# its score() method on the training split and on the held-out test split.
LinRegModel = LinearRegression()
LinRegModel.fit(X_train_prepared, y_train)

T_LR_Score = LinRegModel.score(X_train_prepared, y_train)
Test_LR_Score = LinRegModel.score(X_test_prepared, y_test)

print(f'Training Score for Linear Regression Model: {T_LR_Score}')
print(f'Test Score for Linear Regression Model: {Test_LR_Score}')

# --- Decision Trees ----
# Fit the tree on the full training split; this fitted model is the one
# scored on the test split below.
Tree_reg = DecisionTreeRegressor()
Tree_reg.fit(X_train_prepared, y_train)

# 10-fold cross-validation over the training split, scored with r^2.
scores = cross_val_score(Tree_reg, X_train_prepared, y_train,
                         scoring='r2', cv=10)

# enumerate replaces the hand-maintained counter. The label no longer builds
# ordinals by appending "th" (which printed "1th", "2th", "3th") and names the
# values as cross-validation scores rather than training scores.
for fold, score in enumerate(scores, start=1):
    print(f'Fold {fold} cross-validation r^2 score: {score:.4f}')

Test_score = Tree_reg.score(X_test_prepared, y_test)
print('Test Score for Decision Tree Model: {}'.format(Test_score))

# --- Random Forest ---
# Fit a random forest on the prepared training data and report the value of
# its score() method on the training split and on the held-out test split.
Rand_forest_regressor = RandomForestRegressor()
Rand_forest_regressor.fit(X_train_prepared, y_train)

T_RF_Score = Rand_forest_regressor.score(X_train_prepared, y_train)
Test_RF_Score = Rand_forest_regressor.score(X_test_prepared, y_test)

print(f'Training Score for Random forest: {T_RF_Score}')
print(f'Test Score for Random forest: {Test_RF_Score}')


  housing_tgz.extractall(path=housing_path)


Training Score for Linear Regression Model: 0.6582199160539229
Test Score for Linear Regression Model: 0.5966539537632211
1th fold Training r^2 score: 0.7001
2th fold Training r^2 score: 0.6493
3th fold Training r^2 score: 0.6353
4th fold Training r^2 score: 0.5957
5th fold Training r^2 score: 0.5994
6th fold Training r^2 score: 0.6762
7th fold Training r^2 score: 0.6554
8th fold Training r^2 score: 0.6428
9th fold Training r^2 score: 0.6763
10th fold Training r^2 score: 0.6537
Test Score for Decision Tree Model: 0.5994802887564741
Training Score for Random forest: 0.9743629590519244
Test Score for Random forest: 0.808720265073133
