In [None]:
print("Hello world!")

In [None]:
#fetching and extracting housing data
import os
import tarfile
import urllib
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = "https://github.com/ageron/handson-ml2/raw/3df90fde06c564a0968df305e83bfa9fd53f667e/datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tarball to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files; it is created if it does not exist yet.

    Returns:
        None. The archive and its contents are written to disk.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # Use the safe "data" extraction filter on Python versions that provide it;
    # older interpreters fall back to the previous behaviour.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path, **extract_kwargs)

In [None]:
fetch_housing_data()

In [None]:
import pandas as pd

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Return the contents of ``housing.csv`` under ``housing_path`` as a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [None]:
housing = load_housing_data()

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize = (20, 15))
plt.show()

In [None]:
#makes a purely random 80/20 training/test split of the data (test_size=0.2)
#random_state fixes the shuffle so the same rows end up in the test set on every run
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state = 1)

In [None]:
#bucket median_income into five labelled categories (1-5) so a split can be stratified on income
#bin edges are 0, 1.5, 3.0, 4.5, 6.0 and +infinity
import numpy as np
housing["income_cat"] = pd.cut(housing['median_income'], 
                              bins = [0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels = [1, 2, 3, 4, 5])

In [None]:
housing['income_cat'].hist()

In [None]:
#creates a stratum-weighted test and training set
#important because the sets become more representative of the data:
#each income_cat stratum keeps approximately the same share in both sets
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    #split() yields positional row indices, so select with .iloc; .loc would
    #only be correct while the index happens to be the default 0..n-1 range
    #.copy() gives independent frames that can later be modified in place
    strat_train_set = housing.iloc[train_index].copy()
    strat_test_set = housing.iloc[test_index].copy()

In [None]:
#find proportions per each income category
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
#drop the helper 'income_cat' column from both sets now that the split is done
#errors="ignore" keeps the cell safe to re-run: without it a second execution
#raises KeyError because the column has already been removed in place
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True, errors="ignore")

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
#figsize: size of the figure, alpha: point transparency, s: point size (here population/100),
#c: column used to color the points, cmap: color map, colorbar: show the color scale
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
)
plt.legend()

In [None]:
# checking correlations between columns
# note that if there is a non-linear correlation, this won't find it
# restrict to numeric columns first: pandas >= 2.0 raises on text columns
# such as ocean_proximity instead of silently skipping them
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
# diagonals are just histograms of the data instead

In [None]:
# zooming in on the correlation between median income and median house value
housing.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1)

In [None]:
# deriving ratio features (per household / per room) from the raw totals
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [None]:
# recompute the correlations so the derived ratio columns are included
# (interactive ">>>" prompts removed; numeric columns only, because
# pandas >= 2.0 raises on text columns such as ocean_proximity)
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# visualizing another decent correlation
housing.plot(kind="scatter", x="bedrooms_per_room", y="median_house_value",
alpha=0.1)

In [None]:
#create separate datasets for predictors and target values
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
#three ways of handling the missing values in "total_bedrooms"

#option 1 - drop the rows where the value is missing
#housing.dropna(subset=["total_bedrooms"])

#option 2 - drop the whole column
#housing.drop("total_bedrooms", axis=1)

#option 3 - fill missing values with the median (zero or the mean would also work)
#median = housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median, inplace=True)
#OR
#SimpleImputer does the same for every numeric column at once, but the categorical column must be dropped first (you could add it back later)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)

In [None]:
#fit_transform combines imputer.fit(housing_num) & X = imputer.transform(housing_num):
#fit computes each column's median (stored in imputer.statistics_) and
#transform returns a copy of housing_num as X with missing values replaced by those medians
X = imputer.fit_transform(housing_num)
#creates a pandas DataFrame from the NumPy array X, restoring the original column names and index
housing_tr = pd.DataFrame(X, columns = housing_num.columns, index=housing_num.index)

In [None]:
#these are the median values for each numerical predictor column
imputer.statistics_

In [None]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
#you can use just one encoder for multiple categorical columns because one encoder can store multiple category types
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
#ordinal encoding would be a bad idea here because it implies that categories closer to each other in number are more similar
ordinal_encoder.categories_

In [None]:
#use one-hot encoding instead (one binary column per category: 1 for the row's category, 0 everywhere else)
#one column becomes 5 columns
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
#the default format is a sparse matrix, which is more memory efficient b/c it stores only the locations of nonzeros
#to view it as a dense NumPy array (not recommended for storage), do this:
housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_

In [None]:
#reference page 107 and 108 to create your own transformations

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# positional column indices of the raw attributes in the array given to transform()
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from fixed column positions.

    Adds rooms-per-household and population-per-household columns and,
    when ``add_bedrooms_per_room`` is true, a bedrooms-per-room column.
    Columns are located via the module-level ``*_ix`` indices.
    """
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self # nothing else to do
    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Args:
            X: 2-D array-like whose columns follow the ``*_ix`` layout.
            y: Ignored.

        Returns:
            A NumPy array with two or three extra columns.
        """
        # Coerce to ndarray so positional slicing also works when a DataFrame
        # is passed in; ndarray input is returned unchanged by asarray.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
# try the transformer on its own, without the bedrooms_per_room column;
# housing.values passes the data as a NumPy array because transform() indexes columns by position
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
#it's good to scale your predictors similarly. 
#use MinMaxScaler transformer to scale (normalize) your values from 0 to 1
#use the StandardScaler transformer to standardize your data around zero

In [None]:
#example of transformation pipeline:
#the steps run in order, each one feeding the next: fill missing values with the median,
#append the combined ratio attributes, then standardize every column
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')), 
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
#putting a pipeline and transformer into one overall pipeline
#the goal is to transform numerical and categorical (and/or other kinds of) data in one step
from sklearn.compose import ColumnTransformer

#list() of a DataFrame gives its column names, i.e. every column of housing_num
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

#each entry is (name, transformer, columns that transformer is applied to)
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs), 
    ('cat', OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
#ColumnTransformer lets you treat each column differently if you wish
#in place of a transformer, use "drop" to drop columns or "passthrough" to leave columns as they are
#by default, columns that are not listed are dropped
#set the remainder hyperparameter (to a transformer or "passthrough") to treat these remaining columns differently

In [None]:
#Now we train a linear regression model to see if it fits the data

from sklearn.linear_model import LinearRegression 

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
#trains the linear model on the whole training set

In [None]:
#testing the model predictions on the first 5 entries of the training set
some_data = housing.iloc[:5]
#use .iloc for the labels too so both selections are explicitly positional
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Prediction:", lin_reg.predict(some_data_prepared))
#gives the predicted y value for the first 5 datapoints of the training set

In [None]:
print("Labels:", list(some_labels))
#now we see the actual y values associated with those 5 data points and compare

In [None]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
#this value (RMSE on the training set) represents the typical prediction error
#the model is underfitting the data
#3 options to improve: create a more powerful model, reduce constraints, feed the training algorithm better features

In [None]:
from sklearn.tree import DecisionTreeRegressor

#fixed seed so the fitted tree, and every score derived from it, is reproducible across runs
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
#keep the MSE and RMSE under separate names (same convention as lin_mse / lin_rmse)
#instead of storing the MSE in a variable called tree_rmse
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
#the model is showing absolutely no error at all, so we need to run some more tests to make sure it's not overfitting
#cross validation with the training data
#could use train_test_split() to carve a validation set out of the training set (not the test set)
#better option is to use k-fold cross-validation (cv=10: split into 10 folds and gain 10 eval scores)
#the scoring is the negative MSE, hence the sign flip before taking the square root
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, 
                        scoring = 'neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)


In [None]:
def display_scores(scores):
    """Print the scores, then their mean and their standard deviation, one per line."""
    print("Scores:", scores)
    for label, stat_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        # look each statistic up by name and call it right before printing
        print(label, getattr(scores, stat_name)())
display_scores(tree_rmse_scores)
#Now we see that this is even worse than linear reg model

In [None]:
#do the same error evaluation for the linear model to compare it to the decision tree model
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                            scoring = "neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
#The decision tree model overfits the data so badly that the linear model actually performs better

In [None]:
from sklearn.ensemble import RandomForestRegressor
#fixed seed so the forest and its cross-validation scores are reproducible across runs
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                               scoring = 'neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)