# HOUSING PRICE PREDICTION

## GET THE DATA

In [None]:
import os
import tarfile
from six.moves import urllib

In [None]:
# Base of the raw-content host, the dataset's relative directory, and the
# full archive URL assembled from the two.
downloading_url = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
housing_path = "datasets/housing"
housing_url = "".join((downloading_url, housing_path, "/housing.tgz"))

In [None]:
def fetch_data(housing_url, housing_path):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Directory that receives both the downloaded
            ``housing.tgz`` and its extracted contents. It is created
            (including parents) when it does not exist yet.
    """
    os.makedirs(housing_path, exist_ok=True)
    # Join with a path separator so the archive is stored inside
    # housing_path instead of beside it under a fused name such as
    # "datasets/housinghousing.tgz".
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
# Download and unpack the archive using the module-level URL and directory.
fetch_data(housing_url, housing_path)

In [None]:
import pandas as pd

def load_data(housing_path=housing_path):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the housing CSV into a DataFrame and preview its first rows.
data = load_data()
data.head()

In [None]:
# Print pandas' summary of the DataFrame's columns.
data.info()

In [None]:
# Count how many rows carry each distinct ocean_proximity value.
data["ocean_proximity"].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Plot 50-bin histograms of the DataFrame's columns on a 20x15 figure.
data.hist(bins=50, figsize=(20, 15))
plt.show()

## SPLIT TRAIN - TEST DATASETS

In [None]:
import numpy as np

def traintestsplit(data, ratio):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: DataFrame to split (rows are selected positionally).
        ratio: Fraction of rows assigned to the test part; the count is
            truncated to an integer.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * ratio)
    # The first n_test shuffled positions form the test part, the rest train.
    test_part, train_part = np.split(order, [n_test])
    return data.iloc[train_part], data.iloc[test_part]

In [None]:
# Hold out 20% of the rows at random and report the size of each part.
train_set, test_set = traintestsplit(data, 0.2)
print(len(train_set), ":train , ", len(test_set), ":test")

In [None]:
import hashlib

def test_check(id, ratio, hash):
    """Return True when the hashed ``id`` falls inside the test fraction.

    The identifier is converted to ``np.int64``, hashed with ``hash``, and
    the last byte of the digest is compared against ``256 * ratio``.
    """
    digest = hash(np.int64(id)).digest()
    last_byte = digest[-1]
    return last_byte < 256 * ratio

def split_train_test_by_id(data, ratio, id_col, hash=hashlib.md5):
    """Split ``data`` into train and test parts based on a hash of each row's id.

    Args:
        data: DataFrame to split.
        ratio: Fraction threshold forwarded to ``test_check``.
        id_col: Name of the column holding the row identifiers.
        hash: Hash constructor forwarded to ``test_check``.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    test_mask = data[id_col].apply(lambda row_id: test_check(row_id, ratio, hash))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# reset_index() adds an "index" column, which serves as the row identifier
# for the hash-based split.
data_with_id = data.reset_index()
train_set, test_set = split_train_test_by_id(data_with_id, 0.2, "index")
print(len(train_set), ":train , ", len(test_set), ":test")

In [None]:
# Bucket median_income into categories by dividing by 1.5 and rounding up.
data["income"] = np.ceil(data["median_income"] / 1.5)
# Merge every category of 5 and above into 5.0. The result is assigned back
# to the column: calling where(..., inplace=True) on a column selected from
# the frame is a chained in-place update that pandas does not guarantee to
# propagate to `data` (it is ignored under Copy-on-Write).
data["income"] = data["income"].where(data["income"] < 5, 5.0)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split that keeps the income-category proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["income"]):
    # split() yields positional row numbers, so select with iloc; loc would
    # only give the same rows while the frame keeps its default 0..n-1 index.
    # Despite their names, these two variables hold DataFrames of rows.
    strat_train_index = data.iloc[train_index]
    strat_test_index = data.iloc[test_index]

In [None]:
# Proportion of all rows that falls into each income category.
data['income'].value_counts() / len(data)

In [None]:
# The income category was only needed for stratification; remove it from both
# splits. The loop variable is deliberately not called `set`, which would
# shadow the builtin for every later cell.
for strat_set in (strat_train_index, strat_test_index):
    strat_set.drop(["income"], axis=1, inplace=True)
    

## VISUALIZE THE DATA

In [None]:
# Rebind `data` to a copy of the stratified training rows, so later changes
# to `data` leave strat_train_index untouched.
data = strat_train_index.copy()

In [None]:
# Longitude on x and latitude on y, so the scatter is oriented like a map.
data.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
# Same geographic scatter (longitude on x, latitude on y, as on a map) with a
# low alpha so that densely populated areas stand out.
data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Marker size follows population / 100 and marker colour follows
# median_house_value (jet colormap). Longitude goes on x and latitude on y so
# the plot is oriented like a map.
data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
          s=data["population"] / 100, label="population",
          c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)

plt.legend()

In [None]:
from pandas.plotting import scatter_matrix
# Plot every pairwise combination of the four selected columns.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(data[attributes], figsize=(12, 8))

In [None]:
# Scatter median_house_value against median_income with low-opacity markers.
data.plot(kind="scatter", x = "median_income", y = "median_house_value", alpha =0.1)

## ADDING NEW FEATURES

In [None]:
# Derive ratio columns from the raw totals: rooms and population per
# household, and bedrooms per room.
data["rooms_per_household"] = data["total_rooms"] / data["households"]
data["bedrooms_per_room"] = data["total_bedrooms"] / data["total_rooms"]
data["population_per_household"] = data["population"] / data["households"]

In [None]:
# Separate predictors and target. Both are taken from strat_train_index, so
# the ratio columns added to `data` above are not part of `housing`.
housing = strat_train_index.drop("median_house_value", axis = 1)
housing_labels = strat_train_index["median_house_value"].copy()

## HANDLE MISSING VALUES

In [None]:
from sklearn.impute import SimpleImputer

# Create an instance of SimpleImputer with the median strategy
imputer = SimpleImputer(strategy="median")

# Drop the ocean_proximity column, then fit the imputer on the remaining
# columns and transform them in one step.
housing_num = housing.drop("ocean_proximity", axis=1)
imputed_data = imputer.fit_transform(housing_num)


In [None]:
# Show the per-column statistics the imputer stored when it was fitted.
imputer.statistics_

In [None]:
# Column medians computed directly with pandas, for comparison with the
# imputer's stored statistics.
housing_num.median().values

In [None]:
# Apply the already-fitted imputer to the numeric columns.
x = imputer.transform(housing_num)

In [None]:
# Wrap the imputed array back into a DataFrame. Passing the original index
# keeps the rows aligned with housing / housing_labels; without it the frame
# would be renumbered 0..n-1 and no longer match the shuffled training rows.
housing_tr = pd.DataFrame(x, columns=housing_num.columns, index=housing_num.index)

## HANDLE CATEGORICAL FEATURES

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the ocean_proximity column with a LabelEncoder and display the result.
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the encoded categories. reshape(-1, 1) turns them into a
# single column before they are passed to the encoder.
# Note: this rebinds `encoder`, replacing the LabelEncoder.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Encode the raw ocean_proximity values directly with a LabelBinarizer.
# Note: `encoder` is rebound once more and stays a LabelBinarizer from here on.
encoder = LabelBinarizer()
housing_cat1hot = encoder.fit_transform(housing_cat)
housing_cat1hot


## CUSTOM TRANSFORMER

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column indices that CombinedAttributesAdder uses to pick the rooms,
# bedrooms, population and households columns out of its input array.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: rooms per household and population per
    household. When ``add_bedrooms_per_room`` is true, a bedrooms-per-room
    column is appended after them.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        appended = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            appended.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks X and each 1-D ratio as columns, in the order given.
        return np.c_[(X, *appended)]
            

In [None]:
# With add_bedrooms_per_room=False only the two per-household ratio columns
# are appended. The transformer indexes a plain array, hence housing.values.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

## MAKING PIPELINE

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: median imputation, then the appended ratio
# columns, then standard scaling.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy= "median")),
    ("attr_adder", CombinedAttributesAdder()),
    ("std_sclr", StandardScaler()),
])

# Fit the chain on the numeric columns and transform them in one call.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks named columns out of a DataFrame.

    ``attributes_name`` is the column selection applied in ``transform``;
    the selected columns are returned via the ``.values`` accessor.
    """

    def __init__(self, attributes_name):
        self.attributes_name = attributes_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the configured columns of ``X``."""
        selected = X[self.attributes_name]
        return selected.values

In [None]:
from sklearn.pipeline import FeatureUnion

# Column names handled by each branch of the combined pipeline.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select the numeric columns, impute medians, append the
# ratio columns, then standardise.
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", SimpleImputer(strategy= "median")),
    ("attr_adder", CombinedAttributesAdder()),
    ("std_sclr", StandardScaler()),
])

# Categorical branch: select ocean_proximity and one-hot encode it.
# Note: the step is named "label_binarizer" but holds a OneHotEncoder.
cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
    ("label_binarizer", OneHotEncoder()),
])

# Run both branches on the same input and concatenate their outputs.
full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [None]:
# Fit the combined pipeline on the training predictors and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

## TRYING DIFFERENT MODELS

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)


In [None]:
# Spot-check the model on the first five training rows. transform() (not
# fit_transform) reuses the preprocessing already fitted on the training set.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("prediction ------>", lin_reg.predict(some_data_prepared))
print("actual ------>",  list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model, measured on the same data it was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor on the prepared training data.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# RMSE of the tree, again measured on its own training data.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the tree. The scorer reports negated MSE, so
# the sign is flipped before taking the square root to obtain RMSE per fold.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores

In [None]:
def display_scores(scores):
    """Print ``scores`` followed by its mean and its standard deviation."""
    print(scores)
    summary = (("mean: ", scores.mean()), ("std deviation: ", scores.std()))
    for label, value in summary:
        print(label, value)

In [None]:
# Report the per-fold RMSE values of the decision tree. `scores` holds the
# raw negated MSE values, which are not comparable with the RMSE figures
# displayed for the other models.
display_scores(rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model; negated MSE is converted to
# RMSE per fold.
lin_score = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                   scoring="neg_mean_squared_error", cv=10)
lin_score = np.sqrt(-lin_score)

In [None]:
# Print the per-fold RMSE values of the linear model with mean and std.
display_scores(lin_score)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random-forest regressor with default settings on the training data.
rf_reg = RandomForestRegressor()
rf_reg.fit(housing_prepared, housing_labels)


In [None]:
# 10-fold cross-validation of the random forest; negated MSE is converted to
# RMSE per fold.
rf_score = cross_val_score(rf_reg, housing_prepared, housing_labels,
                           scoring="neg_mean_squared_error", cv=10)
rf_score = np.sqrt(-rf_score)


In [None]:
# Print the per-fold RMSE values of the random forest with mean and std.
display_scores(rf_score)

## SAVING MODEL

In [None]:
import joblib
# Serialise the decision-tree model to tree_reg.pkl in the working directory.
joblib.dump(tree_reg, "tree_reg.pkl")

## FINE-TUNING

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids are searched: 3 x 4 = 12 combinations without setting bootstrap,
# plus 2 x 3 = 6 combinations with bootstrap=False. With cv=5 that is
# (12 + 6) * 5 = 90 cross-validation fits.
param_grid = [
    {"n_estimators":[3, 10, 30], "max_features":[2, 4, 6, 8]},
    {"bootstrap":[False], "n_estimators":[3, 10], "max_features":[2, 3, 4]}
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error")
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Hyperparameter combination that scored best in the grid search.
grid_search.best_params_

In [None]:
# Estimator exposed by the grid search for its best parameter combination.
grid_search.best_estimator_

In [None]:
# Print one line per parameter combination: its mean test score (a negated
# MSE) converted to RMSE, followed by the parameters.
# Note: the loop variable rebinds the module-level name `scores`.
cvres = grid_search.cv_results_
for scores, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-scores), params)

In [None]:
# Feature importances of the best forest found by the grid search.
feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance

In [None]:
# Names of the ratio columns CombinedAttributesAdder appends, in its order.
extra_features = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
# Read the category labels from the one-hot step that actually produced
# housing_prepared, rather than from whichever object the exploratory cells
# last bound to the global `encoder`.
fitted_cat_pipeline = dict(full_pipeline.transformer_list)["cat_pipeline"]
cat_one_hot_attribs = list(fitted_cat_pipeline.named_steps["label_binarizer"].categories_[0])
# Column order of housing_prepared: numeric, appended ratios, one-hot columns.
attribs = num_attribs + extra_features + cat_one_hot_attribs
# Pair each importance with its column name, most important first.
sorted(zip(feature_importance, attribs), reverse=True)

## TESTING THE FINAL MODEL

In [None]:
# Evaluate the tuned model on the held-out stratified test rows.
final_model = grid_search.best_estimator_
test_x = strat_test_index.drop(["median_house_value"], axis=1)
test_y = strat_test_index["median_house_value"].copy()

# transform() only: the test rows go through the preprocessing that was
# fitted on the training data, without refitting it.
test_x_prepared = full_pipeline.transform(test_x)

In [None]:
# RMSE of the final model on the test set.
final_prediction = final_model.predict(test_x_prepared)
final_mse = mean_squared_error(test_y, final_prediction)
final_rmse = np.sqrt(final_mse)
final_rmse