In [1]:
import pandas as pd
import os

def load_data(housing_path, filename="housing.csv"):
    """Load a housing CSV file into a pandas DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains the CSV file.
    filename : str, optional
        Name of the CSV file inside ``housing_path``. Defaults to
        ``"housing.csv"``, so existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = os.path.join(housing_path, filename)
    return pd.read_csv(csv_path)
housing = load_data("housing/")  # pandas DataFrame read from "housing/housing.csv"

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np


# Bucket median income into discrete categories (bucket width 1.5) so that the
# train/test split can be stratified on income.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)

# One stratified 80/20 split; random_state pins the shuffle so the split is
# the same on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # Label-based .loc only gives the same rows while the DataFrame still has
    # its default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [3]:
housing = strat_train_set.copy()  # explore a copy so the training split itself is not modified

In [4]:
# Pairwise correlations between the numeric columns only. `housing` still
# contains the string column "ocean_proximity" here; selecting numeric dtypes
# explicitly avoids relying on corr() silently skipping non-numeric columns
# (recent pandas versions raise instead).
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,income_cat
longitude,1.0,-0.924515,-0.109796,0.048963,0.075583,0.107149,0.061549,-0.015589,-0.045056,-0.013558
latitude,-0.924515,1.0,0.009864,-0.039721,-0.071816,-0.115095,-0.076752,-0.078979,-0.144684,-0.076783
housing_median_age,-0.109796,0.009864,1.0,-0.363195,-0.324448,-0.297841,-0.305708,-0.116608,0.11177,-0.113993
total_rooms,0.048963,-0.039721,-0.363195,1.0,0.928874,0.855803,0.917204,0.203718,0.135989,0.198239
total_bedrooms,0.075583,-0.071816,-0.324448,0.928874,1.0,0.876225,0.979599,-0.0058,0.049177,-0.005357
population,0.107149,-0.115095,-0.297841,0.855803,0.876225,1.0,0.905172,0.007472,-0.024765,0.007102
households,0.061549,-0.076752,-0.305708,0.917204,0.979599,0.905172,1.0,0.014929,0.065841,0.014345
median_income,-0.015589,-0.078979,-0.116608,0.203718,-0.0058,0.007472,0.014929,1.0,0.687474,0.975364
median_house_value,-0.045056,-0.144684,0.11177,0.135989,0.049177,-0.024765,0.065841,0.687474,1.0,0.667208
income_cat,-0.013558,-0.076783,-0.113993,0.198239,-0.005357,0.007102,0.014345,0.975364,0.667208,1.0


In [5]:
# Feature frame: the training split without the target column.
# drop() returns a new DataFrame, so strat_train_set itself is left unchanged.
housing = strat_train_set.drop("median_house_value", axis=1)

# Disabled experiment: drop a less informative feature.
# housing = housing.drop("households",axis=1)

# Disabled here; the labels are extracted in a later cell, right before the
# full pipeline is fitted.
# housing_labels = strat_train_set["median_house_value"].copy()

In [6]:
from sklearn.impute import SimpleImputer

# Disabled: stand-alone median imputer (imputation is done inside num_pipeline instead).
# imputer = SimpleImputer(strategy="median")
# Features without the categorical "ocean_proximity" column.
housing_num = housing.drop("ocean_proximity", axis=1)
# imputer.fit(housing_num)

In [7]:
# X = imputer.transform(housing_num)
# housing_tr = pd.DataFrame(X, columns=housing_num.columns)
# # housing_tr.info()

In [8]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode "ocean_proximity". With sparse_output=True the encoder returns
# a sparse matrix (see the cell output) instead of a dense array.
encoder = LabelBinarizer(sparse_output=True)
housing_cat_1hot = encoder.fit_transform(housing["ocean_proximity"])
housing_cat_1hot  # last expression of the cell: display the encoded matrix

<16512x5 sparse matrix of type '<class 'numpy.int32'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin 
# BaseEstimator allows you to drop *args and **kwargs from your constructor
# and, in addition, gives you the set_params() and get_params() methods

# Column positions, within the numeric feature array, of the raw counts that
# the derived ratios below are computed from.
rooms_id, bedrooms_id, population_id, household_id = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to X.

    Parameters
    ----------
    add_bedrooms_per_rooms : bool, default True
        When true, a bedrooms / rooms column is appended after the three
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_rooms = True): # note no *args and **kwargs used this time
        self.add_bedrooms_per_rooms = add_bedrooms_per_rooms

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with the derived ratio columns appended on the right.

        X is indexed as a 2-D array (``X[:, column]``). The appended columns
        are, in order: rooms per household, bedrooms per household, population
        per household and, if enabled, bedrooms per room.
        """
        households = X[:, household_id]
        rooms = X[:, rooms_id]
        bedrooms = X[:, bedrooms_id]

        columns = [
            X,
            rooms / households,
            bedrooms / households,
            X[:, population_id] / households,
        ]
        if self.add_bedrooms_per_rooms:
            columns.append(bedrooms / rooms)

        # Indexing np.c_ with a tuple is the same as np.c_[a, b, ...].
        return np.c_[tuple(columns)]
        
# Try the transformer on its own, outside any pipeline, on the raw feature values.
attr_adder = CombinedAttributesAdder()
housing_extra_attribs = attr_adder.transform(housing.values)
# print(housing_extra_attribs.info)

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scaler = StandardScaler()
# housing_tr_scaled = scaler.fit_transform(housing_tr)

In [11]:
from sklearn.base import TransformerMixin # TransformerMixin allows you to use fit_transform method

class CustomLabelBinarizer(TransformerMixin):
    """Wrap a LabelBinarizer behind fit(X, y) / transform(X, y) methods.

    The wrapped encoder is stored in ``self.encoder``. The ``y`` argument of
    ``fit`` and ``transform`` is accepted but never used. This class is used as
    the 'label_binarizer' step of ``cat_pipeline``.
    """
    def __init__(self, *args, **kwargs):
        """Create the wrapped LabelBinarizer, forwarding all arguments to it."""
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, X, y=0):
        """Fit the wrapped encoder on X and return this wrapper."""
        self.encoder.fit(X)
        return self
    def transform(self, X, y=0):
        """Return X encoded by the wrapped (already fitted) encoder."""
        return self.encoder.transform(X)

In [12]:

from sklearn.pipeline import Pipeline
num_attribs = list(housing_num)  # column names of the numeric feature frame
cat_attribs = ["ocean_proximity"]  # the single categorical column

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the given columns out of a DataFrame and return their raw values.

    Used as the first step of both pipelines so that the following steps
    receive ``X[attribute_names].values`` instead of a DataFrame.
    """
    def __init__(self, attribute_names):
        """Remember which columns to select (a list of column names)."""
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self
    def transform(self, X):
        """Return the values of the selected columns of X."""
        return X[self.attribute_names].values

# Numeric branch: select the numeric columns, fill missing values with the
# column median, append the derived ratio columns, then standardize.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        # Previous imputer step, kept for reference:
        #('imputer', Imputer(strategy="median")),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# Categorical branch: select "ocean_proximity" and binarize its labels.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', CustomLabelBinarizer()),
    ])

In [13]:
from sklearn.pipeline import FeatureUnion

# Run both branches on the same input and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# Disabled experiment: drop rows with a missing total_bedrooms instead of imputing.
# strat_train_set = strat_train_set.dropna(subset=["total_bedrooms"]) # (16356, 17) vs  (16512, 17)


# Features: the training split without the target column.
housing = strat_train_set.drop("median_house_value", axis=1)

# Labels: the target column of the training split.
housing_labels = strat_train_set["median_house_value"].copy()


# Fit every pipeline step on the training features and transform them in one go.
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)
housing_prepared

(16512, 18)


array([[-1.15788621,  0.77388697,  0.74440696, ...,  0.        ,
         0.        ,  0.        ],
       [-1.44259512,  1.0077626 ,  1.85708974, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.18471864, -1.3403487 ,  0.18806557, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58431009, -0.72291704, -1.56043594, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.7801323 , -0.84920988,  0.18806557, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43760023,  0.99840757,  1.85708974, ...,  0.        ,
         1.        ,  0.        ]])

In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error





# Hyperparameter candidates for the random forest (3 x 4 = 12 combinations).
param_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}

# Randomized search over the forest's hyperparameters.
# - random_state is fixed on both the forest and the search. Without it the
#   bootstrap samples and the sampled parameter combinations change on every
#   run, so the resulting RMSE is not reproducible.
# - n_jobs=-1 runs the cross-validation fits on all available CPU cores.
# - n_iter is left at its default of 10, so 10 of the 12 combinations are tried.
forest_reg = RandomForestRegressor(random_state=42)
not_grid_search = RandomizedSearchCV(forest_reg, param_grid, cv=5,
                                     scoring="neg_mean_squared_error",
                                     random_state=42, n_jobs=-1)
not_grid_search.fit(housing_prepared, housing_labels)

# print(grid_search.best_params_)

# cv_results = grid_search.cv_results_
# for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
#     print(np.sqrt(-mean_score), params)
    
# feature_importances = grid_search.best_estimator_.feature_importances_
# print (feature_importances)

# extra_attribs = ['rooms_per_household', 'bedrooms_per_household', 'population_per_household', 'bedrooms_per_rooms']
# cat_one_hot_attribs = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
# attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# sorted(zip(feature_importances, attributes), reverse=True)


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [18]:
# Best estimator found by the hyperparameter search.
final_model = not_grid_search.best_estimator_

# Disabled alternative: a plain linear regression as the final model.
# final_model = LinearRegression()
# final_model = final_model.fit(housing_prepared, housing_labels)

# print(final_model.coef_)

# Split the held-out set into labels and features.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# transform() only: the test set goes through the pipeline exactly as it was
# fitted on the training data, with no re-fitting.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Root-mean-squared error on the test set.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse

48770.37279878371

### Answers:
- Dropping `households`, which seems to be the least informative feature, increases the RMSE to 49737.4001959307, so it does not help (as expected).
- Different ideas:

Dropping rows with missing values instead of using the imputer gives 48617.15275987233 and 48708.793135364955, so it seems very comparable, perhaps a bit worse. \
Using `SimpleImputer` with the mean strategy again makes a very small difference, perhaps a bit better: 48074.00229046668. \
`MinMaxScaler` makes it worse: 49153.254443779806 \
No scaling is similarly bad: 49046.138073279675 \
Not sure what feature scaling is; I thought that is the scaling we are already doing?

- Linear regression `final_rmse`

Random Forest Regressor RMSE with the optimal values: 48695.78065596131, 48014.03994500478 - it seems to be different each time! \
But in another notebook I always get 48120.666286373504 each time, and 48100.813816646856 in yet another...


Linear Regression with default parameters: 67244.28825732337, so much worse.

- Different feature importance weights on the SLR model.

[-5.54920601e+04 -5.66388462e+04  1.40868913e+04  3.09089868e+01
  5.81332081e+03 -4.66833873e+04  4.58323022e+04  7.86130745e+04
 -4.89297049e+03  2.83004792e+04 -2.03486060e+04  1.04776090e+03
  2.11146876e+04 -1.83626813e+04 -5.43180051e+04  1.09710906e+05
 -2.22161753e+04 -1.48140438e+04]
 The weights for SLR

vs. feature importance weights. Not sure.

vs. feature correlation scores with the target value from step 3

- RandomizedSearchCV: how does it compare with GridSearchCV?

RandomizedSearchCV gives 48770.37279878371, so it does not seem to make much of a difference?











**It takes a long time to run - is there any way to make it faster and use more of my CPU?**