In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from mllibs.future_encoders import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

ModuleNotFoundError: No module named 'mllibs'

In [2]:
# Directory, relative to the working directory, that holds housing.csv.
HOUSING_PATH = os.path.join("datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Load the raw dataset from the default HOUSING_PATH location.
housing = load_housing_data()

In [3]:
# Keep only rows whose target is below 500000, then remove two specific target
# values (137500 and 350000).
# NOTE(review): presumably these are capped / over-represented values — confirm.
housing = housing.loc[housing["median_house_value"] < 500000]
housing = housing.query("median_house_value not in [137500,350000]")
# TODO(to_check): decide whether rows with housing_median_age == 41.0 should go too:
#housing = housing.loc[housing["housing_median_age"] != 41.0]
# Renumber the rows 0..n-1 without keeping the old index as a column. The
# stratified split later looks rows up with .loc using positional indices,
# which only works on a freshly reset index.
housing = housing.reset_index(drop=True)

In [4]:
# Preview the first rows of the filtered data.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Plain random 80/20 split with a fixed seed.
# NOTE(review): train_set / test_set are not referenced by any later cell shown
# here; the stratified split is what the rest of the notebook works with.
train_set, test_set = train_test_split(housing,test_size=0.2,random_state=42)

In [5]:
# Bucket median_income into categories of width 1.5, then merge every category
# from 5 upwards into 5.0. The column is used to stratify the train/test split.
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
# Assign the result back instead of calling .where(..., inplace=True) on the
# selected column: the chained in-place form mutates a Series that may be a
# temporary copy, so the DataFrame is not guaranteed to be updated.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split on income_cat, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of indices.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

In [7]:
# income_cat is dropped from both sets here. Reassigning (rather than dropping
# in place) together with errors="ignore" keeps this cell re-runnable: a second
# run no longer raises KeyError for the already-removed column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

# Same target filters that were applied to `housing` before the split.
# NOTE(review): because both sets are drawn from the already-filtered frame,
# these are expected to be no-ops — confirm before removing.
strat_train_set = strat_train_set.loc[strat_train_set["median_house_value"] < 500000]
strat_train_set = strat_train_set.query("median_house_value not in [137500,350000]")
strat_test_set = strat_test_set.loc[strat_test_set["median_house_value"] < 500000]
strat_test_set = strat_test_set.query("median_house_value not in [137500,350000]")

In [8]:
# Work on a copy so the exploratory columns added next do not touch strat_train_set.
housing = strat_train_set.copy()

In [9]:
# Derived ratio features added to the exploratory copy.
# NOTE(review): the following cell rebinds `housing` from strat_train_set, so
# these three columns are discarded; inside the pipeline the same ratios are
# produced by CombinedAttributesAdder.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [10]:
# Separate the stratified training set into target (labels) and predictors.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [11]:
# Column positions read by CombinedAttributesAdder.transform from its input array.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3,4,5,6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from fixed column positions of a 2-D array.

    Always appends rooms-per-household and population-per-household; appends
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms/households, X[:, population_ix]/households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix]/rooms)
        # np.c_[X, a, b, ...] written with an explicit tuple so the optional
        # column can be handled by a single return statement.
        return np.c_[tuple([X] + extra_columns)]

In [12]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the configured columns of a DataFrame and return their values."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names]`` as the underlying array."""
        selected = X[self.attribute_names]
        return selected.values

In [13]:
# Every column except ocean_proximity is treated as numeric.
# CombinedAttributesAdder addresses columns by position (3, 4, 5, 6), so the
# order of num_attribs must match the order of the numeric columns in `housing`.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop("ocean_proximity",axis=1)
num_attribs = list(housing_num)

# Numeric branch: select columns -> median imputation -> ratio features -> scaling.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select ocean_proximity -> dense one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

In [14]:
# Combine the numeric and categorical branches into one preparation step.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [17]:
def try_model(X,
              y,
              pipeline=full_pipeline,
              regressor=None,
              predictions=False,
              cross_val=True,
              regressor_params=None,
              cross_val_params=None
             ):
    """Chain ``pipeline`` with a regressor and print evaluation figures.

    Parameters
    ----------
    X, y : training predictors and target, passed straight to scikit-learn.
    pipeline : preparation step placed in front of the regressor.
    regressor : estimator *class* (not an instance), e.g. ``LinearRegression``.
    predictions : when true, fit on ``X``/``y`` and print the predictions for
        ``X`` together with the resulting (in-sample) RMSE.
    cross_val : when true, run ``cross_val_score`` and print the per-fold
        values, their mean and their standard deviation.
    regressor_params : keyword arguments for ``regressor``; ``None`` means none.
    cross_val_params : keyword arguments for ``cross_val_score``; when empty or
        ``None``, 10-fold negative mean squared error is used.

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    TypeError
        If ``regressor`` is not supplied.
    """
    if regressor is None:
        # Previously this surfaced as an opaque "'NoneType' object is not callable".
        raise TypeError("try_model() needs `regressor`: an estimator class such as LinearRegression")

    if regressor_params is None:
        # The default used to reach regressor(**None) and crash.
        regressor_params = {}

    if not cross_val_params:
        cross_val_params = {'scoring':'neg_mean_squared_error',
                            'cv':10
                           }

    full_pipeline_with_predictor = Pipeline([
        ("preparation", pipeline),
        ("regressor_name", regressor(**regressor_params))
    ])

    if predictions:
        full_pipeline_with_predictor.fit(X,y)
        y_predictions = full_pipeline_with_predictor.predict(X)

        mse = mean_squared_error(y,y_predictions)
        rmse = np.sqrt(mse)
        print("""
Predictions:   {}
RMSE:          {}
              """.format(y_predictions, rmse))

    if cross_val:
        scores = cross_val_score(full_pipeline_with_predictor,X=X, y=y, **cross_val_params)
        # Negating and taking the square root turns negative MSE into RMSE; this
        # is only meaningful when `scoring` is 'neg_mean_squared_error'.
        scores = np.sqrt(-scores)
        # NOTE(review): the "SUM:" label actually prints the per-fold scores,
        # not their sum. The text is left unchanged to match recorded outputs.
        print("""
CROSS_VAL_SCORES:

SUM:     {}
Mean:    {}
STD:     {}
              """.format(scores, scores.mean(), scores.std()))
        

In [18]:
# NOTE(review): try_regression is imported but never used in this cell — the
# loop below calls try_model. Confirm whether the `mine` import is still needed.
from mine import try_regression

# Preparation pipelines to evaluate (currently just one).
pipelines = [full_pipeline]


# (estimator class, constructor kwargs) pairs; tree-based models are seeded.
models_with_params = [(DecisionTreeRegressor,{'random_state':42}), 
                      (LinearRegression,{}),
                      (RandomForestRegressor,{'random_state':42})
                     ]  

# Evaluate every pipeline/model combination: print the estimator class, then the
# in-sample predictions/RMSE and the cross-validation figures from try_model.
for pipeline in pipelines:
    
    for model, params in models_with_params:
    
        regressor = model
        regressor_params = params
    
        print(regressor)
    
        try_model(X=housing, 
                  y=housing_labels, 
                  pipeline=pipeline,
                  regressor=regressor, 
                  predictions=True, 
                  cross_val=True, 
                  regressor_params=regressor_params
                  )

<class 'sklearn.tree.tree.DecisionTreeRegressor'>

Predictions:   [102900. 150800.  62400. ... 224200. 198200. 169800.]
RMSE:          0.0
              

CROSS_VAL_SCORES:

SUM:     [65513.86409131 63009.04999368 61537.56254818 64579.48386681
 60734.46128779 67273.06448836 65961.5004171  59370.42591704
 62969.51076651 63346.02909362]
Mean:    63429.49524704
STD:     2335.9180151520154
              
<class 'sklearn.linear_model.base.LinearRegression'>

Predictions:   [130208. 237728.  69856. ... 200224. 204288. 227840.]
RMSE:          59293.6295475256
              

CROSS_VAL_SCORES:

SUM:     [62947.33497163 60643.41302512 58863.29356865 60191.83941827
 59530.04874626 58052.16272198 57590.32805525 58251.36695008
 60235.46622943 60130.42126005]
Mean:    59643.567494672665
STD:     1486.9714536002984
              
<class 'sklearn.ensemble.forest.RandomForestRegressor'>

Predictions:   [110600. 176060.  62400. ... 222730. 197800. 167700.]
RMSE:          19894.45136669423
             

In [None]:
# NOTE(review): this cell repeats the model-comparison loop of the preceding
# cell (same pipelines, same models, same try_model arguments) and has no
# execution count — consider removing one of the two.
pipelines = [full_pipeline]


models_with_params = [(DecisionTreeRegressor,{'random_state':42}), 
                      (LinearRegression,{}),
                      (RandomForestRegressor,{'random_state':42})
                     ]        
        

for pipeline in pipelines:
    
    for model, params in models_with_params:
    
        regressor = model
        regressor_params = params
    
        print(regressor)
    
        try_model(X=housing, 
                  y=housing_labels, 
                  pipeline=pipeline,
                  regressor=regressor, 
                  predictions=True, 
                  cross_val=True, 
                  regressor_params=regressor_params
                  )

In [None]:
# Fit the preparation pipeline and a baseline linear model explicitly, so this
# cell works on a fresh kernel instead of relying on a `lin_reg` object and a
# fitted `full_pipeline` left behind by earlier (or deleted) cells.
housing_prepared = full_pipeline.fit_transform(housing)
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Compare predictions with the true labels on the first five training rows.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predicts:", lin_reg.predict(some_data_prepared))

In [None]:
# True target values for the same five rows, for comparison with the predictions.
print("Labels:",list(some_labels))

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting, plus
# 2 x 3 combinations with bootstrapping turned off.
param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]

# Seed the forest like the other models in this notebook; without it every run
# of the search can pick different best parameters.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
# NOTE(review): housing_prepared is not assigned in this cell; it has to exist
# in the kernel already (the output of full_pipeline on `housing`).
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Hyper-parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# The estimator selected by the search (display its configuration).
grid_search.best_estimator_

In [None]:
# Print one line per parameter combination: RMSE (square root of the negated
# mean test score, since scoring is negative MSE) followed by the parameters.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"],
                             cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Pair each feature importance of the best model with a readable column name.
feature_importances = grid_search.best_estimator_.feature_importances_
# Names for the columns appended by CombinedAttributesAdder, in the order it
# appends them: rooms/household, population/household, bedrooms/room.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# One-hot column names come from the fitted encoder inside cat_pipeline.
cat_encoder= cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Last expression of the cell: importances sorted from highest to lowest.
sorted(zip(feature_importances, attributes),reverse=True)

In [None]:
# Evaluate the model chosen by the grid search on the held-out stratified test set.
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform (not fit_transform): the test set is prepared with the pipeline as
# already fitted, so full_pipeline must have been fitted before this cell runs.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Last expression of the cell: test-set RMSE.
final_rmse

In [None]:
from sklearn.svm import SVR
# Support vector regressor with a linear kernel, fitted on the prepared training data.
# NOTE(review): housing_prepared is not assigned in this cell; it has to exist
# in the kernel already.
sv_reg = SVR(kernel="linear")
sv_reg.fit(housing_prepared, housing_labels)