In [61]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

In [14]:
data = pd.read_csv("housing.csv")
data.drop("Unnamed: 0", axis =1 ,inplace =True)

In [15]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


our dataset is having one categorical feature and nine numerical features
there is null value present in total_bedrooms

    1. we will impute these null values with median value of total_bedrooms 
    2. the dataset have different ranges in terms of values so we will use a standared scaling
    3. we have one categorical feature ocean_proximity we do onehot encoding for this feature also
    4. we will also try to combine some features and create new features
    
we will do all these transforms with the help of transformers and pipelining

# train test split

In [17]:
train,test = train_test_split(data, test_size=0.1,random_state=42)

In [18]:
train.shape

(18576, 10)

In [19]:
test.shape

(2064, 10)

In [20]:
X_train = train.drop('median_house_value', axis=1)
y_train = train['median_house_value']

# custom transformer

# attribute combination transformer 

In [21]:
we will create a transformer for attribute combination as in above code line 13

SyntaxError: invalid syntax (Temp/ipykernel_6736/4066871513.py, line 1)

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttrAdder(BaseEstimator,TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room =True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y= None):
        return self
    
    def transform(self, X):
        X["rooms_per_household"] = X["total_rooms"]/X["households"]
        X["population_per_household"]=X["population"]/X["households"]
        
        if self.add_bedrooms_per_room:
            X["bedrooms_per_room"] = X["total_bedrooms"]/X["total_rooms"]
            return X
        else:
            return X
        
        


# missing value imputation transformer

In [23]:
#imputer transformer
class Imputer(BaseEstimator,TransformerMixin):
    
    def __init__(self, strategy="median"):
        self.strategy = strategy
        
    def fit(self, X, y= None):
        return self
    
    def transform(self, X, y =None):
        if self.strategy:
            cols_with_nan = [i for i  in X.columns if X[i].isna().sum() != 0]
            Y = X.copy()
            for i in cols_with_nan:
                median = np.round(Y[i].median(), 6)
                Y[i] = Y[i].fillna(median)

            return Y

# create pipeline

we will creat separate pipeline

with numerical variables


    1. imputation 
    2. combine attribute
    3. scaling 
    
    
with cayegorical variables


    4. categorical coding

In [24]:
numeric_features = [i for i in X_train if X_train[i].dtype != "object"]
X_num = X_train[numeric_features]

In [25]:
X_num.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
5564,-118.29,33.91,41.0,2475.0,532.0,1416.0,470.0,3.8372
16016,-122.46,37.73,52.0,3547.0,506.0,1276.0,491.0,8.0069


In [26]:
num_pipeline = Pipeline([
    ("impute", Imputer()),
    ("combattri", CombinedAttrAdder()),
    ('scaling', StandardScaler())
])

Xnum_train = num_pipeline.fit_transform(X_num)

In [27]:
pd.DataFrame(Xnum_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.641303,-0.808863,0.983513,-0.074574,-0.013988,-0.009699,-0.077683,-0.018837,-0.066315,-0.006443,0.033452
1,-1.439758,0.980369,1.856793,0.416763,-0.076019,-0.132519,-0.022663,2.185237,0.712744,-0.044325,-1.205692
2,-1.285051,0.853905,0.110233,0.715140,1.684703,1.097431,1.696072,-0.106583,-0.705284,-0.068481,1.428373
3,-0.815939,1.462805,-1.001215,-0.471952,-0.577038,-0.557123,-0.559767,-0.377698,0.075494,-0.028728,-0.495671
4,-0.087318,0.558822,-1.080604,0.071177,-0.243026,-0.014086,-0.153664,0.558387,0.358362,0.010648,-0.975174
...,...,...,...,...,...,...,...,...,...,...,...
18571,0.805991,-0.869753,0.507178,-0.599370,-0.803689,-0.674679,-0.740549,1.319983,0.277077,-0.004660,-1.060477
18572,1.070490,-0.757340,0.348400,0.204553,0.076672,0.285945,-0.132704,-0.434945,0.571327,0.075129,-0.482896
18573,0.596388,-0.752657,0.586567,-0.245992,0.074286,0.288577,0.071658,-0.496050,-0.575286,0.022774,0.991153
18574,-1.190230,0.910111,-1.080604,0.429596,0.141089,0.307000,0.155499,0.975975,0.383036,0.008758,-0.788542


In [28]:
X_num

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
5564,-118.29,33.91,41.0,2475.0,532.0,1416.0,470.0,3.8372
16016,-122.46,37.73,52.0,3547.0,506.0,1276.0,491.0,8.0069
17131,-122.15,37.46,30.0,4198.0,1244.0,2678.0,1147.0,3.6712
11752,-121.21,38.76,16.0,1608.0,296.0,792.0,286.0,3.1583
2308,-119.75,36.83,15.0,2793.0,436.0,1411.0,441.0,4.9292
...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192


# column transformer to add to different pipelines

In [29]:
numeric_features = [i for i in X_train if X_train[i].dtype != "object"]
cat_features = [i for i in X_train if X_train[i].dtype == "object"]

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, numeric_features),
    ('cat', OneHotEncoder(), cat_features),
])

X_train_prepared = full_pipeline.fit_transform(X_train)

In [30]:
oh = full_pipeline.named_transformers_['cat']

In [31]:
oh.categories_[0]

array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)

In [32]:
full_pipeline.feature_names_in_

array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'], dtype=object)

In [33]:
extra_attribs = ['rooms_per_household','population_per_household','bedrooms_per_room']
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = numeric_features + extra_attribs + cat_one_hot_attribs

In [34]:
X_train_prepared_DF = pd.DataFrame(X_train_prepared, columns= attributes)

In [35]:
X_train_prepared_DF.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.641303,-0.808863,0.983513,-0.074574,-0.013988,-0.009699,-0.077683,-0.018837,-0.066315,-0.006443,0.033452,1.0,0.0,0.0,0.0,0.0
1,-1.439758,0.980369,1.856793,0.416763,-0.076019,-0.132519,-0.022663,2.185237,0.712744,-0.044325,-1.205692,0.0,0.0,0.0,1.0,0.0
2,-1.285051,0.853905,0.110233,0.71514,1.684703,1.097431,1.696072,-0.106583,-0.705284,-0.068481,1.428373,0.0,0.0,0.0,1.0,0.0
3,-0.815939,1.462805,-1.001215,-0.471952,-0.577038,-0.557123,-0.559767,-0.377698,0.075494,-0.028728,-0.495671,0.0,1.0,0.0,0.0,0.0
4,-0.087318,0.558822,-1.080604,0.071177,-0.243026,-0.014086,-0.153664,0.558387,0.358362,0.010648,-0.975174,0.0,1.0,0.0,0.0,0.0


# Model building 

In [36]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import time
import xgboost
from sklearn.model_selection import cross_val_score

#import warnings
#warnings.filterwarnings('ignore')


def train_model(model, X,y):
    start = time.time()
    model.fit(X,y)
    training_execution_time = time.time() - start
    
    start = time.time()
    y_prd = model.predict(X)
    prediction_execution_time = time.time()-start
    
    rms_error = np.sqrt(mean_squared_error(y,y_prd))
    
    model_and_predictions_dictionary = {'Regression': str(model),
                                        'rms Error': np.round(rms_error, 2),                          
                                        'training_execution_time': training_execution_time,
                                        'prediction_execution_time': prediction_execution_time
                                       }
    
    performances = pd.DataFrame([[str(model), np.round(rms_error, 2),training_execution_time,prediction_execution_time]], 
                           columns=['Regression','rms Error','training_execution_time','prediction_execution_time'])
    return model_and_predictions_dictionary, performances
    

In [37]:
regression_dictionary={ 'Linear regression':LinearRegression(random_state=42), 
                        'Decision tree with depth of two':DecisionTreeRegressor(max_depth=2,random_state=42), 
                        'Decision tree - unlimited depth':DecisionTreeRegressor(random_state=42), 
                        'Random forest': RandomForestRegressor(random_state=42,n_jobs=-1),
                        'SVR':SVR(kernel="linear"),
                        'SGDRegressor': SGDRegressor(random_state=42)
                        
                       }


dataframe = pd.DataFrame()
for regression_name in regression_dictionary:
    model_and_predictions_dictionary,performances  = train_model(regression_dictionary[regression_name], X_train_prepared_DF,y_train)
    dataframe = dataframe.append(performances)

  dataframe = dataframe.append(performances)
  dataframe = dataframe.append(performances)
  dataframe = dataframe.append(performances)
  dataframe = dataframe.append(performances)
  dataframe = dataframe.append(performances)
  dataframe = dataframe.append(performances)


In [38]:
dataframe

Unnamed: 0,Regression,rms Error,training_execution_time,prediction_execution_time
0,LinearRegression(),67605.24,2.018914,0.033979
0,"DecisionTreeRegressor(max_depth=2, random_stat...",81875.86,0.118012,0.005998
0,DecisionTreeRegressor(random_state=42),0.0,0.421976,0.015004
0,"RandomForestRegressor(n_jobs=-1, random_state=42)",18430.4,11.011424,0.300985
0,SVR(kernel='linear'),109710.76,36.541164,24.744713
0,SGDRegressor(random_state=42),557822.32,0.100968,0.005985


# Model Evaluation Using cross validation 

In [39]:


def train_modelCV(model, X,y):
    start = time.time()
    scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=4)
    rmse_scores = np.sqrt(-scores)
    
    CV_execution_time = time.time()-start
    
    
    
    model_and_predictions_dictionary = {'Regression': str(model),
                                        'mean rms Error': np.round(rmse_scores.mean(), 2),
                                        "Standard deviation":np.round(rmse_scores.std(), 2),
                                        'CV_execution_time': CV_execution_time,
                                        
                                       }
    
    performances = pd.DataFrame([[str(model), np.round(rmse_scores.mean(), 2),np.round(rmse_scores.std(), 2),CV_execution_time]], 
                           columns=['RegressionCV','mean rms Error','Standard deviation','CV_execution_time'])
    return model_and_predictions_dictionary, performances
    


    


In [40]:
CVdataframe = pd.DataFrame()
for regression_name in regression_dictionary:
    CVmodel_and_predictions_dictionary,performances  = train_modelCV(regression_dictionary[regression_name], X_train_prepared_DF,y_train)
    CVdataframe = CVdataframe.append(performances)

  CVdataframe = CVdataframe.append(performances)
  CVdataframe = CVdataframe.append(performances)
  CVdataframe = CVdataframe.append(performances)
  CVdataframe = CVdataframe.append(performances)
  CVdataframe = CVdataframe.append(performances)
  CVdataframe = CVdataframe.append(performances)


In [41]:
CVdataframe['mean rms Error'] = CVdataframe['mean rms Error'].astype(int)
CVdataframe['Standard deviation'] = CVdataframe['Standard deviation'].astype(int)

In [42]:
#using crossvalidation
CVdataframe

Unnamed: 0,RegressionCV,mean rms Error,Standard deviation,CV_execution_time
0,LinearRegression(),67914,1332,0.125992
0,"DecisionTreeRegressor(max_depth=2, random_stat...",82280,329,0.212018
0,DecisionTreeRegressor(random_state=42),70078,1245,1.25393
0,"RandomForestRegressor(n_jobs=-1, random_state=42)",50164,844,39.571921
0,SVR(kernel='linear'),111689,664,103.901492
0,SGDRegressor(random_state=42),121779362,199133249,0.161992


# Comparing models performance 

In [43]:
#without crossvalidation
dataframe

Unnamed: 0,Regression,rms Error,training_execution_time,prediction_execution_time
0,LinearRegression(),67605.24,2.018914,0.033979
0,"DecisionTreeRegressor(max_depth=2, random_stat...",81875.86,0.118012,0.005998
0,DecisionTreeRegressor(random_state=42),0.0,0.421976,0.015004
0,"RandomForestRegressor(n_jobs=-1, random_state=42)",18430.4,11.011424,0.300985
0,SVR(kernel='linear'),109710.76,36.541164,24.744713
0,SGDRegressor(random_state=42),557822.32,0.100968,0.005985


In [44]:
#using crossvalidation
CVdataframe

Unnamed: 0,RegressionCV,mean rms Error,Standard deviation,CV_execution_time
0,LinearRegression(),67914,1332,0.125992
0,"DecisionTreeRegressor(max_depth=2, random_stat...",82280,329,0.212018
0,DecisionTreeRegressor(random_state=42),70078,1245,1.25393
0,"RandomForestRegressor(n_jobs=-1, random_state=42)",50164,844,39.571921
0,SVR(kernel='linear'),111689,664,103.901492
0,SGDRegressor(random_state=42),121779362,199133249,0.161992


 Random Forests look very promising. However, note that the score on the training set is still much lower than on the validation sets, meaning that the model is still overfitting the training set. Possible solutions for overfitting are to simplify the model, constrain it (i.e., regularize it), or get a lot more training data. However, before you dive much deeper in Random Forests, you should try out many other models from various categories of Machine Learning algorithms (several Sup‐ port Vector Machines with different kernels, possibly a neural network, etc.), without spending too much time tweaking the hyperparameters. The goal is to shortlist a few (two to five) promising models.

# Fine-Tune Your Model

In [127]:
###Grid Search

All you need to
do is tell it which hyperparameters you want it to experiment with, and what values to
try out, and it will evaluate all the possible combinations of hyperparameter values,
using cross-validation.

For example, the following code searches for the best combi‐
nation of hyperparameter values for the RandomForestRegressor

In [47]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(X_train_prepared_DF,y_train)

In [48]:

print(grid_search.best_params_)
print(grid_search.best_estimator_)
print('best score:  ',grid_search.best_score_)

{'max_features': 6, 'n_estimators': 30}
RandomForestRegressor(max_features=6, n_estimators=30)
best score:   -2423720074.697801


In [52]:
#grid search step2

from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators':[30,150], 'max_features':[6]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(X_train_prepared_DF,y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)
print('best score:  ',grid_search.best_score_)

{'max_features': 6, 'n_estimators': 150}
RandomForestRegressor(max_features=6, n_estimators=150)
best score:   -2357617924.810178


In [53]:
# evaluation scores are also available:
a = grid_search.cv_results_['mean_test_score']
b = grid_search.cv_results_['params']

In [54]:
for i,j in zip(a,b):
    print(np.sqrt(-i), j)
#best rmse 50269.1

49709.15641968289 {'max_features': 6, 'n_estimators': 30}
48555.307895328784 {'max_features': 6, 'n_estimators': 150}
60760.167326288654 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53289.47516647434 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59412.310145698604 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52006.20705888377 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
57698.26317671441 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
50853.675809082015 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}



The RMSE score for this
combination is 49709, which is slightly better than the score you got earlier using the
default hyperparameter values (which was 50164).

# Evaluate Your System on the Test Set

In [56]:
grid_search.best_estimator_

In [63]:
final_model = grid_search.best_estimator_
X_test = test.drop("median_house_value", axis=1)
y_test = test["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)


final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [62]:
print('mse:',final_mse)
print('rmse:',final_rmse)
print('R2 SCORE',final_model.score(X_test_prepared,y_test))

mse: 2395830120.968115
rmse: 48947.21770405459
R2 SCORE 0.8203499317047807
