In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from scipy.stats import randint

### Dataset

In [2]:
housing = pd.read_csv('../datasets/housing.csv')

In [3]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Attributes to be Processed

In [5]:
#Missing values
print(housing.isna().value_counts())
print(housing.duplicated().value_counts())

longitude  latitude  housing_median_age  total_rooms  total_bedrooms  population  households  median_income  median_house_value  ocean_proximity
False      False     False               False        False           False       False       False          False               False              20433
                                                      True            False       False       False          False               False                207
Name: count, dtype: int64
False    20640
Name: count, dtype: int64


In [6]:
housing['total_bedrooms'].isna().value_counts()

total_bedrooms
False    20433
True       207
Name: count, dtype: int64

- From the above code blocks we can see that total bedrooms have 'na' values because it returned true for df.isna() function
- Therefore we impute - replace na with meadian values function
- No duplicate rows

In [7]:
print(pd.unique(housing['ocean_proximity']))
print(housing['ocean_proximity'].value_counts())

['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64


We have one ocean proximity which is categorical in nature, and should be encoded in order to be used as a feature. Regular label encoding may be used, but ML algorithms percieve their values in way where closer number values are assumed to be more proximal than farther numbers. Hence we use OneHotEncoding where we make each category as an attribute and have binary values(1 or 0) for every row.

### Pipeline Implementation

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
import logging


class NaImputer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, housing):
        print('Imputing na values...')
        
        imputer = SimpleImputer(strategy='median')
        housing['total_bedrooms'] = imputer.fit_transform(housing[['total_bedrooms']])
        
        print('Imputed na values.\n')
        
        return housing


class FeatureEncoder(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, housing):
        print('OneHotEncoding column, ocean_proximity ...')
        
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
        column_names = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
        for i in range(len(matrix.T)):
            housing[column_names[i]] = matrix.T[i]
        
        print('New Columns added \nocean_proximity dropped from dataframe \nEncoding completed.\n')
        
        return housing.drop(['ocean_proximity'], axis=1)


class SelectAttributes(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, housing):
        print('Computing correlation matrix ...')
        
        corr_matrix = housing.corr()
        
        print('Correlations for median_house_value')
        print(corr_matrix['median_house_value'].apply(abs).sort_values(ascending=False))
        
        corr_matrix['median_house_value'].apply(abs).sort_values(ascending=False)
        columns = housing.columns
        selected_columns = []
        print('Selecting most important columns...')
        # corr_threshold = 0.13
        corr_threshold=float(input('\nEnter the correlation threshold\n(0 - to select all columns \n0.13 - to select highly correlated columns  \n:)'))
        
        for col in columns:
            try:
                if abs(corr_matrix[col]['median_house_value']) > corr_threshold:
                    selected_columns.append(col)
            except KeyError:
                pass
                
        print('Completed selecting important columns\n')
        
        return housing[selected_columns]


class ApplyModel(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, housing_selected):
        print('Applying train-test-split on housing data...')
        
        X, y = housing_selected.drop(["median_house_value"],axis=1), housing['median_house_value']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        print('Train-test-split completed.\n')
        print('Scaling values...')
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        print('Scaling completed.\n')
        print('Applying RandomForestRegressor \nApplying RandomizedSearchCV')

        param_grid=[
            {
             'n_estimators': randint(low=1, high=200),
             'max_features': randint(low=1, high=8),
            }
        ]
        rfr = RandomForestRegressor()
        clf = RandomizedSearchCV(rfr, param_grid, cv=5, n_iter=10, scoring='neg_mean_squared_error')

        print('Training the Model, Please wait, this might take a moment...')
        
        clf.fit(X_train.values, y_train.values)

        print('Model Training completed.\n')

        print('Printing estimator results : \n')
        cvres = clf.cv_results_
        for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
            print(np.sqrt(-mean_score), params)

        print('\nScore of the best estimator is ')
        print(clf.best_estimator_.score(X.values,y.values))
        return clf.best_estimator_.score(X.values,y.values)

In [16]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ('Imputer', NaImputer()),
        ('Encoder', FeatureEncoder()),
        ('Attribute Selector', SelectAttributes()),
        ('RandomForestRegressor with RandomizedSearchCV', ApplyModel())
    ]
)

pipe.fit_transform(housing)

Imputing na values...
Imputed na values.

OneHotEncoding column, ocean_proximity ...
New Columns added 
ocean_proximity dropped from dataframe 
Encoding completed.

Computing correlation matrix ...
Correlations for median_house_value
median_house_value    1.000000
median_income         0.688075
INLAND                0.484859
<1H OCEAN             0.256617
NEAR BAY              0.160284
latitude              0.144160
ISLAND                0.141862
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049457
longitude             0.045967
population            0.024650
NEAR OCEAN            0.023416
Name: median_house_value, dtype: float64
Selecting most important columns...



Enter the correlation threshold
(0 - to select all columns 
0.13 - to select highly correlated columns  
:) 0.13


Completed selecting important columns

Applying train-test-split on housing data...
Train-test-split completed.

Scaling values...
Scaling completed.

Applying RandomForestRegressor 
Applying RandomizedSearchCV
Training the Model, Please wait, this might take a moment...
Model Training completed.

Printing estimator results : 

66057.40527209028 {'max_features': 5, 'n_estimators': 49}
65788.31477948448 {'max_features': 6, 'n_estimators': 128}
64969.90571395005 {'max_features': 2, 'n_estimators': 118}
64962.40893134692 {'max_features': 2, 'n_estimators': 168}
65161.21371610694 {'max_features': 4, 'n_estimators': 195}
65617.70144778489 {'max_features': 5, 'n_estimators': 133}
64816.087953100214 {'max_features': 2, 'n_estimators': 193}
65939.886939182 {'max_features': 7, 'n_estimators': 142}
66108.8284105395 {'max_features': 7, 'n_estimators': 98}
66305.36030735272 {'max_features': 7, 'n_estimators': 52}

Score of the best estimator is 
0.9095549448330557


0.9095549448330557