Importing pandas and numpy

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Reading the csv file

In [2]:
california_housing = pd.read_csv("housing.csv")

Observing the dataframe

In [3]:
california_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
california_housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


Dropping Rows with Null values

In [5]:
california_housing=california_housing.dropna()
california_housing=california_housing.dropna(axis=0)

In [6]:
california_housing.isna().sum(axis = 0)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [7]:
X, y = california_housing.iloc[:, 0:-2].values, california_housing.iloc[:, -2:-1].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

print ('Training Set: %d rows\nTest Set: %d rows' % (X_train.shape[0], X_test.shape[0]))


Training Set: 17368 rows
Test Set: 3065 rows


In [9]:
X_train.shape, y_train.shape, X_test.shape,y_test.shape

((17368, 8), (17368, 1), (3065, 8), (3065, 1))

In [10]:
X

array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])

Firstly, we import __*StandardScaler*__ which standardize features by removing the mean and scaling to unit variance, __*MinMaxScaler*__ is generally used to normalize the features and __*FunctionTransformer*__ also.

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from copy import deepcopy

In [12]:
std_scaler = StandardScaler().fit(X_train[:, :2])
min_max_scaler = MinMaxScaler().fit(X_train[:,2:])

def preprocessor(X):
    A = np.copy(X)
    A[:, :2] = std_scaler.transform(X[:, :2])
    A[:, 2:] = min_max_scaler.transform(X[:, 2:])
    return A

__1. StandardScaler__ Standardize features by removing the mean and scaling to unit variance.

The standard score of a sample x is calculated as:

z = (x - u) / s

where u is the mean of the training samples or zeo if *with_mean=False*, and s is the *standard deviation* of the training samples or one if with_std=False.

__2. MinMaxScaler__ Transform features by scaling each feature to a given range.

This estimator scales and translates each feature individually such that it is in the given range on the training set. (between zero and one most of the times) (It basically performs *normalization*)

__3. np.copy__ Return an array copy of the given object.

It is a *shallow copy* and will not copy object elements within arrays. This is mainly important for arrays containing Python objects. This is used if modifications can occur. But if we want to copy the elements also, we can use __*np.deepcopy()*__

__4. .transform()__ is used to retain the same number of items as the original dataset after performing the transformation.

In [13]:
X

array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])

In [14]:
preprocessor(X_test)

array([[ 1.24026441, -1.36964842,  0.70588235, ...,  0.03632592,
         0.06150304,  0.16379774],
       [ 0.82588005, -0.71009717,  0.37254902, ...,  0.06701797,
         0.11708601,  0.20862471],
       [ 1.23027925, -1.34626008,  0.19607843, ...,  0.0532556 ,
         0.11675711,  0.05826816],
       ...,
       [-1.38084149,  0.89902078,  0.98039216, ...,  0.02424531,
         0.05064956,  0.61168122],
       [-1.33590825,  2.12924759,  0.74509804, ...,  0.0364941 ,
         0.08189443,  0.07919891],
       [ 1.23027925, -1.43045811,  0.25490196, ...,  0.10250301,
         0.19602039,  0.12588792]])

In [15]:
preprocess_transformer = FunctionTransformer(preprocessor)

__FunctionTransformer() :__ A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc. Let's look at a simple Example:



In [16]:
preprocess_transformer

FunctionTransformer(func=<function preprocessor at 0x000001F84DF6A700>)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

p1 = Pipeline([('Scaler', preprocess_transformer), ('Linear Regression', LinearRegression())])
p1

Pipeline(steps=[('Scaler',
                 FunctionTransformer(func=<function preprocessor at 0x000001F84DF6A700>)),
                ('Linear Regression', LinearRegression())])

Generally, Transformers are usually combined with classifiers, regressors or other estimators to build a composite estimator. The most common tool is a Pipeline.

__Pipeline()__ is used to generate a Pipeline of transform(mostly preprocessing functions) with a final estimator(mostly ML algorithms).

We sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, i.e., they must implement fit and transform methods. The final estimator only needs to implement fit. The transformers in the pipeline can be cached using memory argument.

The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.

In [18]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    p.fit(X_train, y_train)
    train_preds = p.predict(X_train)
    test_preds = p.predict(X_test)
    print('Training Error :' + str(mean_absolute_error(train_preds, y_train)))
    print('Testing Error :' + str(mean_absolute_error(test_preds, y_test)))
    


In [19]:
fit_and_print(p1)

Training Error :50872.68798151339
Testing Error :50899.872261996825


In [20]:
from sklearn.neighbors import KNeighborsRegressor as KNR

p2 = Pipeline([('Scaler', preprocess_transformer), ('KNN Regression', KNR(n_neighbors = 7))])


In [21]:
fit_and_print(p2)

Training Error :30249.16602783444
Testing Error :34871.620554649264


In [22]:
from sklearn.ensemble import RandomForestRegressor as RFR

p3 =Pipeline([('Scaler', preprocess_transformer), ('Random Forest', RFR(n_estimators = 10, max_depth = 7))])


In [23]:
fit_and_print(p3)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Training Error :41150.53222654324
Testing Error :43297.92713618693
