# Import basic libraries

In [1]:
import pandas as pd
import numpy as np

# Import the dataset

In [2]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


Let's look at the data types and null values in the dataset.

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Five number summary of the dataset:

In [4]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

We see that total_bedrooms has some null values. We will impute median values in place of nulls.

# Imputation

In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [7]:
df_num = df.drop("ocean_proximity", axis=1)

In [8]:
imputer.fit(df_num)

SimpleImputer(strategy='median')

In [9]:
imputer.statistics_

array([-1.1849e+02,  3.4260e+01,  2.9000e+01,  2.1270e+03,  4.3500e+02,
        1.1660e+03,  4.0900e+02,  3.5348e+00,  1.7970e+05])

In [10]:
df_num.median().values

array([-1.1849e+02,  3.4260e+01,  2.9000e+01,  2.1270e+03,  4.3500e+02,
        1.1660e+03,  4.0900e+02,  3.5348e+00,  1.7970e+05])

In [11]:
X = imputer.transform(df_num)

In [12]:
df_tr = pd.DataFrame(X, columns=df_num.columns,
                          index=df.index)

In [13]:
df_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Now we do not have any null values.

# One hot encoding categorical column - ocean_proximity

In [14]:
housing_cat = df[['ocean_proximity']]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
5,NEAR BAY
6,NEAR BAY
7,NEAR BAY
8,NEAR BAY
9,NEAR BAY


In [15]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [16]:
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [17]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

#### Creating a pipeline to make all transofrmation in one go on the original dataset 

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

df_num_tr = num_pipeline.fit_transform(df_num.drop('median_house_value', axis = 1))

In [19]:
df_num_tr

array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.9744286 ,
        -0.97703285,  2.34476576],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.86143887,
         1.66996103,  2.33223796],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.82077735,
        -0.84363692,  1.7826994 ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.3695372 ,
        -0.17404163, -1.14259331],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.60442933,
        -0.39375258, -1.05458292],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.03397701,
         0.07967221, -0.78012947]])

In [20]:
from sklearn.compose import ColumnTransformer

num_attribs = list(df_num.drop('median_house_value', axis = 1))
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

df_prepared = full_pipeline.fit_transform(df.drop('median_house_value', axis = 1))

In [21]:
df_prepared

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])

# Feature selection

In [22]:
df_prepared.shape

(20640, 13)

These will be our explanatory variables.

In [23]:
# Response variable
y = df['median_house_value']

# Splitting data into train and test datset

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_prepared, y, random_state=42)

# Creating SVR model - linear

In [25]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
svm_reg = SVR(kernel="linear")
svm_reg.fit(X_train, y_train)
predictions = svm_reg.predict(X_test)
svm_mse = mean_squared_error(y_test, predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

111393.26023165749

In [26]:
svm_reg.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

# Creating SVR model - rbf

In [27]:
svm_reg = SVR(kernel="rbf")
svm_reg.fit(X_train, y_train)
predictions = svm_reg.predict(X_test)
svm_mse = mean_squared_error(y_test, predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

117732.99931829379

In [28]:
svm_reg.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

# Using GridSearchCV to find tbe best hyper parameters

In [29]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'kernel': ['linear'], 'C': [2**x for x in range(10,18)]},
             {'kernel': ['rbf'], 'C': [2**x for x in range(10,18)],\
              'gamma' : ['scale', 'auto']}]
# train across 3 folds, that's a total of (8+16)*3=72 rounds of training 
grid_search = GridSearchCV(estimator=SVR(), param_grid= param_grid, cv=3,
                           return_train_score=True, verbose = 3)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END ..........................C=1024, kernel=linear; total time=  11.3s
[CV 2/3] END ..........................C=1024, kernel=linear; total time=  11.0s
[CV 3/3] END ..........................C=1024, kernel=linear; total time=  11.5s
[CV 1/3] END ..........................C=2048, kernel=linear; total time=  10.9s
[CV 2/3] END ..........................C=2048, kernel=linear; total time=  10.0s
[CV 3/3] END ..........................C=2048, kernel=linear; total time=  10.1s
[CV 1/3] END ..........................C=4096, kernel=linear; total time=  10.7s
[CV 2/3] END ..........................C=4096, kernel=linear; total time=  11.3s
[CV 3/3] END ..........................C=4096, kernel=linear; total time=  10.2s
[CV 1/3] END ..........................C=8192, kernel=linear; total time=   7.3s
[CV 2/3] END ..........................C=8192, kernel=linear; total time=   7.6s
[CV 3/3] END ..........................C=8192, k

GridSearchCV(cv=3, estimator=SVR(),
             param_grid=[{'C': [1024, 2048, 4096, 8192, 16384, 32768, 65536,
                                131072],
                          'kernel': ['linear']},
                         {'C': [1024, 2048, 4096, 8192, 16384, 32768, 65536,
                                131072],
                          'gamma': ['scale', 'auto'], 'kernel': ['rbf']}],
             return_train_score=True, verbose=3)

In [30]:
grid_search.best_params_

{'C': 131072, 'gamma': 'scale', 'kernel': 'rbf'}

In [31]:
grid_search.best_score_

0.7583448195953707

In [32]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results[['param_C', 'param_kernel' , 'param_gamma', 'rank_test_score', 'mean_test_score', 'mean_train_score']]

Unnamed: 0,param_C,param_kernel,param_gamma,rank_test_score,mean_test_score,mean_train_score
0,1024,linear,,22,0.630761,0.631668
1,2048,linear,,21,0.631739,0.632643
2,4096,linear,,20,0.632348,0.633162
3,8192,linear,,19,0.632665,0.633513
4,16384,linear,,18,0.632879,0.633719
5,32768,linear,,17,0.633017,0.633862
6,65536,linear,,16,0.633071,0.633919
7,131072,linear,,15,0.633152,0.634086
8,1024,rbf,scale,24,0.597752,0.599367
9,1024,rbf,auto,23,0.6083,0.609533


In [35]:
predictions = grid_search.predict(X_test)
grid_search_mse = mean_squared_error(y_test, predictions)
grid_search_rmse = np.sqrt(grid_search_mse)
grid_search_rmse

56723.56399853856