In [53]:
import os
import pandas as pd
# from settings import ROOT_PATH

ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(""))), "machine-learning\\chp2")
HOUSING_PATH= os.path.join(os.path.join(ROOT,"datasets"), "housing")
DATASET_PATH = os.path.join(HOUSING_PATH, "housing.csv")
print(DATASET_PATH)

with open(DATASET_PATH) as f:
    data = pd.read_csv(f)
    

x:\E\Documents\Code\Repo\machine-learning\chp2\datasets\housing\housing.csv


In [54]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [56]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [57]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler())
    ]
)


In [58]:
# handle null values
data_num = data.drop("ocean_proximity", axis=1)
data_cat = data["ocean_proximity"]

In [59]:
def check_null(data: pd.DataFrame):
    null_col = []
    for col in data.columns:
        if data[col].isnull().any():
            null_col.append(col)
    if len(null_col) > 0:
        print(f"Category [{col}] contains null values")
    else:
        print("No nulls detected")

check_null(data_num)

Category [median_house_value] contains null values


In [60]:
imp = SimpleImputer(strategy="median")
imp_p = imp.fit_transform(data_num)
# [:-1] is kinda a hack to mute error. It's ok because this is just used to check if imputer is working as expected
check = pd.DataFrame(imp_p, columns=data.columns[:-1])
check_null(check)

No nulls detected


In [61]:
td = data.drop("ocean_proximity", axis=1)
td.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [62]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# I want to estimate median income
# However I need to make sure that the dataset splitting results in representative dataset for both training and testing
# Hence, I need to create a new category income_cat to properly categorise median income into proper stratums
# Doing this is important as without defining my own stratum, the estimate of correlation between median income and other attributes may be biased
data["income_cat"] = pd.cut(data["median_income"], bins=[0., 1.5, 3, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for tr_idx, tst_idx in split.split(data, data["income_cat"]):
    strat_tr_s = data.loc[tr_idx]
    strat_tst_s = data.loc[tst_idx]

strat_tst_s["income_cat"].value_counts() / len(strat_tst_s)

income_cat
3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: count, dtype: float64

In [63]:
for s in (strat_tr_s, strat_tst_s):
    # inplace=True overwrites instead of returning a copy
    s.drop("income_cat", axis=1, inplace=True)

In [64]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

attr_data = strat_tr_s.drop("median_income", axis=1)
num_data = attr_data.drop("ocean_proximity", axis=1)
cat_data = attr_data["ocean_proximity"].copy()
target_data = strat_tr_s["median_income"].copy()

def transformer_func (num_pipe, num_col, encoder, col):
    return ColumnTransformer(
        [
            ("numerical", num_pipe, num_col),
            ("categorical", encoder, col)
        ]
    )
    
# transformer = ColumnTransformer(
#     [
#         ("numerical", num_pipe, list(num_data)),
#         ("categorical", OneHotEncoder(), ["ocean_proximity"])
#     ]
# )
transformer = transformer_func(num_pipe, list(num_data), OneHotEncoder(), ["ocean_proximity"])

In [65]:
transformed_data = transformer.fit_transform(attr_data)

In [66]:
len(transformed_data[0])

13

In [67]:
strat_tr_s["median_income"].isnull().any()

False

In [68]:
from sklearn.svm import LinearSVR

linear_reg = LinearSVR()
linear_reg.fit(transformed_data, target_data)



In [69]:
tr_predictions = linear_reg.predict(transformed_data)

In [70]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error

mse = mean_squared_error(target_data, tr_predictions)
rmse = root_mean_squared_error(target_data, tr_predictions)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

MSE: 1.2698100592313015
RMSE: 1.1268584912185298


In [76]:
tst_attr_data = strat_tst_s.drop("median_income", axis=1)
tst_target_data = strat_tst_s["median_income"].copy()
tst_transformed_data = transformer.transform(tst_attr_data)

In [77]:
tst_predictions = linear_reg.predict(tst_transformed_data)

In [78]:
tst_mse = mean_squared_error(tst_target_data, tst_predictions)
tst_rmse = root_mean_squared_error(tst_target_data, tst_predictions)
print(f"MSE: {tst_mse}")
print(f"RMSE: {tst_rmse}")

MSE: 1.3095807007950844
RMSE: 1.1443691278582642
