In [218]:
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import NoReturn
from typing import Tuple
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
filename = "../datasets/house_prices.csv"

In [219]:
# Load the raw dataset from the CSV path stored in `filename`.
df = pd.read_csv(filename)
# Show (n_rows, n_columns) as a quick sanity check of the load.
df.shape

(21616, 21)

In [220]:
# Inspect the dtype pandas inferred for every column.
# NOTE(review): integer-like columns (e.g. id, zipcode, yr_built) come back as float64,
# which usually means they contain missing values - confirm before relying on them.
df.dtypes

id               float64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view             float64
condition        float64
grade            float64
sqft_above       float64
sqft_basement    float64
yr_built         float64
yr_renovated     float64
zipcode          float64
lat              float64
long             float64
sqft_living15    float64
sqft_lot15       float64
dtype: object

In [221]:
# Remove rows that are exact duplicates across all columns (the first occurrence is kept).
df = df.drop_duplicates()

In [222]:
# Validating features:
# A row is discarded when it cannot describe a real sale, i.e. when
#   * it has neither bedrooms nor bathrooms (both <= 0), or
#   * sqft_living, sqft_lot or sqft_above is non-positive, or
#   * sqft_basement, sqft_living15 or sqft_lot15 is negative, or
#   * the price is negative or missing.
# NOTE(review): rows with price == 0 pass this filter - confirm that is intended.
no_rooms = (df.bedrooms <= 0) & (df.bathrooms <= 0)
bad_area = (
    (df.sqft_living <= 0) | (df.sqft_lot <= 0) | (df.sqft_above <= 0)
    | (df.sqft_basement < 0) | (df.sqft_living15 < 0) | (df.sqft_lot15 < 0)
)
bad_price = (df.price < 0) | df.price.isnull()

# One combined boolean mask selects every invalid row exactly once, so no
# concat + drop_duplicates round trip is needed. The non-inplace drop returns
# a new frame instead of mutating `df` behind the reader's back.
to_remove = df.loc[no_rooms | bad_area | bad_price]
df = df.drop(index=to_remove.index)

In [223]:
# Convert the raw date strings to datetimes; values that cannot be parsed
# are coerced to NaT rather than raising.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df = df.assign(date=parsed_dates)

In [224]:
# Delete samples with no date (missing in the file, or coerced to NaT while parsing).
df = df.dropna(subset=['date'])

In [225]:
# Derive numeric calendar features from the sale date.
# Note: `sale_day` stores the day of the week (Monday=0 ... Sunday=6),
# not the day of the month. The `date` column itself is dropped later.
df = df.assign(
    sale_year=df['date'].dt.year,
    sale_month=df['date'].dt.month,
    sale_day=df['date'].dt.weekday,
)

In [226]:
# Drop the columns that are not used as features.
redundant_columns = ['id', 'date', 'lat', 'long']
df = df.drop(columns=redundant_columns)

In [227]:
# Age of the house at sale time, counted from whichever is later:
# the year it was built or the year it was renovated.
#     age := sale_year - max(yr_built, yr_renovated)
last_update_year = np.maximum(df['yr_built'], df['yr_renovated'])
df['age'] = df['sale_year'] - last_update_year

# TODO: the author observed 18 rows with a negative age - decide how to handle them.
# They can be inspected with `df.loc[df.age < 0]`.

In [228]:
# Ratio features: each `<name>15` column divided by its `<name>` counterpart,
# stored as `<name>_ratio`.
# Division by zero cannot occur: rows with sqft_living <= 0 or sqft_lot <= 0
# were removed during validation.
for base_col in ('sqft_living', 'sqft_lot'):
    df[f'{base_col}_ratio'] = df[f'{base_col}15'] / df[base_col]

In [229]:
# Zip codes are categorical labels, so replace the `zipcode` column with one
# indicator column per distinct value (one-hot encoding).
zipcode_count = pd.unique(df['zipcode']).size
print("number of unique zipcode values: ", zipcode_count, "\n")
df = pd.get_dummies(df, columns=['zipcode'])


number of unique zipcode values:  70 



In [230]:
# Separate the response vector: `pop` removes the 'price' column from `df`
# in place and returns it, leaving `df` with feature columns only.
response = df.pop('price')
# Display the response vector.
response

0        221900.0
1        538000.0
2        180000.0
3        604000.0
4        510000.0
           ...   
21611    360000.0
21612    400000.0
21613    402101.0
21614    400000.0
21615    325000.0
Name: price, Length: 21602, dtype: float64

In [231]:
# Display the feature matrix after preprocessing (the price column has already been popped).
# NOTE(review): this renders the whole frame; `df.head()` would give a lighter preview.
df

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,...,zipcode_98146.0,zipcode_98148.0,zipcode_98155.0,zipcode_98166.0,zipcode_98168.0,zipcode_98177.0,zipcode_98178.0,zipcode_98188.0,zipcode_98198.0,zipcode_98199.0
0,3,1.00,1180,5650,1.0,0,0.0,3.0,7.0,1180.0,...,0,0,0,0,0,0,1,0,0,0
1,3,2.25,2570,7242,2.0,0,0.0,3.0,7.0,2170.0,...,0,0,0,0,0,0,0,0,0,0
2,2,1.00,770,10000,1.0,0,0.0,3.0,6.0,770.0,...,0,0,0,0,0,0,0,0,0,0
3,4,3.00,1960,5000,1.0,0,0.0,5.0,7.0,1050.0,...,0,0,0,0,0,0,0,0,0,0
4,3,2.00,1680,8080,1.0,0,0.0,3.0,8.0,1680.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21611,3,2.50,1530,1131,3.0,0,0.0,3.0,8.0,1530.0,...,0,0,0,0,0,0,0,0,0,0
21612,4,2.50,2310,5813,2.0,0,0.0,3.0,8.0,2310.0,...,1,0,0,0,0,0,0,0,0,0
21613,2,0.75,1020,1350,2.0,0,0.0,3.0,7.0,1020.0,...,0,0,0,0,0,0,0,0,0,0
21614,3,2.50,1600,2388,2.0,0,0.0,3.0,8.0,1600.0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
def split_train_test(X: pd.DataFrame, y: pd.Series, train_proportion: float = .25) \
        -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Randomly split a dataset into a training portion and a testing portion.

    The split is reproducible: a fixed seed is used for both the selection of
    the training rows and the shuffling of the remaining test rows.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, one row per sample.
    y : pd.Series
        Responses, carrying the same index labels as ``X``. The labels are
        expected to be unique.
    train_proportion : float, default=.25
        Fraction of the samples assigned to the training set.

    Returns
    -------
    train_X : pd.DataFrame
        Randomly selected training rows of ``X``.
    train_y : pd.Series
        Responses of ``train_X``, in the same order.
    test_X : pd.DataFrame
        All remaining rows of ``X``, shuffled.
    test_y : pd.Series
        Responses of ``test_X``, in the same order.
    """
    train_X = X.sample(frac=train_proportion, random_state=7)
    test_X = X.drop(train_X.index).sample(frac=1, random_state=7)

    # Select the responses by the labels of the sampled rows. Sampling ``y``
    # separately would only stay aligned with ``X`` as long as both happen to
    # have the same length and row order; label lookup is aligned by construction.
    train_y = y.loc[train_X.index]
    test_y = y.loc[test_X.index]
    return train_X, train_y, test_X, test_y


In [233]:
# Split the data: the third argument is the training proportion, so 75% of the
# samples are used for training and the remaining 25% are held out for testing.
train_data, train_responses, test_data, test_responses = split_train_test(df, response, 0.75)
# Display the held-out feature matrix.
test_data

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,...,zipcode_98146.0,zipcode_98148.0,zipcode_98155.0,zipcode_98166.0,zipcode_98168.0,zipcode_98177.0,zipcode_98178.0,zipcode_98188.0,zipcode_98198.0,zipcode_98199.0
20709,2,1.75,1120,758,2.0,0,0.0,3.0,7.0,1120.0,...,0,0,0,0,0,0,0,0,0,0
20358,4,2.50,2470,5954,2.0,0,0.0,3.0,8.0,2470.0,...,0,0,0,0,0,0,0,0,0,0
6231,4,2.50,2960,6031,2.0,0,0.0,3.0,9.0,2960.0,...,0,0,0,0,0,0,0,0,0,0
12984,3,1.00,900,4770,1.0,0,0.0,3.0,6.0,900.0,...,0,0,0,0,0,0,0,0,0,0
13510,3,1.00,970,8378,1.0,0,0.0,4.0,6.0,970.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17209,5,2.75,3090,19865,1.0,0,0.0,4.0,9.0,3090.0,...,0,0,0,0,0,0,0,0,0,0
10087,3,1.00,1150,10132,1.0,0,0.0,4.0,7.0,1150.0,...,0,0,0,0,0,0,0,0,0,0
2033,4,1.00,1010,6000,1.0,0,0.0,3.0,6.0,750.0,...,0,0,0,0,0,0,0,0,0,0
4741,3,2.25,2780,31510,2.0,0,0.0,3.0,8.0,2780.0,...,0,0,0,0,0,0,0,0,0,0


In [234]:
class BaseEstimator(ABC):
    """
    Abstract base class for supervised estimators.

    Subclasses implement the private hooks ``_fit``, ``_predict`` and ``_loss``.
    The public ``fit`` / ``predict`` / ``loss`` wrappers keep track of whether
    the estimator has been fitted and refuse to predict or evaluate before that.

    Attributes
    ----------
    fitted_ : bool
        ``False`` on construction; set to ``True`` once ``fit`` has completed.
    """

    def __init__(self) -> None:
        """Create an estimator that has not been fitted yet."""
        self.fitted_ = False

    def fit(self, X: np.ndarray, y: np.ndarray) -> "BaseEstimator":
        """
        Fit the estimator to the given samples and responses.

        Parameters
        ----------
        X : np.ndarray
            Input data to fit the estimator to.
        y : np.ndarray
            Responses of the input data.

        Returns
        -------
        BaseEstimator
            The fitted estimator itself, to allow call chaining.
        """
        self._fit(X, y)
        # Only mark as fitted after `_fit` returned without raising.
        self.fitted_ = True
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict responses for the given samples.

        Parameters
        ----------
        X : np.ndarray
            Input data to predict responses for.

        Returns
        -------
        np.ndarray
            Whatever the subclass's ``_predict`` returns for ``X``.

        Raises
        ------
        ValueError
            If the estimator has not been fitted yet.
        """
        if not self.fitted_:
            raise ValueError("Estimator must first be fitted before calling ``predict``")
        return self._predict(X)

    def loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Evaluate the estimator on the given samples under its loss function.

        Parameters
        ----------
        X : np.ndarray
            Test samples.
        y : np.ndarray
            True responses of the test samples.

        Returns
        -------
        float
            Whatever the subclass's ``_loss`` returns for ``(X, y)``.

        Raises
        ------
        ValueError
            If the estimator has not been fitted yet.
        """
        if not self.fitted_:
            raise ValueError("Estimator must first be fitted before calling ``loss``")
        return self._loss(X, y)

    @abstractmethod
    def _fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit hook to be implemented by subclasses; called by ``fit``."""
        raise NotImplementedError()

    @abstractmethod
    def _predict(self, X: np.ndarray) -> np.ndarray:
        """Prediction hook to be implemented by subclasses; called by ``predict``."""
        raise NotImplementedError()

    @abstractmethod
    def _loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """Loss hook to be implemented by subclasses; called by ``loss``."""
        raise NotImplementedError()

    def fit_predict(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Fit the estimator on ``(X, y)`` and return its predictions for the same ``X``.

        Parameters
        ----------
        X : np.ndarray
            Input data to fit the estimator to and predict for.
        y : np.ndarray
            Responses of the input data.

        Returns
        -------
        np.ndarray
            Predicted responses of ``X``.
        """
        self.fit(X, y)
        return self.predict(X)

def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Compute the mean squared error between true and predicted responses.

    Parameters
    ----------
    y_true : np.ndarray
        True response values.
    y_pred : np.ndarray
        Predicted response values.

    Returns
    -------
    float
        Mean of the element-wise squared differences ``(y_true - y_pred) ** 2``.
    """
    residuals = y_true - y_pred
    squared_residuals = np.square(residuals)
    return squared_residuals.mean()

class LinearRegression(BaseEstimator):
    """
    Linear regression estimator solved by least squares.

    Attributes
    ----------
    include_intercept_ : bool
        Whether a column of ones is prepended to the design matrix so that an
        intercept is fitted together with the other coefficients.
    coefs_ : np.ndarray or None
        Fitted coefficients vector; ``None`` until the model is fitted. When an
        intercept is included it is the first entry.
    """

    def __init__(self, include_intercept: bool = True) -> None:
        """
        Create an unfitted linear regression model.

        Parameters
        ----------
        include_intercept : bool, default=True
            Whether the model should also fit an intercept.
        """
        super().__init__()
        self.include_intercept_, self.coefs_ = include_intercept, None

    def _design_matrix(self, X: np.ndarray) -> np.ndarray:
        """Return ``X`` with a leading column of ones if an intercept is included, else ``X`` itself."""
        if self.include_intercept_:
            return np.c_[np.ones(X.shape[0]), X]
        return X

    def _fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Fit the coefficients to the given samples.

        Parameters
        ----------
        X : np.ndarray
            Input data; rows are samples (``X.shape[0]`` is the number of samples).
        y : np.ndarray
            Responses of the input data.
        """
        # The coefficients vector is the Moore-Penrose pseudo-inverse of the
        # design matrix multiplied with y (a least-squares solution).
        self.coefs_ = np.linalg.pinv(self._design_matrix(X)) @ y

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict responses for the given samples using the fitted coefficients.

        Parameters
        ----------
        X : np.ndarray
            Input data to predict responses for; rows are samples.

        Returns
        -------
        np.ndarray
            Product of the design matrix of ``X`` with the fitted coefficients.
        """
        return self._design_matrix(X) @ self.coefs_

    def _loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Evaluate the model under the mean squared error loss.

        Parameters
        ----------
        X : np.ndarray
            Test samples.
        y : np.ndarray
            True responses of the test samples.

        Returns
        -------
        float
            Mean squared error between ``y`` and the predictions for ``X``.
        """
        return mean_square_error(y, self._predict(X))

In [235]:
# Learning curve: for every percentage p in 10..100, fit the model on a random
# p% sample of the training set several times and record the mean and standard
# deviation of the resulting test loss.
linear_model = LinearRegression()
percentages = np.arange(10, 101)
n_repetitions = 10

# The test set never changes, so convert it to arrays once rather than on
# every one of the fits below.
test_X_np, test_y_np = np.array(test_data), np.array(test_responses)

loss_means, loss_stds = [], []
for percentage in percentages:
    fraction = percentage / 100
    losses = []

    # NOTE(review): the per-fit prints produce a very long output; consider a
    # progress bar or dropping them once the cell is trusted.
    for i in range(n_repetitions):
        print("\n%=", fraction, "i=", i)
        # The repetition index doubles as the seed, so every run is reproducible.
        train_sample = train_data.sample(frac=fraction, random_state=i)
        # Take the responses of exactly the sampled rows by label instead of
        # relying on a second, identically seeded sample staying in sync.
        response_sample = train_responses.loc[train_sample.index]
        linear_model.fit(np.array(train_sample), np.array(response_sample))
        losses.append(linear_model.loss(test_X_np, test_y_np))
        print("loss=", losses[i])

    loss_means.append(np.array(losses).mean())
    loss_stds.append(np.array(losses).std())

loss_means_np = np.array(loss_means)
loss_stds_np = np.array(loss_stds)

# Mean loss as a line, with a grey band of mean -/+ 2 standard deviations.
# The lower bound is drawn first so the upper bound can fill down to it ("tonexty").
fig = go.Figure(data=[go.Scatter(x=percentages, y=loss_means_np, mode="markers+lines",
                                 marker=dict(color="blue", opacity=0.7)),
                      go.Scatter(x=percentages, y=(loss_means_np - (2 * loss_stds_np)),
                                 fill=None, mode="lines", line=dict(color="lightgrey"),
                                 showlegend=False),
                      go.Scatter(x=percentages, y=(loss_means_np + (2 * loss_stds_np)),
                                 fill="tonexty", mode="lines", line=dict(color="lightgrey"),
                                 showlegend=False)],
                layout=dict(title="Test MSE as a function of training-set percentage "
                                  "(mean and 2-std band over 10 fits)",
                            xaxis_title="Percentages",
                            yaxis_title="Average Loss"))
fig.show()


%= 0.1 i= 0
loss= 22847265428.880527

%= 0.1 i= 1
loss= 22345628587.36695

%= 0.1 i= 2
loss= 25349643247.479374

%= 0.1 i= 3
loss= 22845449828.18654

%= 0.1 i= 4
loss= 22332483781.400898

%= 0.1 i= 5
loss= 22538263656.153427

%= 0.1 i= 6
loss= 22970331735.624016

%= 0.1 i= 7
loss= 21213708973.86427

%= 0.1 i= 8
loss= 21022265100.980793

%= 0.1 i= 9
loss= 23147851801.889706

%= 0.11 i= 0
loss= 22651932105.011616

%= 0.11 i= 1
loss= 22230366063.461914

%= 0.11 i= 2
loss= 24398345323.414524

%= 0.11 i= 3
loss= 22254208976.70905

%= 0.11 i= 4
loss= 21993828078.24438

%= 0.11 i= 5
loss= 22134674552.126156

%= 0.11 i= 6
loss= 23084089286.10745

%= 0.11 i= 7
loss= 21401495740.630737

%= 0.11 i= 8
loss= 20849638280.3734

%= 0.11 i= 9
loss= 23317262856.443687

%= 0.12 i= 0
loss= 22742062846.878796

%= 0.12 i= 1
loss= 21985075116.8587

%= 0.12 i= 2
loss= 24715395145.04533

%= 0.12 i= 3
loss= 21987638432.72275

%= 0.12 i= 4
loss= 21906077184.695534

%= 0.12 i= 5
loss= 21793067004.80061

%= 0.12 

loss= 21506303965.51922

%= 0.31 i= 8
loss= 20475590095.208847

%= 0.31 i= 9
loss= 22212697663.396847

%= 0.32 i= 0
loss= 21574032554.559074

%= 0.32 i= 1
loss= 21231068624.40147

%= 0.32 i= 2
loss= 21103376125.484104

%= 0.32 i= 3
loss= 21503344782.81523

%= 0.32 i= 4
loss= 20709205775.090267

%= 0.32 i= 5
loss= 20868539823.10629

%= 0.32 i= 6
loss= 20930481966.30038

%= 0.32 i= 7
loss= 21736576524.471752

%= 0.32 i= 8
loss= 20518556291.455933

%= 0.32 i= 9
loss= 22171298272.4743

%= 0.33 i= 0
loss= 21543327633.24326

%= 0.33 i= 1
loss= 21247818167.723812

%= 0.33 i= 2
loss= 21021942305.637794

%= 0.33 i= 3
loss= 21448670700.42604

%= 0.33 i= 4
loss= 20601245717.40844

%= 0.33 i= 5
loss= 20935996107.75036

%= 0.33 i= 6
loss= 20840032400.782394

%= 0.33 i= 7
loss= 21674824376.15767

%= 0.33 i= 8
loss= 20445160155.70469

%= 0.33 i= 9
loss= 21936239342.749763

%= 0.34 i= 0
loss= 21325209722.083557

%= 0.34 i= 1
loss= 21173355419.466442

%= 0.34 i= 2
loss= 20917243868.362774

%= 0.34 i= 3

loss= 20736157008.426495

%= 0.53 i= 2
loss= 20637450432.752617

%= 0.53 i= 3
loss= 20715473818.287464

%= 0.53 i= 4
loss= 20543270953.004475

%= 0.53 i= 5
loss= 20584959214.304035

%= 0.53 i= 6
loss= 20509765632.73704

%= 0.53 i= 7
loss= 20951471610.35137

%= 0.53 i= 8
loss= 20585236288.235836

%= 0.53 i= 9
loss= 21219526414.33847

%= 0.54 i= 0
loss= 20677741980.439705

%= 0.54 i= 1
loss= 20755946008.03478

%= 0.54 i= 2
loss= 20706697141.681526

%= 0.54 i= 3
loss= 20690697689.235146

%= 0.54 i= 4
loss= 20530134089.73449

%= 0.54 i= 5
loss= 20784022364.05332

%= 0.54 i= 6
loss= 20574954522.915115

%= 0.54 i= 7
loss= 20905270179.231846

%= 0.54 i= 8
loss= 20545999553.597385

%= 0.54 i= 9
loss= 21155965007.616776

%= 0.55 i= 0
loss= 20662164388.166912

%= 0.55 i= 1
loss= 20702965190.846504

%= 0.55 i= 2
loss= 20733693128.145046

%= 0.55 i= 3
loss= 20692411606.395226

%= 0.55 i= 4
loss= 20518413166.161972

%= 0.55 i= 5
loss= 20820907185.988945

%= 0.55 i= 6
loss= 20544747742.24264

%= 0.5

loss= 20476981790.817036

%= 0.74 i= 6
loss= 20559860476.191105

%= 0.74 i= 7
loss= 20743223311.515068

%= 0.74 i= 8
loss= 20560059607.377678

%= 0.74 i= 9
loss= 20910768777.160652

%= 0.75 i= 0
loss= 20959872868.751667

%= 0.75 i= 1
loss= 20988132797.598793

%= 0.75 i= 2
loss= 20913866411.38651

%= 0.75 i= 3
loss= 20670494011.58466

%= 0.75 i= 4
loss= 20601608059.782345

%= 0.75 i= 5
loss= 20673405742.73177

%= 0.75 i= 6
loss= 20545202800.05736

%= 0.75 i= 7
loss= 20742747456.13231

%= 0.75 i= 8
loss= 20573006869.666714

%= 0.75 i= 9
loss= 20889976916.436897

%= 0.76 i= 0
loss= 20975077916.4351

%= 0.76 i= 1
loss= 20990301157.167374

%= 0.76 i= 2
loss= 20959557449.431694

%= 0.76 i= 3
loss= 20637901827.686485

%= 0.76 i= 4
loss= 20723349318.904156

%= 0.76 i= 5
loss= 20656455057.1306

%= 0.76 i= 6
loss= 20656131391.041737

%= 0.76 i= 7
loss= 20816796291.14872

%= 0.76 i= 8
loss= 20578188970.971645

%= 0.76 i= 9
loss= 20842294026.099003

%= 0.77 i= 0
loss= 20957483105.324574

%= 0.77 i

loss= 20653238654.878456

%= 0.96 i= 1
loss= 20755669583.300995

%= 0.96 i= 2
loss= 20749284350.71515

%= 0.96 i= 3
loss= 20704387756.92547

%= 0.96 i= 4
loss= 20582015096.477455

%= 0.96 i= 5
loss= 20580668601.151726

%= 0.96 i= 6
loss= 20716575451.19213

%= 0.96 i= 7
loss= 20728689481.754574

%= 0.96 i= 8
loss= 20837378434.53272

%= 0.96 i= 9
loss= 20804586747.64359

%= 0.97 i= 0
loss= 20746477849.56579

%= 0.97 i= 1
loss= 20754868183.575672

%= 0.97 i= 2
loss= 20746275731.783646

%= 0.97 i= 3
loss= 20708258928.376

%= 0.97 i= 4
loss= 20615033297.413383

%= 0.97 i= 5
loss= 20583585079.757442

%= 0.97 i= 6
loss= 20691142773.756165

%= 0.97 i= 7
loss= 20714776425.01672

%= 0.97 i= 8
loss= 20794161148.80772

%= 0.97 i= 9
loss= 20779462627.831604

%= 0.98 i= 0
loss= 20736640504.313854

%= 0.98 i= 1
loss= 20745677793.57807

%= 0.98 i= 2
loss= 20745052783.82607

%= 0.98 i= 3
loss= 20710906386.2488

%= 0.98 i= 4
loss= 20585972999.714565

%= 0.98 i= 5
loss= 20612913207.163967

%= 0.98 i= 6
l

In [244]:
# Fit on the full training set and look at how the prediction errors are
# distributed over the test set.
# The estimator is typed for numpy arrays, so convert the pandas objects
# explicitly (as the learning-curve cell does).
test_prices = np.array(test_responses)
linear_model.fit(np.array(train_data), np.array(train_responses))
pred_y = linear_model.predict(np.array(test_data))

# Absolute prediction error per test sample.
loss = np.abs(test_prices - pred_y)

# Error relative to the true price, in percent.
# NOTE(review): a true price of exactly 0 would yield inf here - the validation
# step only removes negative or missing prices.
fig_1 = px.histogram(x=(100 * loss / test_prices), nbins=50)
fig_1.update_layout(bargap=0.2, title="error in %",
                    xaxis_title="absolute error as % of true price",
                    yaxis_title="number of test samples")
fig_1.show()

# Error in the units of the price itself.
fig_2 = px.histogram(x=loss, nbins=50)
fig_2.update_layout(bargap=0.2, title="error in abs of diff",
                    xaxis_title="absolute error (price units)",
                    yaxis_title="number of test samples")
fig_2.show()

information about the dataset:
    https://geodacenter.github.io/data-and-lab/KingCounty-HouseSales2015/



What to do with specific columns:
* id column:
    - turn it into index column?
    - delete it?

* date column:
    - convert to year, month, day columns?
    - use it to get the 'age' of the house?
    - delete it afterwards

* numerical values that don't have natural ordering - what to do with them?:
    - zipcode, lat, long

* price column - this is the response vector

Validations:
* remove duplicates

* Values that should be validated as non-negative:
    - bedrooms, bathrooms, floors
    - sqft_living, sqft_lot, sqft_above, sqft_basement, sqft_living15, sqft_lot15
    - price
    
* yr_built and yr_renovated columns:
    - should validate that the years make sense.
    
* condition column:
    - should be integer between 1 and 5
    
* view column:
    - should be integer between 0 and 4

Custom Features:
* age
* ratio of sqft_living and sqft_living15
* ratio of sqft_lot and sqft_lot15
    
Questions:
* which values here are categorical and need to be dealt with as such?
    - date, and I think I dealt with it alright
* new features that can be created from existing features?

Useful things:
* select rows with conditions on strings:
    https://kanoki.org/2019/03/27/pandas-select-rows-by-condition-and-string-operations/
* select rows with conditions on numbers:
    https://www.stackvidhya.com/select-rows-from-dataframe/#:~:text=Pandas%20Dataframe%20stores%20data%20in,column_name'%5D%20%3D%3D%20value%5D.
* convert strings to floats:
https://datatofish.com/convert-string-to-float-dataframe/