In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so the random train/test split further down is
# reproducible across kernel restarts.
np.random.seed(7)

# Comma-separated file whose first row holds the column names; pandas'
# defaults (sep=",", header inferred from row 0, fresh RangeIndex) already
# describe that layout.
df = pd.read_csv("../../datasets/kc_house_price.txt")

df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.7210,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.00,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,5/21/2014,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,...,8,1530,0.0,2009,0.0,98103,47.6993,-122.346,1530,1509
21593,6600060120,2/23/2015,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,...,8,2310,0.0,2014,0.0,98146,47.5107,-122.362,1830,7200
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,7,1020,0.0,2009,0.0,98144,47.5944,-122.299,1020,2007
21595,291310100,1/16/2015,400000.0,3,2.50,1600,2388,2.0,,0.0,...,8,1600,0.0,2004,0.0,98027,47.5345,-122.069,1410,1287


##### Data preprocessing (remove NaN, inspect the distribution of the data, ...)

- Extract needed features 

In [2]:
# Column labels of the raw frame as a plain Python list.
[*df.columns]

['id',
 'date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [3]:
# Name of the target column the model will learn to predict.
LABEL_COLNAME = "price"

# Candidate input columns; the correlation filter in a later cell narrows
# this list down.
potential_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "sqft_above",
    "sqft_basement",
]

# Take an explicit copy: the columns are reassigned below, and writing into
# a frame that was sliced out of `df` is what triggers pandas'
# chained-assignment warning.
calculate_df = df.loc[:, [*potential_features, LABEL_COLNAME]].copy()

# Coerce every candidate feature to a numeric dtype. Values that cannot be
# parsed as numbers become NaN, so they are removed by the dropna() below.
calculate_df[potential_features] = calculate_df[potential_features].apply(
    pd.to_numeric, errors="coerce"
)

# Remove NaN: drop every row with a missing value in any kept column
# (features or label). The original row index is intentionally kept.
calculate_df = calculate_df.dropna()
calculate_df

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,price
1,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,400.0,538000.0
2,2,1.00,770,10000,1.0,0.0,0.0,3,6,770,0.0,180000.0
3,4,3.00,1960,5000,1.0,0.0,0.0,5,7,1050,910.0,604000.0
4,3,2.00,1680,8080,1.0,0.0,0.0,3,8,1680,0.0,510000.0
5,4,4.50,5420,101930,1.0,0.0,0.0,3,11,3890,1530.0,1230000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
21591,3,2.50,1310,1294,2.0,0.0,0.0,3,8,1180,130.0,475000.0
21592,3,2.50,1530,1131,3.0,0.0,0.0,3,8,1530,0.0,360000.0
21593,4,2.50,2310,5813,2.0,0.0,0.0,3,8,2310,0.0,400000.0
21594,2,0.75,1020,1350,2.0,0.0,0.0,3,7,1020,0.0,402101.0


- Keep only the features that are strongly correlated with the label

In [4]:
# Inclusive bounds on |Pearson r| between a feature and the label.
# The lower bound drops weakly related features; the upper bound drops
# near-perfectly correlated ones (presumably to avoid features that merely
# duplicate the label -- confirm the intent of 0.9).
MIN_ABS_CORRELATION = 0.6
MAX_ABS_CORRELATION = 0.9

label_series = calculate_df[LABEL_COLNAME]

related_features = []
for col_name in potential_features:
    correlate_coef = np.corrcoef(calculate_df[col_name], label_series)[0, 1]

    # Compare the magnitude, not the signed value: a strongly *negative*
    # correlation is just as informative as a positive one. The chained
    # comparison is False for NaN (e.g. a constant column), so such
    # features are dropped instead of being silently kept.
    is_related = MIN_ABS_CORRELATION <= abs(correlate_coef) <= MAX_ABS_CORRELATION
    if is_related:
        related_features.append(col_name)

    # Report the signed coefficient so the direction stays visible.
    print("OK" if is_related else "DROP", col_name, correlate_coef)

# Build the reduced frame once, preserving the order of `potential_features`.
reserved_df = calculate_df[related_features]
features_df = reserved_df
features_df

DROP bedrooms 0.3091625662847244
DROP bathrooms 0.5262983305796305
OK sqft_living 0.7049486569307863
DROP sqft_lot 0.08759438562674839
DROP floors 0.25895901716830666
DROP waterfront 0.27266104128961477
DROP view 0.3972433038237466
DROP condition 0.03685744457186101
OK grade 0.6676753459297716
OK sqft_above 0.6096396002330994
DROP sqft_basement 0.32289475609247925


Unnamed: 0,sqft_living,grade,sqft_above
1,2570,7,2170
2,770,6,770
3,1960,7,1050
4,1680,8,1680
5,5420,11,3890
...,...,...,...
21591,1310,8,1180
21592,1530,8,1530
21593,2310,8,2310
21594,1020,7,1020


- Extract X_train, y_train and X_test, y_test

In [5]:
# Plain NumPy views of the selected features and of the label.
y = label_series.to_numpy()
X = reserved_df.to_numpy()

print(X.shape, y.shape)

# One uniform draw in [0, 1) per row: rows whose draw is at most 0.2 form
# the test split, every other row goes to the training split.
num_samples = len(X)
is_test = np.random.rand(num_samples) <= 0.2
is_train = ~is_test

X_train, X_test = X[is_train], X[is_test]
y_train, y_test = y[is_train], y[is_test]

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(18749, 3) (18749,)


((14908, 3), (14908,), (3841, 3), (3841,))

- Calculate solution with gradient descent


In [43]:
from MyLinearRegression import LinearRegression


def check(X, y, model=None):
    """Print a model's predictions for ``X`` next to the real targets ``y``.

    Parameters
    ----------
    X
        Feature matrix, passed unchanged to ``model.predict``.
    y
        Real target values, printed as-is for visual comparison.
    model
        Any object exposing ``predict(X)``. When omitted, the notebook-level
        ``my_model`` is looked up at call time, so existing two-argument
        calls keep working.

    Returns
    -------
    None
        The comparison is written to stdout only.
    """
    if model is None:
        # Backward-compatible fallback to the notebook global.
        model = my_model
    y_pred = model.predict(X)
    print("Predicted: ", y_pred)
    print("Real: ", y)


# Train the custom gradient-descent regressor on the training split.
# NOTE(review): the step size is very small, presumably because the features
# are fed in unscaled -- confirm against MyLinearRegression.
my_model = LinearRegression(num_iterations=1000, learn_rate=1e-9)
my_model.fit(X_train, y_train)

# Eyeball predictions against the real prices: seen data first, then
# held-out data.
check(X_train, y_train)
check(X_test, y_test)

# Learned parameters, shown as the cell's output.
(my_model.weights, my_model.bias)

Predicted:  [216220.93298324 443701.27065641 471750.60852851 ... 429630.59507938
 648654.66501486 286421.7523754 ]
Real:  [180000. 604000. 510000. ... 360000. 400000. 402101.]
Predicted:  [674772.85134965 290566.98492785 449285.8043787  ... 401550.58611329
 713238.68563686 286421.7523754 ]
Real:  [538000. 468000. 485000. ... 505000. 347500. 325000.]


(array([163.57748966,   0.79697694, 117.2226    ]), array([0.08208274]))