## Uczenie liniowej hipotezy dla problemu przewidywania cen nieruchomości

### Wczytanie i prezentacja danych

In [408]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn

In [409]:
housing = pd.read_csv('housing_data.csv')

In [410]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### Przekształcanie stringów na inty

In [411]:
housing_prep = housing
housing_prep['mainroad'] = housing_prep['mainroad'].apply(lambda x: 1 if x == 'yes' else 0)
housing_prep['guestroom'] = housing_prep['guestroom'].apply(lambda x: 1 if x == 'yes' else 0)
housing_prep['basement'] = housing_prep['basement'].apply(lambda x: 1 if x == 'yes' else 0)
housing_prep['hotwaterheating'] = housing_prep['hotwaterheating'].apply(lambda x: 1 if x == 'yes' else 0)
housing_prep['airconditioning'] = housing_prep['airconditioning'].apply(lambda x: 1 if x == 'yes' else 0)
housing_prep['prefarea'] = housing_prep['prefarea'].apply(lambda x: 1 if x == 'yes' else 0)
housing_prep['furnishingstatus'] = housing_prep['furnishingstatus'].map({'furnished': 2, 'semi-furnished': 1, 'unfurnished': 0})

In [412]:
housing_prep.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.858716,0.177982,0.350459,0.045872,0.315596,0.693578,0.234862,0.930275
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.348635,0.382849,0.477552,0.209399,0.46518,0.861586,0.424302,0.761373
min,1750000.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0
max,13300000.0,16200.0,6.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0


In [413]:
housing_data = housing_prep.iloc[:, 1:]
housing_target = housing_prep.iloc[:, 0]

In [414]:
housing_data.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,2
1,8960,4,4,4,1,0,0,0,1,3,0,2
2,9960,3,2,2,1,0,1,0,0,2,1,1
3,7500,4,2,2,1,0,1,0,1,3,1,2
4,7420,4,1,2,1,1,1,0,1,2,0,2


In [415]:
housing_target.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

### Przekształcanie na tabele danych

In [416]:
data_np = np.array(housing_data, dtype=np.float64)
target_np = np.array(housing_target, dtype=np.float64)

print(type(data_np))
print(type(target_np))

print(data_np.shape)
print(target_np.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(545, 12)
(545,)


In [417]:
print("First house in database")
print(data_np[0,:])

First house in database
[7.42e+03 4.00e+00 2.00e+00 3.00e+00 1.00e+00 0.00e+00 0.00e+00 0.00e+00
 1.00e+00 2.00e+00 1.00e+00 2.00e+00]


In [418]:
print(target_np[0])

13300000.0


In [419]:
print("---Mean---")
print(data_np.mean(axis=0))
print("---std---")
print(data_np.std(axis=0))

---Mean---
[5.15054128e+03 2.96513761e+00 1.28623853e+00 1.80550459e+00
 8.58715596e-01 1.77981651e-01 3.50458716e-01 4.58715596e-02
 3.15596330e-01 6.93577982e-01 2.34862385e-01 9.30275229e-01]
---std---
[2.16814915e+03 7.37386427e-01 5.02008423e-01 8.66696233e-01
 3.48314687e-01 3.82497298e-01 4.77113618e-01 2.09206500e-01
 4.64752931e-01 8.60794942e-01 4.23912780e-01 7.60673843e-01]


### Skalowanie danych

In [420]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_np)

In [421]:
print("First house in database")
print(scaled_data[0,:])
print('---Mean---')
print(scaled_data.mean(axis=0))
print('--std---')
print(scaled_data.std(axis=0))

First house in database
[ 1.04672629  1.40341936  1.42181174  1.37821692  0.40562287 -0.46531479
 -0.73453933 -0.2192645   1.4726183   1.51769249  1.80494113  1.40628573]
---Mean---
[-1.56449777e-16 -1.82524739e-16 -2.60749628e-17 -1.30374814e-16
 -1.82524739e-16  5.21499256e-17 -7.82248883e-17 -6.51874070e-18
 -5.21499256e-17  0.00000000e+00  0.00000000e+00  0.00000000e+00]
--std---
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### Podział danych na zbiory testowe i treningowe

In [422]:
from sklearn.model_selection import train_test_split

housing_train_data, housing_test_data, \
housing_train_target, housing_test_target = \
train_test_split(scaled_data, target_np, test_size=0.3)

In [423]:
print("Training dataset:")
print("clients_train_data:", housing_train_data.shape)
print("clients_train_target:", housing_train_target.shape)

Training dataset:
clients_train_data: (381, 12)
clients_train_target: (381,)


In [424]:
print("Testing dataset:")
print("clients_test_data:", housing_test_data.shape)
print("clients_test_target:", housing_test_target.shape)

Testing dataset:
clients_test_data: (164, 12)
clients_test_target: (164,)


### Trenowanie modelu

In [425]:
from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()
linear_regression.fit(housing_train_data, housing_train_target)

### Ewaluacja modelu

In [426]:
from sklearn.metrics import mean_squared_error
print("Mean squared error of a learned model: %.2f" % 
      mean_squared_error(housing_test_target, linear_regression.predict(housing_test_data)))

Mean squared error of a learned model: 1023924901162.55


In [427]:
accuracy = linear_regression.score(housing_test_data, housing_test_target)
print(f"Dokladnosc modelu: {accuracy}")

Dokladnosc modelu: 0.6383131449598749


In [428]:
from sklearn.metrics import r2_score
print('Variance score: %.2f' % r2_score(housing_test_target, linear_regression.predict(housing_test_data)))

Variance score: 0.64


### Cross validation NIE DZIAŁA ?

In [429]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(linear_regression, scaled_data, target_np, cv=10, scoring='neg_mean_squared_error')
print(-scores)

[8.17361596e+12 1.26015985e+12 6.60211846e+11 9.47517719e+11
 9.41923662e+11 7.04726760e+11 8.11540690e+11 8.27080546e+11
 6.27006181e+11 1.16211917e+12]


In [430]:
scaled_data

array([[ 1.04672629,  1.40341936,  1.42181174, ...,  1.51769249,
         1.80494113,  1.40628573],
       [ 1.75700953,  1.40341936,  5.40580863, ...,  2.67940935,
        -0.55403469,  1.40628573],
       [ 2.21823241,  0.04727831,  1.42181174, ...,  1.51769249,
         1.80494113,  0.09166185],
       ...,
       [-0.70592066, -1.30886273, -0.57018671, ..., -0.80574124,
        -0.55403469, -1.22296203],
       [-1.03338891,  0.04727831, -0.57018671, ..., -0.80574124,
        -0.55403469,  1.40628573],
       [-0.5998394 ,  0.04727831, -0.57018671, ..., -0.80574124,
        -0.55403469, -1.22296203]])

### Predykcja

In [431]:
id=1
linear_regression_prediction = linear_regression.predict(housing_test_data[id,:].reshape(1,-1))

In [432]:
print("Model predicted for house {0} value {1}".format(id, linear_regression_prediction))

Model predicted for house 1 value [4791954.7213488]


In [433]:
print("Real value for house \"{0}\" is {1}".format(id, housing_test_target[id]))

Real value for house "1" is 3080000.0


In [434]:
linear_regression_predictions = linear_regression.predict(housing_test_data)
print(linear_regression_predictions)

[2623397.50975404 4791954.7213488  3015004.68494276 3915067.1241552
 4554309.60748347 3164373.79000359 6797941.57005304 6770656.31067152
 5109533.57332519 3697541.01434343 3129213.25185847 3485634.21931145
 5333886.90135166 5339025.72241733 5517823.15371273 4927091.37806069
 2645832.72899776 6181540.72517478 6975183.57912054 3469554.29268563
 3394570.37476201 5219234.50577252 5561988.53269344 7978266.76643361
 6709229.79443238 6612794.23272994 7580763.28385205 6798362.32618884
 6485619.18833927 3845855.76265854 2124430.52037196 6593618.87509682
 2978198.87655777 2640360.47333334 2852808.98262418 3812504.52402251
 3756508.90002894 5008593.54933001 2749961.86519421 5717448.57715755
 6274266.32903899 3336815.51466246 3676586.09263277 4662147.90766039
 3862897.66035256 6208131.26740378 4186146.41260031 4844563.06812179
 6209335.7731335  2303850.37125652 7677478.7600606  3062153.94194664
 5972910.52076783 6268718.45134912 6022906.43781087 4977243.82490229
 3577626.92334986 5568711.18804292 