In [39]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler

In [40]:
data = pd.read_csv("real_estate_price_size_year.csv")
data.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [41]:
data.describe()


Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


In [42]:
x = data[['size',"year"]]
y = data["price"]
x.shape

(100, 2)

In [43]:
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

In [44]:
x_scaled

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

In [45]:
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [46]:
def adj_r2(x,y):
    r2 = reg.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [47]:
f_reg = f_regression(x_scaled,y)
f_statistic = f_reg[0].round(3)
p_val = f_reg[1].round(3)

In [53]:
summary = pd.DataFrame(data=[["bias"],["size"],["year"]],columns=["features"])
summary['Weights'] = reg.intercept_, reg.coef_[0], reg.coef_[1]
summary["p_val"] = None,p_val[0],p_val[1]
summary["f_statistic"] = None,f_statistic[0],f_statistic[1]
summary


Unnamed: 0,features,Weights,p_val,f_statistic
0,bias,292289.47016,,
1,size,67501.576142,0.0,285.921
2,year,13724.397082,0.357,0.855


In [54]:
new_data = [[500,2001],[750,2009]]
new_data_scaled = scaler.transform(new_data)
reg.predict(new_data_scaled)

array([178070.84854337, 258330.34465995])

In [56]:
adj_r2(x_scaled,y)

0.77187171612825

In [57]:
reg.score(x_scaled,y)


0.7764803683276793