# house price pridiction

In [267]:
import os
import numpy as np
import pandas as pd
from scipy import stats


def OLS(X, Y):
    X_T = np.transpose(X)
    Betas = np.dot(np.dot(np.matrix(np.dot(X_T, X)).I, X_T), Y)
    Betas = np.array(Betas)
    return Betas


def Predict(X, Betas):
    Y_predict = np.dot(X, Betas)
    return Y_predict


def T_Beta_i(Skk, S_sq, T_up):
    up = T_up * (Skk*S_sq)**0.5
    low = -up
    return low, up


def T_test(Betas, SKK, S_sq, T_up):
    Betas_interval = []
    beta_num = Betas.shape[0]
    for j in range(beta_num):
        Skk = SKK[j, j]
        Beta_i = Betas[j, 0]
        t_value = Beta_i/(Skk*S_sq)**0.5
        std = (Skk*S_sq)**0.5
        low, up = T_Beta_i(Skk, S_sq, T_up)
        if Beta_i>low and Beta_i<up:
            IN_interval = 1
        else:
            IN_interval = 0
        PR = 2*(1- stats.t.cdf(abs(t_value), 250-12-1))
        ls_tmp = [Beta_i, t_value, PR, low, up, IN_interval]
        Betas_interval.append(ls_tmp)
    return Betas_interval
    

def R_suqare(Y_predict, Y):
    Y_mean = np.mean(Y)
    RSS = np.sum((Y_predict-Y_mean)**2)
    TSS = np.sum((Y-Y_mean)**2)
    return RSS/TSS

# 1 read  and analyse data

In [142]:
house_price = pd.read_csv('./data_problemset1/Housing_Prices.csv')
house_price["City Zone"] = pd.factorize(house_price["City Zone"])[0].astype(np.uint16)
col_name = house_price.columns.tolist()
col_name.insert(col_name.index("City Zone"),"Ones")
house_price = house_price.reindex(columns=col_name)
house_price["Ones"] = pd.Series(np.ones(250).tolist())
house_train = house_price.to_numpy()

In [246]:
house_price.to_csv('./data_problemset1/Housing_Prices1.csv')

In [159]:
cov = np.cov(house_train.transpose())
cov1 = cov[12, :]
df = pd.DataFrame(cov1, index=house_price.columns, columns=["row1"])
df['s'] = df['row1'].abs()
df1 = df.sort_values(axis=0, by='s', ascending=False)
df1

Unnamed: 0,row1,s
Price [euros],20386700000.0,20386700000.0
m^2,4120062.0,4120062.0
City Zone,-93297.34,93297.34
Rooms,65517.42,65517.42
Bathrooms,56975.82,56975.82
Parking,16133.28,16133.28
Terrasse,12466.27,12466.27
Kitchen,10363.18,10363.18
"""Atico""",4614.008,4614.008
Yard,3461.03,3461.03


## 2 get OLS model

In [176]:
X = house_train[:, 0:12]
Y = house_train[:, 12]
Y = np.reshape(Y, (Y.shape[0], 1))
Betas = OLS(X, Y)
Y_pre = Predict(X, Betas)

## 3 model evaluation

In [271]:
R2 = R_suqare(Y_pre, Y)
print(R2)

0.8296404805827557


In [186]:
N = 250
K = 11
P = 0.05
U = Y - Y_pre
S_sq = np.dot(U.transpose(), U)[0][0] / (N-K-1)
SKK = np.matrix(np.dot(X.transpose(), X)).I
f_95_up = T.ppf(1-P/2, N-K-1)
f_95_low = -f_95_up

In [268]:
sig_intervals = T_test(Betas, SKK, S_sq, f_95_up)
numpy_data = np.array(sig_intervals)
df2 = pd.DataFrame(data=numpy_data, index=house_price.columns[0:12], 
        columns=["beta", "t-value", "Pr(>|t|)", "low_bound", 'up_boud', 'in'])
df2

Unnamed: 0,beta,t-value,Pr(>|t|),low_bound,up_boud,in
Ones,-20324.92772,-1.364074,0.1738384,-29353.057913,29353.057913,1.0
City Zone,-8545.351212,-5.794101,2.17519e-08,-2905.400725,2905.400725,0.0
m^2,3303.535538,17.151911,0.0,-379.427342,379.427342,0.0
Rooms,-9401.335439,-1.728309,0.08523536,-10715.943163,10715.943163,1.0
Bathrooms,30244.383089,3.218326,0.001469565,-18513.000784,18513.000784,0.0
Elevator,4753.826983,0.591136,0.5549926,-15842.293862,15842.293862,1.0
"""Atico""",17709.052227,1.237177,0.2172459,-28198.484231,28198.484231,1.0
Terrasse,2603.8936,0.250269,0.8025958,-20496.403911,20496.403911,1.0
Parking,74776.518143,3.975703,9.321522e-05,-37052.151817,37052.151817,0.0
Kitchen,-3600.796539,-0.330011,0.7416832,-21494.748003,21494.748003,1.0


### 3.1 adjustment

In [289]:
np.where(numpy_data[:, 2]<=0.30)

(array([ 0,  1,  2,  3,  4,  6,  8, 10, 11]),)

In [292]:
## train
selected_idx = [0,1,2,3,4,5,6,7,8,9,10,11]
X1 = X[0:200, selected_idx]
Y = house_train[0:200, 12]
Y = np.reshape(Y, (Y.shape[0], 1))
Betas1 = OLS(X1, Y)
Y_pre1 = Predict(X1, Betas1)
R21 = R_suqare(Y_pre1, Y)
print(R21)

X_test = X[200:250, selected_idx]
Y_test = house_train[200:250, 12]
## test
Y_test_pre = Predict(X_test, Betas1)
R2 = R_suqare(Y_test_pre, Y_test)
print(R2)
U = Y_test - Y_test_pre
E = np.dot(U.transpose(), U)[0][0]
print(E**0.5)

0.8401625524295019
0.7471886727133911
3514696.8558887006


## 4 prediction

In [None]:
house_price = pd.read_csv('./data_problemset1/Housing_Prices.csv')
house_price["City Zone"] = pd.factorize(house_price["City Zone"])[0].astype(np.uint16)
col_name = house_price.columns.tolist()
col_name.insert(col_name.index("City Zone"),"Ones")
house_price = house_price.reindex(columns=col_name)
house_price["Ones"] = pd.Series(np.ones(250).tolist())
house_train = house_price.to_numpy()