In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score
output_path = "/run/media/yunchen/lacie"


In [29]:
AKL_df = pd.read_csv(f"{output_path}/outputs/property_data_with_street.csv",encoding='latin1')
AKL_df = AKL_df.drop(['Unnamed: 0'],axis=1)
print(AKL_df.head())
print(AKL_df.shape)

   CL_QPID  CL_Suburb  CL_Sale_Tenure  CL_Sale_Date  \
0    75738        256               0            46   
1    75814        223               0            38   
2    75835        223               0            17   
3    75842        223               0             8   
4    76103        181               0            40   

   CL_Land_Valuation_Capital_Value  CL_Building_Floor_Area  \
0                        -0.776857               -0.084574   
1                        -0.865695               -1.225402   
2                        -0.998952                2.538018   
3                        -1.206240               -1.225402   
4                        -0.836082                0.571074   

   CL_Building_Site_Cover  CL_Land_Area  CL_Bldg_Const  CL_Bldg_Cond  ...  \
0               -0.283212     14.915984              2             2  ...   
1               -1.177352      0.444313              3             0  ...   
2                1.147412      2.320124             13           

In [30]:
akl_embedding_df = pd.read_csv(f"./outputs/akl_embedding_1667625923.csv")
print(akl_embedding_df.shape)

embedding_size = akl_embedding_df.shape[1]

akl_embedding_df.columns = ['street_embedding_'+str(i) for i in range(embedding_size)]
print(embedding_size)

akl_street_nodes_df = pd.read_csv(f"{output_path}/outputs/akl_street_nodes.csv")

akl_street_nodes_df = akl_street_nodes_df.rename(columns={"source": "street_sources", "target": "street_targets"})
print(akl_street_nodes_df.shape)
print(akl_street_nodes_df.head(1))

(458252, 8)
8
(458252, 14)
   street_id  street_sources  street_targets      street_name  street_length  \
0  984794487       279051892      2852049636  South Head Road         32.641   

   restaurant    x    y  Average_POI_Distance  amenity  school  shop  \
0         0.0  0.0  0.0                32.641      0.0     0.0   0.0   

   healthcare  clothes  
0         0.0      0.0  


In [31]:
def find_embedding_for_property(property_df,street_df,emb_df):
    street_with_embedding = street_df.merge(emb_df, left_index=True, right_index=True)
    output_df = property_df.merge(street_with_embedding,on=["street_sources","street_targets"])
    return output_df

In [32]:
x_df = find_embedding_for_property(AKL_df,akl_street_nodes_df,akl_embedding_df)
print(x_df["street_name"].unique())

['Silver Hill Road' 'Okahukura Road' 'Wharf Road' ... 'Studholme Street'
 'Kern Road' 'Sollum Road']


In [33]:
AKL_df = find_embedding_for_property(AKL_df,akl_street_nodes_df,akl_embedding_df)
print(AKL_df["street_name"])
#print(AKL_df.shape)
#print(AKL_df.head(1))

0        Silver Hill Road
1          Okahukura Road
2              Wharf Road
3            Becroft Road
4           B M Gubb Road
               ...       
40732       Kawiti Avenue
40733     Schoolside Road
40734      Cheviot Street
40735       Kestrel Place
40736         Sollum Road
Name: street_name, Length: 40737, dtype: object


In [34]:
# print(AKL_df.head())
# print(AKL_df.shape)
# print(AKL_df.columns)
# AKL_df.to_csv("./outputs/akl_sale_with_distance_embeddings.csv",index=False)

In [35]:
property_columns = ['CL_Suburb','CL_Sale_Tenure','CL_Sale_Date','CL_Land_Valuation_Capital_Value',
                'CL_Building_Floor_Area','CL_Building_Site_Cover',
                'CL_Land_Area','CL_Bldg_Const','CL_Bldg_Cond','CL_Roof_Const','CL_Roof_Cond',
                'CL_Category','CL_LUD_Age','CL_LUD_Land_Use_Description',
                'CL_MAS_No_Main_Roof_Garages','CL_Bedrooms','CL_Bathrooms']+['street_embedding_'+str(i) for i in range(embedding_size)]
X_columns = AKL_df[property_columns].values
#print(X_columns)
Y_column = AKL_df['Log_Sale_Price_Net'].values

X_train, X_test, Y_train, Y_test = train_test_split(X_columns, Y_column, test_size=0.2, random_state=1,shuffle=True)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=1,shuffle=True)

# X_train, X_val, Y_train, Y_val = train_test_split(X_columns, Y_column, test_size=0.2, random_state=1,shuffle=True)

print(X_train.shape)

(24441, 25)


In [36]:
hedonic_regression = LinearRegression()
hedonic_regression.fit(X_train, Y_train)

hedonic_regression_training_result = hedonic_regression.predict(X_train)
hedonic_regression_validation_result = hedonic_regression.predict(X_val)

print("\nTraining RMSE:", round(mean_squared_error(Y_train, hedonic_regression_training_result),4))
print("Validation RMSE:", round(mean_squared_error(Y_val, hedonic_regression_validation_result),4))

# (e^0.09) 还原log
# print(Y_train,hedonic_regression_training_result)
# print(np.exp(Y_train), np.exp(hedonic_regression_training_result))

# print("\nTraining RMSE:", round(mean_squared_error(np.exp(Y_train), np.exp(hedonic_regression_training_result)),4))
# print("Validation RMSE:", round(mean_squared_error(np.exp(Y_test), np.exp(hedonic_regression_Test_result)),4))

print("\nTraining r2:", round(r2_score(Y_train, hedonic_regression_training_result),4))
print("Validation r2:", round(r2_score(Y_val, hedonic_regression_validation_result),4))

print('----------------------------------------------------------------------')
print('Explained Variance Score of OLS model is {}'.format(explained_variance_score(Y_val,hedonic_regression_validation_result)))


Training RMSE: 0.0635
Validation RMSE: 0.0601

Training r2: 0.784
Validation r2: 0.796
----------------------------------------------------------------------
Explained Variance Score of OLS model is 0.7959776800840996
