In [199]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

In [200]:
# Load the pre-split train / test CSV files and separate the feature matrix
# from the regression target for each split.
df_train = pd.read_csv('real_estate_train.csv')
df_test = pd.read_csv('real_estate_test.csv')

# Column order here fixes the positional column indices of X_train / X_test.
feature_columns = [
    "X1 transaction date",
    "X2 house age",
    "X3 distance to the nearest MRT station",
    "X4 number of convenience stores",
    "X5 latitude",
    "X6 longitude",
]
label_column = "Y house price of unit area"

# train
features_train = df_train.loc[:, feature_columns]
label_train = df_train.loc[:, label_column]
# Take explicit float copies: `.values` may hand back a view of the DataFrame
# (read-only under pandas copy-on-write), and downstream cells modify these
# arrays. A copy keeps them writable and guarantees df_train is never altered
# behind the reader's back.
X_train = features_train.to_numpy(dtype=float, copy=True)
y_train = label_train.to_numpy(dtype=float, copy=True)

# test
features_test = df_test.loc[:, feature_columns]
label_test = df_test.loc[:, label_column]
X_test = features_test.to_numpy(dtype=float, copy=True)
y_test = label_test.to_numpy(dtype=float, copy=True)

df_train.head(5)

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9.0,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5.0,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5.0,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5.0,24.97937,121.54245,43.1


In [201]:
# Fill missing values using statistics learned from the training split only,
# then apply those same statistics to the test features.
# Indices follow the order of `feature_columns`.
continuous_cols = [0, 1, 2, 4, 5]  # date, house age, MRT distance, latitude, longitude
store_count_cols = [3]             # number of convenience stores (discrete count)

# `missing_values` is passed by keyword: recent scikit-learn releases make the
# constructor arguments keyword-only, so the positional form raises TypeError.
continous_impute = SimpleImputer(missing_values=np.nan, strategy="mean")
species_impute = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# Rebind to private float copies so the writes below never touch (or fail on)
# arrays that are views of the source DataFrames.
X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

# Mean imputation is computed per column, so fitting all continuous columns in
# one call gives the same result as fitting the 0:3 and 4:6 slices separately.
X_train[:, continuous_cols] = continous_impute.fit_transform(X_train[:, continuous_cols])
X_train[:, store_count_cols] = species_impute.fit_transform(X_train[:, store_count_cols])

# Reuse the train-fitted imputers (transform only, no re-fit) so the test
# features contain no NaN at prediction time and no test statistics leak in.
X_test[:, continuous_cols] = continous_impute.transform(X_test[:, continuous_cols])
X_test[:, store_count_cols] = species_impute.transform(X_test[:, store_count_cols])

# A dedicated imputer for the target keeps `continous_impute` fitted on the
# features instead of overwriting its statistics.
# NOTE(review): missing training labels are replaced by the label mean, as in
# the original cell; dropping those rows may be preferable - confirm intent.
label_impute = SimpleImputer(missing_values=np.nan, strategy="mean")
y_train = label_impute.fit_transform(
    np.asarray(y_train, dtype=float).reshape(-1, 1)
).ravel()

# Sanity check: every feature column should now report zero missing values.
new_df = pd.DataFrame(X_train, columns=feature_columns)
new_df.isnull().sum()

X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
dtype: int64

In [202]:
# Closed-form ordinary least squares: find w minimising ||X_bar @ w - y_train||^2.
# X_bar is X_train with a leading column of ones, so w[0] is the intercept and
# w[1:] are the per-feature weights (in `feature_columns` order).
ones = np.ones((X_train.shape[0], 1))
X_bar = np.concatenate((ones, np.asarray(X_train, dtype=float)), axis=1)

# Solve the least-squares problem on X_bar directly instead of pseudo-inverting
# X_bar.T @ X_bar: forming the Gram matrix squares the condition number, which
# costs precision when columns have very different scales (e.g. ~2013 for the
# date next to ~25 for latitude and a constant column of ones).
w, _residuals, _rank, _singular_values = np.linalg.lstsq(
    X_bar, np.asarray(y_train, dtype=float), rcond=None
)
w

array([-1.13558035e+04,  5.70293613e+00, -2.63314570e-01, -5.41776404e-03,
        1.32078662e+00, -4.47750350e-01, -5.82029299e-01])

In [203]:
from sklearn.linear_model import LinearRegression

In [204]:
# Fit an ordinary least-squares model on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
linear = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
linear

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [205]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [206]:
# Evaluate the linear model on both splits.
predict = linear.predict(X_test)
train = linear.predict(X_train)

# np.sqrt(MSE) is the *root* mean squared error, expressed in the same units as
# the target, so the printed labels say RMSE rather than MSE.
print("Root Mean Squared Test Error: {}".format(np.sqrt(mean_squared_error(y_test, predict))))
print("Root Mean Squared Train Error: {}".format(np.sqrt(mean_squared_error(y_train, train))))

Mean Squared Test Error: 7.707638687808493
Mean Squared Train Error: 9.401471996659081


In [207]:
from sklearn.linear_model import Ridge

In [208]:
# Fit an L2-regularised linear model (penalty strength alpha=1) on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
ridge = Ridge(alpha=1).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
ridge

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [209]:
# Evaluate the ridge model on both splits.
predict = ridge.predict(X_test)
train = ridge.predict(X_train)

# np.sqrt(MSE) is the *root* mean squared error, expressed in the same units as
# the target, so the printed labels say RMSE rather than MSE.
print("Root Mean Squared Test Error: {}".format(np.sqrt(mean_squared_error(y_test, predict))))
print("Root Mean Squared Train Error: {}".format(np.sqrt(mean_squared_error(y_train, train))))

Mean Squared Test Error: 7.706982825470644
Mean Squared Train Error: 9.401665123467717
