In [13]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
import scipy.stats as stats
import seaborn as sns
from sklearn import linear_model

In [None]:
# loading data
path = 'realtor-data.csv'
df = pd.read_csv(path)
# taking all necessary variables
cdf = df[['bed', 'bath', 'acre_lot', 'house_size', 'price']].copy()

In [None]:
# inspecting data

fig, axs = plt.subplots(nrows=2, ncols=2)

fig.subplots_adjust(right=1)
fig.subplots_adjust(hspace=0.5, wspace=0.25)
# Plot the data on each subplot
axs[0][0].scatter(cdf.bed, cdf.price)
axs[0][0].set_title('Jumlah Kasur dan Harga Rumah')


axs[0][1].scatter(cdf.bath, cdf.price)
axs[0][1].set_title('Jumlah Kamar Mandi dan Harga Rumah')


axs[1][0].scatter(cdf.acre_lot, cdf.price)
axs[1][0].set_title('Luas Tanah dan Harga Rumah')

axs[1][1].scatter(cdf.house_size, cdf.price)
axs[1][1].set_title('Luas Rumah dan Harga Rumah')
plt.show()

# show its coorelation coefficient
tempCorr = np.asanyarray(cdf.corr(), dtype='float64')

plt.imshow(tempCorr, cmap='coolwarm', interpolation='nearest')
for i in range(len(tempCorr)):
    for j in range(len(tempCorr[0])):
        plt.annotate("{:.4f}".format(tempCorr[i][j]), xy=(j,i), ha='center', va='center')
plt.xticks([0,1,2,3,4], cdf.columns.values)
plt.yticks([0,1,2,3,4], cdf.columns.values)
plt.colorbar()
plt.show()


plt.boxplot(cdf['price'])
plt.show()

In [None]:
# dealing with null values
cdf.bed.fillna(cdf.bed.mode()[0], inplace=True)
cdf.bath.fillna(cdf.bath.mode()[0], inplace=True)
cdf.acre_lot.fillna(cdf.acre_lot.mode()[0], inplace=True)
cdf.house_size.fillna(cdf.house_size.mode()[0], inplace=True)


In [None]:
def outlier_treatment(datacolumn):
    """function to Remove outlier"""
    sorted(datacolumn)
    Q1,Q3 = np.percentile(datacolumn,[40,60])
    IQR = Q3-Q1
    lower_range = Q1 - (1.5*IQR)
    upper_range = Q3 + (1.5*IQR)
    return lower_range,upper_range

l,u = outlier_treatment(cdf['price'])
cdf.drop(cdf[(cdf['price'] > u) | (df['price'] < l)].index, inplace=True)
print(cdf.shape)

In [None]:
sns.displot(cdf.price, kind='kde')
plt.show()

In [15]:
cdf.drop(columns = ['acre_lot'], inplace=True)

In [18]:
mask = np.random.rand(len(cdf)) < 0.8
train = cdf[mask]
test = cdf[~mask]

In [20]:
regr = linear_model.LinearRegression()
x = np.asanyarray(train[['bed', 'bath', 'house_size']])
y = np.asanyarray(train[['price']])
regr.fit(x, y)
print(regr.coef_)

[[-7276.84881634 27159.2693475     30.19848013]]


In [22]:
predicted_values = regr.predict(test[['bed', 'bath', 'house_size']])
xT = np.asanyarray(test[['bed', 'bath', 'house_size']])
yT = np.asanyarray(test[['price']])
print("RSE: %.2f"
      % np.mean((predicted_values - yT) ** 2))
print('Variance score: %.2f' % regr.score(xT, yT))

RSE: 15290186380.09
Variance score: 0.13


