# authors: Rambod Azimi & Mahdi Moghassemi
# Date: May 2023

In [37]:
# importing the necessary libraries to the project
import numpy as np
import math, copy
import pandas as pd
from sklearn.linear_model import SGDRegressor # linear regression model library in scikit-learn
from sklearn.preprocessing import StandardScaler # Z-score scaling library in scikit-learn

In [38]:
# This method will replace non-numeric values in the dataset to numeric and computable values
def replace_data(initial_data):
    #  yes --> 1
    # no --> 0

    initial_data['mainroad'] = initial_data['mainroad'].replace(['yes'], 1)
    initial_data['mainroad'] = initial_data['mainroad'].replace(['no'], 0)
    
    initial_data['guestroom'] = initial_data['guestroom'].replace(['yes'], 1)
    initial_data['guestroom'] = initial_data['guestroom'].replace(['no'], 0)

    initial_data['basement'] = initial_data['basement'].replace(['yes'], 1)
    initial_data['basement'] = initial_data['basement'].replace(['no'], 0)

    initial_data['hotwaterheating'] = initial_data['hotwaterheating'].replace(['yes'], 1)
    initial_data['hotwaterheating'] = initial_data['hotwaterheating'].replace(['no'], 0)

    initial_data['airconditioning'] = initial_data['airconditioning'].replace(['yes'], 1)
    initial_data['airconditioning'] = initial_data['airconditioning'].replace(['no'], 0)

    initial_data['prefarea'] = initial_data['prefarea'].replace(['yes'], 1)
    initial_data['prefarea'] = initial_data['prefarea'].replace(['no'], 0)

    initial_data['furnishingstatus'] = initial_data['furnishingstatus'].replace(['furnished'], 2) 
    initial_data['furnishingstatus'] = initial_data['furnishingstatus'].replace(['semi-furnished'], 1)
    initial_data['furnishingstatus'] = initial_data['furnishingstatus'].replace(['unfurnished'], 0)

    return initial_data

In [39]:
def predict(x, w, b): 
    """
    prediction of all the training examples using linear regression
    Args:
      x (ndarray): Shape (n,) example with multiple features
      w (ndarray): Shape (n,) model parameters   
      b (scalar):             model parameter 
      
    Returns:
      p (scalar):  prediction
    """
    m = x.shape[0] # number of training examples (545)
    f = np.zeros(m)
    for i in range(m):
      f[i] = np.dot(x[i,:], w[i,:]) + b
    return f # f[i] = x1w1 + x2w2 + ... + b  

In [40]:
initial_data = pd.read_csv('Housing.csv') # reading the dataset from a csv file in the same directory
y_train = initial_data.iloc[:, 0].to_numpy() # target values (1D array)
initial_data = replace_data(initial_data) # make all the feature variables numeric using this helper function
x_train = initial_data.iloc[:, 1:].to_numpy() # feature values (2D array) --> skip the first row

In [41]:
# just for testing purposes
# data is stored in numpy array/matrix
print(f"x Shape: {x_train.shape}, X Type:{type(x_train)})")
print(x_train)
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")
print(y_train)

x Shape: (545, 12), X Type:<class 'numpy.ndarray'>)
[[7420    4    2 ...    2    1    2]
 [8960    4    4 ...    3    0    2]
 [9960    3    2 ...    2    1    1]
 ...
 [3620    2    1 ...    0    0    0]
 [2910    3    1 ...    0    0    2]
 [3850    3    1 ...    0    0    0]]
y Shape: (545,), y Type:<class 'numpy.ndarray'>)
[13300000 12250000 12250000 12215000 11410000 10850000 10150000 10150000
  9870000  9800000  9800000  9681000  9310000  9240000  9240000  9100000
  9100000  8960000  8890000  8855000  8750000  8680000  8645000  8645000
  8575000  8540000  8463000  8400000  8400000  8400000  8400000  8400000
  8295000  8190000  8120000  8080940  8043000  7980000  7962500  7910000
  7875000  7840000  7700000  7700000  7560000  7560000  7525000  7490000
  7455000  7420000  7420000  7420000  7350000  7350000  7350000  7350000
  7343000  7245000  7210000  7210000  7140000  7070000  7070000  7035000
  7000000  6930000  6930000  6895000  6860000  6790000  6790000  6755000
  6720000  668

In [42]:
# cost function J(w, b) to check the accuracy of the generated model
def compute_cost(x, y, w, b):
    m = x.shape[0]
    total_cost = 0.0

    for i in range(m):
        f = np.dot(x[i], w) + b
        total_cost += (f - y[i]) ** 2
    total_cost /= (2 * m)

    return total_cost

In [43]:
# testing the cost function
cost = compute_cost(x_train, y_train, x_train[0], 0.0)
print(f"Cost at optimal w: {cost}")

Cost at optimal w: 674500372862323.4


In [44]:
def compute_gradient(x, y, w, b):
    m = x.shape[0]
    n = x.shape[1]

    dj_dw = np.zeros((n,))
    dj_db = 0

    for i in range(m):
        f = np.dot(x[i], w) + b
        error = f - y[i]

        for j in range(n):
            dj_dw[j] += error * x[i, j]
        dj_db += error
    dj_dw /= m
    dj_db /= m

    return dj_db, dj_dw

In [45]:
# testying compute_gradient method
temp_dj_dw, temp_dj_db = compute_gradient(x_train, y_train, x_train[0], 0.0)
print(f"dj_db at initial w, b: {temp_dj_db}")
print(f"dj_dw at initial w, b: {temp_dj_dw}")

dj_db at initial w, b: [2.04995992e+11 1.00481234e+08 4.41048791e+07 6.08844699e+07
 3.01497789e+07 6.63421655e+06 1.19201310e+07 1.46696897e+06
 1.18261952e+07 2.74702094e+07 9.19611604e+06 3.27828973e+07]
dj_dw at initial w, b: 33450311.588990826


In [46]:
def gradient_descent(x, y, w_in, b_in, alpha, iterations):
    w = copy.deepcopy(w_in)
    b = b_in

    for i in range(iterations):
        dj_db, dj_dw = compute_gradient(x, y, w, b)

        w = w - alpha * dj_dw
        b - b - alpha * dj_db

    return w, b

In [47]:
initial_w = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 10, 20, 10, 0, 20, 0])
initial_b = 2.0
scaler = StandardScaler()
x_normalized = scaler.fit_transform(x_train) # normalized (Z-score) version of x_train 

iterations = 1900
alpha = 5.0e-7

w_final, b_final = gradient_descent(x_normalized, y_train, initial_w, initial_b, alpha, iterations)

total_accuracy = 0.0

for i in range(x_train.shape[0]):
    accuracy = 0.0
    if(((np.dot(x_train[i], w_final) + b_final) / y_train[i]) * 100 > 100):
        accuracy = (y_train[i] / (np.dot(x_train[i], w_final) + b_final)) * 100
    else:
        accuracy = ((np.dot(x_train[i], w_final) + b_final) / y_train[i]) * 100
    print(f"prediction: {np.dot(x_train[i], w_final) + b_final:0.0f}, target value: {y_train[i]}, accuracy: {accuracy:0.2f}%")
    total_accuracy += accuracy

print(f"Overal accuracy: {total_accuracy / x_train.shape[0]:0.2f}%")

prediction: 7064604, target value: 13300000, accuracy: 53.12%
prediction: 8531206, target value: 12250000, accuracy: 69.64%
prediction: 9476751, target value: 12250000, accuracy: 77.36%
prediction: 7140930, target value: 12215000, accuracy: 58.46%
prediction: 7063130, target value: 11410000, accuracy: 61.90%
prediction: 7139231, target value: 10850000, accuracy: 65.80%
prediction: 8168438, target value: 10150000, accuracy: 80.48%
prediction: 15407937, target value: 10150000, accuracy: 65.88%
prediction: 7710151, target value: 9870000, accuracy: 78.12%
prediction: 5475870, target value: 9800000, accuracy: 55.88%
prediction: 12557169, target value: 9800000, accuracy: 78.04%
prediction: 5713929, target value: 9681000, accuracy: 59.02%
prediction: 6235604, target value: 9310000, accuracy: 66.98%
prediction: 3336227, target value: 9240000, accuracy: 36.11%
prediction: 7421724, target value: 9240000, accuracy: 80.32%
prediction: 5711455, target value: 9100000, accuracy: 62.76%
prediction: 62

In [48]:
# input from the user to predit the price
while(True):
    area = input("Please enter the area (Square foot): ")
    bedroom = input("Please enter the number if bedrooms: ")
    bathroom = input("Please enter the number of bathrooms: ")
    story = input("Please enter the number of stories: ")

    mainroad = input("Is the house on the main road (y/n)? ")
    if (mainroad == 'y'):
        mainroad2 = "yes"
        mainroad = 1
    elif (mainroad == 'n'):
        mainroad = 0
        mainroad2 = "no"
    else:
        print("wrong answer in mainroad!")

    guestroom = input("Does the house have any guest room (y/n)? ")
    if (guestroom == 'y'):
        guestroom = 1
        guestroom2 = "yes"
    elif (guestroom == 'n'):
        guestroom = 0
        guestroom2 = "no"
    else:
        print("wrong answer in guest room!")

    basement = input("Does the house have basement (y/n)? ")
    if (basement == 'y'):
        basement = 1
        basement2 = "yes"
    elif (basement == 'n'):
        basement = 0
        basement2 = "no"
    else:
        print("wrong answer in basement!")

    hot_water_heating = input("Does the house have heating system (y/n)? ")
    if (hot_water_heating == 'y'):
        hot_water_heating = 1
        hot_water_heating2 = "yes"
    elif (hot_water_heating == 'n'):
        hot_water_heating = 0
        hot_water_heating2 = "no"
    else:
        print("wrong answer in heating!")

    air_conditioning = input("Does the house have air conditioning (y/n)? ")
    if (air_conditioning == 'y'):
        air_conditioning = 1
        air_conditioning2 = "yes"
    elif (air_conditioning == 'n'):
        air_conditioning = 0
        air_conditioning2 = "no"
    else:
        print("wrong answer in air conditioning!")

    parking = input("Please enter the number of parkings: ")
    prefarea = input("Does the house have prefarea (y/n)? ")
    if (prefarea == 'y'):
        prefarea = 1
        prefarea2 = "yes"
    elif (prefarea == 'n'):
        prefarea = 0
        prefarea2 = "no"
    else:
        print("wrong answer in air prefarea!")

    furnished = input("Is the house furnished (y), semi-furnished (s), or unfurnished (n)? ")
    if (furnished == 'y'):
        furnished = 2
        furnished2 = "furnished"
    elif (furnished == 's'):
        furnished = 1
        furnished2 = "semi-furnished"
    elif (furnished == 'n'):
        furnished = 0
        furnished2 = "unfurnished"
    else:
        print("wrong answer in air furnished!")

    print(f"You entered the following values: \n area: {area} \n bedrooms: {bedroom} \n bathrooms: {bathroom} \n stories: {story} \n mainroad: {mainroad2} \n guestroom: {guestroom2} \n basement: {basement2} \n heating system: {hot_water_heating2} \n airconditioning: {air_conditioning2} \n parking: {parking} \n prefarea: {prefarea2} \n furnishing status: {furnished2}")
    x_train_input = np.array([int(area), int(bedroom), int(bathroom), int(story), int(mainroad), int(guestroom), int(basement), int(hot_water_heating), int(air_conditioning), int(parking), int(prefarea), int(furnished)])

    result = np.dot(w_final, x_train_input) + b_final
    print(f"Prediction: {result:0.2f}")

    break

You entered the following values: 
 area: 7420 
 bedrooms: 4 
 bathrooms: 2 
 stories: 2 
 mainroad: yes 
 guestroom: no 
 basement: no 
 heating system: no 
 airconditioning: yes 
 parking: 2 
 prefarea: yes 
 furnishing status: furnished


ValueError: Expected 2D array, got 1D array instead:
array=[7.42e+03 4.00e+00 2.00e+00 2.00e+00 1.00e+00 0.00e+00 0.00e+00 0.00e+00
 1.00e+00 2.00e+00 1.00e+00 2.00e+00].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.