Fire up GraphLab Create

In [1]:
import graphlab

Load house sales data

In [2]:
# Load the house sales dataset from the local 'kc_house_data.gl/' path into an SFrame.
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Nipuna\AppData\Local\Temp\graphlab_server_1595754242.log.0


This non-commercial license of GraphLab Create for academic use is assigned to 160645l@uom.lk and will expire on July 20, 2021.


Split data into training and testing

In [4]:
# Randomly split the sales rows into a training set and a testing set.
# 0.8 is the fraction handed to random_split; the fixed seed keeps the split repeatable.
train_data, test_data = sales.random_split(0.8, seed=0)

Useful SFrame Summary Functions

In [5]:
# Compute the mean house price in King County in two different ways.
prices = sales['price'] # extract the price column of the sales SFrame -- this is an SArray
# Way 1: the arithmetic mean is the sum of the prices divided by the number of houses.
sum_prices = prices.sum()
num_houses = prices.size()

# float() forces true division even if the sum is an integer (Python 2 `/` floors ints).
avg_price_1 = float(sum_prices)/num_houses
# Way 2: use the SArray's built-in mean.
avg_price_2 = prices.mean()

# The two printed values should agree.
print avg_price_1
print avg_price_2

540088.141905
540088.141905


In [6]:
# Multiplying an SArray by a scalar scales every element, e.g. halving every price:
half_price = 0.5*prices
# Two SArrays of the same length can also be multiplied elementwise with *,
# which is used here to build the sum of squared prices.
prices_squared = prices*prices
sum_prices_squared = prices_squared.sum()
print sum_prices_squared

9.21732513355e+15


Build a generic simple linear regression function

In [7]:
def simple_linear_regression(input_feature,output):
    """Fit a least-squares line ``output = intercept + slope * input_feature``.

    Uses the closed-form solution:

        slope     = (sum(x*y) - sum(x)*sum(y)/N) / (sum(x*x) - sum(x)*sum(x)/N)
        intercept = mean(y) - slope * mean(x)

    Parameters
    ----------
    input_feature : array-like supporting ``len()``, ``.sum()``, ``.mean()``
        and elementwise ``*`` (e.g. an SArray). The x values.
    output : array-like with the same operations and the same length.
        The y values.

    Returns
    -------
    (intercept, slope) : tuple of floats

    Raises
    ------
    ValueError
        If ``input_feature`` is empty, or if the slope denominator is zero
        (all x values identical), in which case no unique line exists.
    """
    n = len(input_feature)
    if n == 0:
        raise ValueError("simple_linear_regression requires at least one data point")

    # Coerce every sum to float up front: under Python 2, `/` between two
    # integers floors, which silently truncated the slope and intercept
    # whenever both columns were integer-typed.
    sum_x = float(input_feature.sum())
    sum_y = float(output.sum())
    sum_xy = float((output*input_feature).sum())
    sum_xx = float((input_feature*input_feature).sum())

    numerator = sum_xy - (sum_y*sum_x)/n
    denominator = sum_xx - (sum_x*sum_x)/n
    if denominator == 0:
        raise ValueError("input_feature has zero variance; the slope is undefined")

    slope = numerator/denominator
    intercept = output.mean() - (slope*input_feature.mean())
    return (intercept,slope)

In [9]:
# Sanity check on synthetic data that lies exactly on the line y = 1 + 1*x,
# so the fitted intercept and slope should both come out as 1.
test_feature = graphlab.SArray(range(5))
test_output = graphlab.SArray(1+1*test_feature)
(test_intercept, test_slope)=simple_linear_regression(test_feature,test_output)
print "Intercept: "+str(test_intercept)
print "Slope: "+str(test_slope)

Intercept: 1.0
Slope: 1


In [10]:
sqft_intercept,sqft_slope = simple_linear_regression(train_data['sqft_living'],train_data['price'])
print "Intercept: "+str(sqft_intercept)
print "Slope: "+str(sqft_slope) 

Intercept: -47116.0765749
Slope: 281.958838568


Predicting Values

In [11]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line at ``input_feature``.

    ``input_feature`` may be a single number or any array-like that supports
    scalar multiplication and addition; the result has the same form.

    Returns ``intercept + slope * input_feature``.
    """
    return intercept+(slope*input_feature)

In [13]:
# Predict the price of a 2650 sq.ft. house with the line fitted on 'sqft_living'.
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft,sqft_intercept,sqft_slope)
print "The estimated price for a house with %d squarefeet is $%.2f" %(my_house_sqft,estimated_price)

The estimated price for a house with 2650 squarefeet is $700074.85


Residual Sum Of Squares

In [14]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares (RSS) of a fitted line.

    The line ``intercept + slope * input_feature`` is evaluated for every
    input, subtracted from the observed ``output``, and the squared
    differences are summed. ``input_feature`` and ``output`` must be
    array-likes supporting elementwise arithmetic and ``.sum()``.
    """
    # Residual = observed output minus the line's prediction.
    errors = output - (intercept+(slope*input_feature))
    # Square each residual elementwise and add them all up.
    return (errors*errors).sum()

In [15]:
# The synthetic test data lies exactly on the fitted line, so the RSS should be zero.
print get_residual_sum_of_squares(test_feature,test_output,test_intercept,test_slope)

0.0


In [16]:
rss_price_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'],train_data['price'],
                                               sqft_intercept,sqft_slope)
print 'The RSS of predicting prices based on Sq.ft. is: '+str(rss_price_on_sqft)

The RSS of predicting prices based on Sq.ft. is: 1.20191835632e+15


Predict the Sq.Ft. given price

In [17]:
def inverse_regression_predictions(output, intercept,slope):
    """Invert the fitted line: estimate the input that yields ``output``.

    Solves ``output = intercept + slope * x`` for ``x``, i.e.
    ``x = (output - intercept) / slope``. ``output`` may be a single number
    or an array-like supporting elementwise arithmetic.

    A zero slope makes the line non-invertible; with plain Python numbers
    the division raises ZeroDivisionError.
    """
    # float() forces true division: under Python 2, `/` between integers
    # floors, so e.g. (10 - 1) / 2 would yield 4 instead of 4.5.
    estimated_feature = (output-intercept)/float(slope)
    return estimated_feature

In [18]:
# Estimate the square footage of an $800,000 house by inverting the sqft model.
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)
# Note: %d truncates the estimate to a whole number of square feet.
print "The estimated squarefeet for a house worth $%.2f is %d "%(my_house_price,estimated_squarefeet)

The estimated squarefeet for a house worth $800000.00 is 3004 


New Model: Estimate Prices from Bedrooms

In [20]:
# Fit a second model: price against the 'bedrooms' column, on the training split only.
bedrooms_intercept,bedrooms_slope = simple_linear_regression(train_data['bedrooms'],train_data['price'])
print "Intercept: " + str(bedrooms_intercept)
print "Slope: " + str(bedrooms_slope)

Intercept: 109473.180469
Slope: 127588.952175


Test the Linear Regression Algorithm

In [21]:
# Compute RSS when using bedrooms on TEST data.
# The model is fitted on the training split and then scored on the held-out test split.
bedrooms_intercept,bedrooms_slope = simple_linear_regression(train_data['bedrooms'],train_data['price'])
rss_prices_on_bedrooms = get_residual_sum_of_squares(test_data['bedrooms'],test_data['price'],bedrooms_intercept,bedrooms_slope)
print 'The RSS of predicting Prices based on Bedrooms is : ' + str(rss_prices_on_bedrooms)

The RSS of predicting Prices based on Bedrooms is : 4.93364582868e+14


In [22]:
# Compute RSS when using square feet on TEST data.
# The model is fitted on the training split and then scored on the held-out test split.
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])
rss_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 2.75402936247e+14
