In [1]:
import pandas as pd
import numpy as np

In [2]:
house_data=pd.read_csv('kc_house_data.csv')

In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

## Change the data types of columns 

In [4]:
house_data=house_data.astype(dtype_dict)

In [5]:
house_data.dtypes

id                object
date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot           int64
floors            object
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode           object
lat              float64
long             float64
sqft_living15    float64
sqft_lot15       float64
dtype: object

## import training data and test data


In [6]:
train_data = pd.read_csv('kc_house_train_data.csv')
train_data=train_data.astype(dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv')
test_data=test_data.astype(dtype_dict)

## function of simple linear regression 

In [7]:
def simple_linear_regression(input_feature,output):
    n=np.mean(input_feature*output)-(np.mean(input_feature)*np.mean(output))
    d=np.mean(input_feature*input_feature)-(np.mean(input_feature)*np.mean(input_feature))
    slope=n/d
    intercept=np.mean(output)-slope*np.mean(input_feature)
    return (intercept,slope)

## predict price using sqft_living

In [8]:
input_feature = train_data['sqft_living']
output = train_data['price']

In [9]:
squarefeet_intercept,squarefeet_slope = simple_linear_regression(input_feature,output)

In [10]:
print("Squarefeet intercept :",squarefeet_intercept)
print("Squarefeet Slope : ",squarefeet_slope)

Squarefeet intercept : -47116.07907289488
Squarefeet Slope :  281.95883963034294


## function get regression predictions 

In [11]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output = intercept+slope*input_feature
    return(predicted_output)

In [12]:
#2650 sq.ft
get_regression_predictions(2650,squarefeet_intercept,squarefeet_slope)

700074.8459475139

## Residual sum of square 

In [13]:
def get_residual_sum_of_squares(input_feature, output, intercept,slope):
    RSS = ((output-(intercept+slope*input_feature))**2).sum(axis=0)
    return(RSS)

In [14]:
get_residual_sum_of_squares(input_feature,output,squarefeet_intercept,squarefeet_slope)

1201918354177283.0

## inverse linear function 

In [15]:
def inverse_regression_predictions(output, intercept, slope):
    estimated_input = (output-intercept)/slope
    return(estimated_input)

In [16]:
inverse_regression_predictions(800000,squarefeet_intercept,squarefeet_slope)

3004.3962451522752

## predict price using bedrooms 

In [17]:
input_feature = train_data['bedrooms']
output = train_data['price']

In [18]:
bedroom_intercept,bedroom_slope=simple_linear_regression(input_feature,output)

In [19]:
print("Bedroom intercept :",bedroom_intercept)
print("bedroom Slope : ",bedroom_slope)

Bedroom intercept : 109473.17762295791
bedroom Slope :  127588.95293398833


## Multiple Linear Regression Assignment 1


### first we need to add 4 new varible in training data set and test data set 

In [20]:
train_data['bedrooms_squared']=train_data['bedrooms']*train_data['bedrooms']
train_data['bed_bath_rooms']=train_data['bedrooms']*train_data['bathrooms']
train_data['log_sqft_living']=np.log(train_data['sqft_living'])
train_data['lat_plus_long']=train_data['lat']+train_data['long']

In [21]:
test_data['bedrooms_squared']=test_data['bedrooms']*test_data['bedrooms']
test_data['bed_bath_rooms']=test_data['bedrooms']*test_data['bathrooms']
test_data['log_sqft_living']=np.log(test_data['sqft_living'])
test_data['lat_plus_long']=test_data['lat']+test_data['long']

### Find the mean value of this new four varible 

In [22]:
print(test_data['bedrooms_squared'].mean())
print(np.mean(test_data['bed_bath_rooms']))
print(np.mean(test_data['log_sqft_living']))
print(np.mean(test_data['lat_plus_long']))

12.4466777015843
7.5039016315913925
7.550274679645921
-74.65333355403185


### Rss

In [23]:
def get_residual_sum_of_squares(model, data, outcome):
    # First get the predictions
    pred=model.predict(data)
    # Then compute the residuals/errors
    rss = outcome - pred
    # Then square and add them up
    RSS = np.sum(np.square(rss))
    return(RSS)

## Create three differenet model for regression

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
linreg = LinearRegression()

### Model 1 

In [26]:
x=train_data[['sqft_living','bedrooms','bathrooms','lat','long']].values
y=train_data['price'].values

In [27]:
x_test1=test_data[['sqft_living','bedrooms','bathrooms','lat','long']].values
y_test=test_data['price'].values

In [28]:
model1 = linreg.fit(x,y)

In [29]:
model1.coef_

array([ 3.12258646e+02, -5.95865332e+04,  1.57067421e+04,  6.58619264e+05,
       -3.09374351e+05])

In [30]:
model1.predict(x)

array([244657.18811044, 855689.66538486, 318101.67899464, ...,
       528928.42823836, 356549.38348044, 317948.91207276])

In [31]:
get_residual_sum_of_squares(model1,x,y)

967879963049546.4

In [32]:
get_residual_sum_of_squares(model1,x_test1,y_test)

225500469795490.16

### Model 2

In [33]:
x_model2=train_data[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms']].values

In [34]:
x_test2=test_data[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms']].values

In [35]:
model2 = linreg.fit(x_model2,y)

In [36]:
model2.coef_

array([ 3.06610053e+02, -1.13446368e+05, -7.14613083e+04,  6.54844630e+05,
       -2.94298969e+05,  2.55796520e+04])

In [37]:
model2.predict(x_model2)

array([251332.76667795, 839750.73997917, 354879.4630783 , ...,
       515931.64001571, 347955.13436443, 361864.55175639])

In [38]:
get_residual_sum_of_squares(model2,x_model2,y)

958419635074069.2

In [39]:
get_residual_sum_of_squares(model2,x_test2,y_test)

223377462976466.88

### Model 3 

In [40]:
x_model3 = train_data[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']].values

In [41]:
x_test3 = test_data[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']].values

In [42]:
model3 = linreg.fit(x_model3,y)

In [43]:
model3_test = linreg.fit(x_test3,y_test)

In [44]:
model3.coef_

array([ 4.69716691e+02, -3.28857029e+04,  5.04637585e+04,  5.25504326e+05,
       -4.08891578e+05, -2.28592380e+03,  1.08100171e+03, -4.33486888e+05,
        1.16612748e+05])

In [45]:
model3.predict(x_model3)

array([268690.87788665, 791519.51977584, 429473.43869987, ...,
       532696.08952394, 359401.70460159, 340501.22368976])

In [46]:
get_residual_sum_of_squares(model3,x_model3,y)

909773229981851.2

In [47]:
get_residual_sum_of_squares(model3,x_test3,y_test)

211457519132478.75

## Multiple Linear Regression Assignment 2

In [55]:
def get_numpy_data(df, features, output):
    df['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’""
    # this will convert the features_sframe into a numpy matrix:
    features_matrix = df[features].to_numpy()
    # assign the column of df associated with the target to the variable ‘output_sarray’
    output_array = df[output]
    # this will convert the SArray into a numpy array:
    output_array = output_array.to_numpy()
    return(features_matrix, output_array)

In [66]:
(example_features, example_output) = get_numpy_data(train_data, ['sqft_living'], 'price')
print(example_features[0,:])
# print(example_features)

[1.00e+00 1.18e+03]


In [69]:
train_data['sqft_living'][0]

1180.0

In [98]:
def predict_output(feature_matrix, weights):
    predictions = np.dot(feature_matrix,weights)
    return(predictions)

In [99]:
my_weights = np.array([1., 1.])
test_predictions = predict_outcome(example_features,my_weights)
print(test_predictions[0])
print(test_predictions[1])

1181.0
2571.0


In [100]:
def feature_derivative(errors, feature):
    derivative =  2*np.dot(feature,errors)
    return(derivative)

In [101]:
(example_features, example_output) = get_numpy_data(train_data, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = predict_output(example_features, my_weights) 
# just like SFrames 2 numpy arrays can be elementwise subtracted with '-': 
errors = test_predictions - example_output # prediction errors in this case is just the -example_output
feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the ":" indicates "all rows"
derivative = feature_derivative(errors, feature)
print(derivative)
print(-np.sum(example_output)*2)

-18752698920.0
-18752698920.0


In [102]:
print(example_output)
print(errors)
print(feature)

[221900. 538000. 180000. ... 360000. 400000. 325000.]
[-221900. -538000. -180000. ... -360000. -400000. -325000.]
[1. 1. 1. ... 1. 1. 1.]


In [103]:
from math import sqrt

In [104]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        # compute the errors as predictions - output:
        predictions = predict_output(feature_matrix,weights)
        errors =  predictions - output
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(errors,feature_matrix[:,i])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += derivative**2
            # update the weight based on step size and derivative:
            weights[i]=weights[i]-(step_size*derivative)
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [105]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [106]:
g=regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
g

array([-46999.88716555,    281.91211918])

In [107]:
g.shape

(2,)

#### test data 

In [108]:
simple_features = ['sqft_living']
my_output = 'price'
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [112]:
test_weights=regression_gradient_descent(test_simple_feature_matrix, test_output, initial_weights, step_size, tolerance)
test_weights

array([-46999.87880043,    282.3594539 ])

test_prediction = predict_output(test_simple_feature_matrix,test_weights)
test_prediction

### rss 

In [114]:
residual = test_output - test_prediction
rss =  np.sum(np.square(residual))
rss

275395691278132.8

In [115]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [116]:
weight1 = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [117]:
weight1

array([-9.99999688e+04,  2.45072603e+02,  6.52795267e+01])

In [125]:
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

test_prediction = predict_output(test_feature_matrix, weight1)
test_prediction[0]

366651.4116294939

In [126]:
test_data['price'][0]

310000.0

In [128]:
residual = test_output - test_prediction
rss =  np.sum(np.square(residual))
rss

270263443629803.56

In [None]:
275395691278132.8
1186978095794202.5