<a href="https://colab.research.google.com/github/millejade/Machine-Learning/blob/main/MultipleRegression2UoWashington.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
sales = pd.read_csv('dataset/kc_house_data.csv')

In [None]:
df = pd.DataFrame(sales)

In [10]:
def get_numpy_data(data, features, output):
  data['constant'] = 1  # add new feature to dataset
  features = ['constant'] + features  # add new feature on feature set
  features_df = data[features]  
  feature_matrix = features_df.to_numpy()
  output_array = data[output]
  out_array = output_array.to_numpy()

  return(feature_matrix, out_array)


In [12]:
(example_features, example_output) = get_numpy_data(df, ['sqft_living'], 'price') # the [] around 'sqft_living' makes it a list
print (example_features[0,:]) # this accesses the first row of the data the ':' indicates 'all columns'
print (example_output[0]) # and the corresponding output

[   1 1180]
221900.0


In [32]:
my_weights = np.array([1., 1.])
my_features = example_features[0, ]
pred_val = np.dot(my_features, my_weights)
print(pred_val)

1181.0


In [33]:
def pred_out(feature_matrix, weights):
  pred = np.dot(feature_matrix, weights)

  return(pred)

In [34]:
test_predictions = pred_out(example_features, my_weights)
print (test_predictions[0]) # should be 1181.0
print (test_predictions[1]) # should be 2571.0

1181.0
2571.0


In [35]:
def feat_deriv(errors, feat):
  deriv = 2*np.dot(errors, feat)
  return(deriv)

In [36]:
(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = pred_out(example_features, my_weights) 

errors = test_predictions - example_output # prediction errors in this case is just the -example_output
feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the ":" indicates "all rows"
derivative = feat_deriv(errors, feature)
print (derivative)
print (-np.sum(example_output)*2) # should be the same as derivative

-23349913996.0
-23349913996.0


In [41]:
from math import sqrt

In [42]:
def grad_desc(feature_matrix, output, init_weights, step_size, tolerance):
  converge = False
  weights = np.array(init_weights)

  while not converge:
    pred = pred_out(feature_matrix, weights)
    errors = pred - output
    grad_sum_squares = 0  # initialization

    for i in range(len(weights)):
      deriv = feat_deriv(errors, feature_matrix[:, i])
      grad_sum_squares += (deriv**2)
      weights[i] -= (step_size * deriv)

    grad_magnitude = sqrt(grad_sum_squares)
    if grad_magnitude < tolerance:
      converge = True
    
    return(weights)


In [43]:
train, test = train_test_split(df, test_size=0.2)

In [44]:
simple_feat = ['sqft_living']
my_target = 'price'
(simple_feat_matrix, target) = get_numpy_data(train, simple_feat, my_target)
init_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [45]:
test_weight = grad_desc(simple_feat_matrix, target, init_weights, step_size, tolerance)
print(test_weight)

[-46999.85786137    356.0781142 ]


In [46]:
(test_simple_feat_matrix, test_target) = get_numpy_data(test, simple_feat, my_target)


In [None]:
test_weight

In [None]:
test_simple_feat_matrix

In [52]:
test_pred = pred_out(test_simple_feat_matrix, test_weight)
print(test_pred)

[ 494238.87571524 1135179.48126649  704324.96309037 ...  472874.18886353
  672277.93281281  860999.33333623]


In [53]:
test_pred[0]

494238.87571524

In [54]:
test_resi = test_target - test_pred
test_RSS = (test_resi**2).sum()
print(test_RSS)

389646275542472.5


RUNNING A MULTIPLE REGRESSION

In [55]:
model_feat = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feat_matrix, output) = get_numpy_data(train, model_feat, my_output)
init_W = np.array([-100000., 1., 1.])
step = 4e-12
tol = 1e9

In [56]:
weight = grad_desc(feat_matrix, output, init_W, step, tol)
print(weight)

[-99999.91172181    218.5321808     197.16193752]


In [58]:
(test_feat_matrix, test_out) = get_numpy_data(test, model_feat, my_output)

test_pred_2 = pred_out(test_feat_matrix, weight)
print(test_pred_2)

[ 587060.49063006 1280104.5610999   865837.54981725 ...  540431.0304036
  619433.42539657  916644.46373912]


In [59]:
print(test_pred_2[0])

587060.4906300638


In [69]:
outt = test['price'].to_numpy()
print(outt[0])

413000.0


In [70]:
test_resi_2 = test_out - test_pred_2
test_RSS_2 = (test_resi_2**2).sum()
print(test_RSS_2)

473902374832350.2
